Open navigation menu
Close suggestions
Search
Search
en
Change Language
Upload
Sign in
Sign in
Download free for days
0 ratings
0% found this document useful (0 votes)
4 views
Leer
Uploaded by
Diego Andrés Cares Oyarzo
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content,
claim it here
.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
Download now
Download
Save Leer For Later
Download
Save
Save Leer For Later
0%
0% found this document useful, undefined
0%
, undefined
Embed
Share
Print
Report
0 ratings
0% found this document useful (0 votes)
4 views
Leer
Uploaded by
Diego Andrés Cares Oyarzo
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content,
claim it here
.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
Download now
Download
Save Leer For Later
Carousel Previous
Carousel Next
Save
Save Leer For Later
0%
0% found this document useful, undefined
0%
, undefined
Embed
Share
Print
Report
Download now
Download
You are on page 1
/ 19
Search
Fullscreen
Logramos arreglar el problema de valores extremadamente altos.
Ahora, hay que fijar los valores negativos en 0 (revisa el % que son negativos:
1,35% )
+ for (col in missing_cols) {
+ data2 <- data2 %>%
+ mutate(!!sym(col) := NA)
+ }
+
+ missing_cols <- setdiff(colnames(data2), colnames(data1))
+ for (col in missing_cols) {
+ data1 <- data1 %>%
+ mutate(!!sym(col) := NA)
+ }
+
+ df <- rbind(data1, data2)
+ rm(data1)
+ rm(data2)
+ gc()
+
+
+ # Ordenar el dataframe de manera que las filas con valores no NA en
VENTA.Un. aparezcan primero
+ df <- df[order(!is.na(df$VENTA.Un.), decreasing = TRUE), ]
+
+ # Eliminar duplicados manteniendo la primera aparición (la cual tendrá
VENTA.Un. no NA si existe)
+ df <- df[!duplicated(df[c("item", "date_PD", "aisle", "aisle_part",
"orient")]), ]
+
+ setwd(output_dir)
+ write.csv(df, file2)
+ }
+ }
+
+
+
+
+
+ }
>
> detect_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage
3/Merged_complete_and_incomplete"
+
+ # Check files available
+ setwd(input_dir)
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ detect_changepoints <- function(data, column_name) {
+ # Agrupa los datos por 'aisle', 'aisle_part', 'orient', 'item'
+ grouped_data <- data %>%
+ group_by(aisle, aisle_part, orient, item)
+
+ # Aplica el análisis de puntos de cambio a cada grupo
+ results <- grouped_data %>%
+ mutate(CP_Date_X = {
+ # Verifica el tamaño del grupo antes de aplicar el análisis
+ if (n() > 2) {
+ cpt_result <- cpt.mean(.data[["Location_x_mean"]], penalty = "AIC",
pen.value = 0, method = "PELT", Q = 5,
+ test.stat = "Normal", class = TRUE,
param.estimates = TRUE)
+ cp_dates <- rep(0, n())
+ cp_dates[cpts(cpt_result)] <- 1
+ } else {
+ cp_dates <- rep(0, n())
+ }
+ cp_dates
+ },
+ CP_Date_Z = {
+ # Verifica el tamaño del grupo antes de aplicar el análisis
+ if (n() > 2) {
+ cpt_result <- cpt.mean(.data[["Location_z_mean"]], penalty = "AIC",
pen.value = 0, method = "PELT", Q = 5,
+ test.stat = "Normal", class = TRUE,
param.estimates = TRUE)
+ cp_dates <- rep(0, n())
+ cp_dates[cpts(cpt_result)] <- 1
+ } else {
+ cp_dates <- rep(0, n())
+ }
+ cp_dates
+ })
+
+ # Desagrupa los datos
+ ungrouped_results <- results %>% ungroup()
+
+ return(ungrouped_results)
+ }
+
+ for (file in files) {
+ # Leer el archivo CSV
+ df <- read.csv(file)
+
+ # Detectar puntos de cambio para Location_x_mean
+ df <- detect_changepoints(df, "Location_x_mean")
+
+ # Detectar puntos de cambio para Location_z_mean (si es necesario)
+ df <- detect_changepoints(df, "Location_z_mean")
+
+ # Guardar el archivo CSV modificado
+ write.csv(df, file, row.names = FALSE)
+
+ # Liberar memoria
+ rm(df)
+ gc()
+ }
+ }
>
> #REVISAR
>
> # 1) Revisar valores negativos y extremadamente altos de linear_px_final OK
>
> change_linear_px_neg_values <- function() {
+
+ setwd("C:/Users/Admin/Downloads/Zippedi/Data/Stage 3")
+
+ # Set directories to use
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_data"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_data_fix"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ data <- read.csv(file.path(input_dir, file))
+
+ # Replace negative values in linear_px_final with 0
+ data$linear_px_final[data$linear_px_final < 0] <- 0
+
+ # Write the modified data to the output directory
+ write.csv(data, file.path(output_dir, file), row.names = FALSE)
+
+ # Clean up
+ rm(data)
+ gc()
+ }
+
+ # Set directories to use
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Product_data"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Product_data_fix"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ data <- read.csv(file.path(input_dir, file))
+
+ # Replace negative values in linear_px_final with 0
+ data$linear_px_final[data$linear_px_final < 0] <- 0
+
+ # Write the modified data to the output directory
+ write.csv(data, file.path(output_dir, file), row.names = FALSE)
+
+ # Clean up
+ rm(data)
+ gc()
+ }
+ }
>
> # 2) Hacer una serie de tiempo y ver cuantos cambios se detectan por fecha (puede
ser más de 1 en una fecha, si ocurre para múltiples productos), y también, en qué
fechas
> # ocurren puntos de cambio
>
> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage
3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Y == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in
every section of the store
+
+ summary_cp <- data %>%
+ group_by(date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ aisle = first(aisle),
+ aisle_part = first(aisle_part),
+ orient = first(orient),
+ date_PD = first(date_PD),
+ store = first(store),
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date)
+ ) %>%
+ ungroup()
+
+ # 3) ¿Cuántas fechas (en total) de la línea de tiempo poseen puntos de
cambio?
+
+ summary_cp <- summary_cp %>%
+ group_by(aisle, aisle_part, orient) %>%
+ summarise(
+ amount_of_dates_in_section = n_distinct(date_PD)
+ ) %>%
+ ungroup()
+
+ summary_cp <- summary_cp %>%
+ group_by(aisle, aisle_part, orient) %>%
+ filter(is_CP_Date_in_section == 1) %>%
+ summarise(
+ amount_of_dates_with_CP_in_section = n_distinct(date_PD)
+ ) %>%
+ ungroup()
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
Error in file(out, "wt") : no se puede abrir la conexión
> summary_changepoints_in_data()
[1] "Creating directory..."
[1] "Directory successfully created."
[1] "Reading PD_J501.csv"
Error in `$<-.data.frame`(`*tmp*`, "CP_Date", value = logical(0)) :
replacement has 0 rows, data has 4539518
Called from: `$<-.data.frame`(`*tmp*`, "CP_Date", value = logical(0))
Browse[1]> Q
> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage
3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Y == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in
every section of the store
+
+ summary_cp <- data %>%
+ group_by(date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ aisle = first(aisle),
+ aisle_part = first(aisle_part),
+ orient = first(orient),
+ date_PD = first(date_PD),
+ store = first(store),
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date),
+ .groups = 'drop'
+ )
+
+ # 3) ¿Cuántas fechas (en total) de la línea de tiempo poseen puntos de
cambio?
+
+ summary_cp_dates <- summary_cp %>%
+ group_by(aisle, aisle_part, orient) %>%
+ summarise(
+ amount_of_dates_in_section = n_distinct(date_PD),
+ .groups = 'drop'
+ )
+
+ summary_cp_with_cp <- summary_cp %>%
+ group_by(aisle, aisle_part, orient) %>%
+ filter(is_CP_Date_in_section == 1) %>%
+ summarise(
+ amount_of_dates_with_CP_in_section = n_distinct(date_PD),
+ .groups = 'drop'
+ )
+
+ # Verificar si el número de entradas después del filtrado es 0
+ if (nrow(summary_cp_with_cp) == 0) {
+ summary_cp_with_cp <- data.frame(
+ aisle = character(0),
+ aisle_part = character(0),
+ orient = character(0),
+ amount_of_dates_with_CP_in_section = integer(0)
+ )
+ }
+
+ # Combinar ambos resultados
+ summary_cp <- left_join(summary_cp_dates, summary_cp_with_cp, by = c("aisle",
"aisle_part", "orient"))
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
Error in file(out, "wt") : no se puede abrir la conexión
> summary_changepoints_in_data()
[1] "Reading PD_J501.csv"
Error in `$<-.data.frame`(`*tmp*`, "CP_Date", value = logical(0)) :
replacement has 0 rows, data has 4539518
Called from: `$<-.data.frame`(`*tmp*`, "CP_Date", value = logical(0))
Browse[1]> Q
Error in file(out, "wt") : no se puede abrir la conexión
> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage
3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Y == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in
every section of the store
+
+ summary_cp <- data %>%
+ group_by(date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ store = first(store),
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date),
+ ) %>%
+ ungroup()
+
+ # 3) ¿Cuántas fechas (en total) de la línea de tiempo poseen puntos de
cambio?
+
+ summary_cp<- summary_cp %>%
+ group_by(aisle, aisle_part, orient) %>%
+ summarise(
+ amount_of_dates_in_section = n_distinct(date_PD),
+ amount_of_dates_with_CP_in_section =
n_distinct(date_PD[is_CP_Date_in_section == 1])
+ ) %>%
+ ungroup()
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
> summary_changepoints_in_data()
[1] "Reading PD_J501.csv"
Error in `$<-.data.frame`(`*tmp*`, "CP_Date", value = logical(0)) :
replacement has 0 rows, data has 4539518
Called from: `$<-.data.frame`(`*tmp*`, "CP_Date", value = logical(0))
Browse[1]> Q
> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage
3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Z == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in
every section of the store
+
+ summary_cp <- data %>%
+ group_by(date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ store = first(store),
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date),
+ ) %>%
+ ungroup()
+
+ # 3) ¿Cuántas fechas (en total) de la línea de tiempo poseen puntos de
cambio?
+
+ summary_cp<- summary_cp %>%
+ group_by(aisle, aisle_part, orient) %>%
+ summarise(
+ amount_of_dates_in_section = n_distinct(date_PD),
+ amount_of_dates_with_CP_in_section =
n_distinct(date_PD[is_CP_Date_in_section == 1])
+ ) %>%
+ ungroup()
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
Error in file(out, "wt") : no se puede abrir la conexión
> summary_changepoints_in_data()
[1] "Reading PD_J501.csv"
`summarise()` has grouped output by 'date_PD', 'aisle', 'aisle_part'. You can
override using the `.groups`
argument.
`summarise()` has grouped output by 'aisle', 'aisle_part'. You can override using
the `.groups` argument.
[1] "Reading PD_J502.csv"
`summarise()` has grouped output by 'date_PD', 'aisle', 'aisle_part'. You can
override using the `.groups`
argument.
`summarise()` has grouped output by 'aisle', 'aisle_part'. You can override using
the `.groups` argument.
[1] "Reading PD_J503.csv"
Called from: scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
nmax = nrows, skip = 0, na.strings = na.strings, quiet = TRUE,
fill = fill, strip.white = strip.white, blank.lines.skip = blank.lines.skip,
multi.line = FALSE, comment.char = comment.char, allowEscapes = allowEscapes,
flush = flush, encoding = encoding, skipNul = skipNul)
Browse[1]> Q
> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage
3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Ensure CP_Date_X and CP_Date_Z columns exist
+ if (!("CP_Date_X" %in% colnames(data)) | !("CP_Date_Z" %in% colnames(data)))
{
+ stop("CP_Date_X or CP_Date_Z column is missing in the data")
+ }
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Z == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in
every section of the store
+ summary_cp <- data %>%
+ group_by(date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ store = first(store),
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date),
+ .groups = 'drop'
+ ) %>%
+ ungroup() %>%
+ group_by(aisle, aisle_part, orient) %>%
+ summarise(
+ store = first(store),
+ amount_of_dates_in_section = n_distinct(date_PD),
+ amount_of_dates_with_CP_in_section =
n_distinct(date_PD[is_CP_Date_in_section == 1]),
+ .groups = 'drop'
+ ) %>%
+ ungroup()
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
> summary_changepoints_in_data()
[1] "Reading PD_J501.csv"
[1] "Reading PD_J502.csv"
[1] "Reading PD_J503.csv"
[1] "Reading PD_J504.csv"
[1] "Reading PD_J510.csv"
[1] "Reading PD_J512.csv"
[1] "Reading PD_J513.csv"
Called from: scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
nmax = nrows, skip = 0, na.strings = na.strings, quiet = TRUE,
fill = fill, strip.white = strip.white, blank.lines.skip = blank.lines.skip,
multi.line = FALSE, comment.char = comment.char, allowEscapes = allowEscapes,
flush = flush, encoding = encoding, skipNul = skipNul)
Browse[1]> Q
Error in file(out, "wt") : no se puede abrir la conexión
Error in file(out, "wt") : no se puede abrir la conexión
Además: Warning messages:
1: In if (match < 0) { :
la condición tiene longitud > 1 y sólo el primer elemento será usado
2: In if (match < 0) { :
la condición tiene longitud > 1 y sólo el primer elemento será usado
3: In if (match < 0) { :
la condición tiene longitud > 1 y sólo el primer elemento será usado
> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage
3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Ensure CP_Date_X and CP_Date_Z columns exist
+ if (!("CP_Date_X" %in% colnames(data)) | !("CP_Date_Z" %in% colnames(data)))
{
+ stop("CP_Date_X or CP_Date_Z column is missing in the data")
+ }
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Z == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in
every section of the store
+ summary_cp_by_date <- data %>%
+ group_by(store, date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date),
+ .groups = 'drop'
+ )
+
+ # Summarise the general statistics
+ summary_cp_general <- summary_cp_by_date %>%
+ group_by(store, aisle, aisle_part, orient) %>%
+ summarise(
+ amount_of_dates_in_section = n_distinct(date_PD),
+ amount_of_dates_with_CP_in_section =
n_distinct(date_PD[is_CP_Date_in_section == 1]),
+ .groups = 'drop'
+ )
+
+ # Since we do not need additional summarising here, the left join is done
directly
+ summary_cp <- summary_cp_general %>%
+ left_join(summary_cp_by_date,by = c("store", "aisle", "aisle_part",
"orient"))
+
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
> summary_changepoints_in_data()
[1] "Reading PD_J501.csv"
[1] "Reading PD_J502.csv"
[1] "Reading PD_J503.csv"
[1] "Reading PD_J504.csv"
Called from: scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
nmax = nrows, skip = 0, na.strings = na.strings, quiet = TRUE,
fill = fill, strip.white = strip.white, blank.lines.skip = blank.lines.skip,
multi.line = FALSE, comment.char = comment.char, allowEscapes = allowEscapes,
flush = flush, encoding = encoding, skipNul = skipNul)
Browse[1]> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage
3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Ensure CP_Date_X and CP_Date_Z columns exist
+ if (!("CP_Date_X" %in% colnames(data)) | !("CP_Date_Z" %in% colnames(data)))
{
+ stop("CP_Date_X or CP_Date_Z column is missing in the data")
+ }
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Z == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in
every section of the store
+ summary_cp_by_date <- data %>%
+ group_by(store, date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date),
+ .groups = 'drop'
+ )
+
+ # Summarise the general statistics
+ summary_cp_general <- summary_cp_by_date %>%
+ group_by(store, aisle, aisle_part, orient) %>%
+ summarise(
+ amount_of_dates_in_section = n_distinct(date_PD),
+ amount_of_dates_with_CP_in_section =
n_distinct(date_PD[is_CP_Date_in_section == 1]),
+ .groups = 'drop'
+ )
+
+ # Join the summaries
+ summary_cp <- summary_cp_general %>%
+ left_join(summary_cp_by_date, by = c("store", "aisle", "aisle_part",
"orient"))
+
+ # Reorder columns
+ summary_cp <- summary_cp %>%
+ select(
+ date_PD, store, aisle, aisle_part, orient,
+ amount_of_dates_in_section, amount_of_dates_with_CP_in_section,
+ is_CP_Date_in_section, CPs_registered_in_date_section
+ )
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
Browse[1]> Q
> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage
3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Ensure CP_Date_X and CP_Date_Z columns exist
+ if (!("CP_Date_X" %in% colnames(data)) | !("CP_Date_Z" %in% colnames(data)))
{
+ stop("CP_Date_X or CP_Date_Z column is missing in the data")
+ }
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Z == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in
every section of the store
+ summary_cp_by_date <- data %>%
+ group_by(store, date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date),
+ .groups = 'drop'
+ )
+
+ # Summarise the general statistics
+ summary_cp_general <- summary_cp_by_date %>%
+ group_by(store, aisle, aisle_part, orient) %>%
+ summarise(
+ amount_of_dates_in_section = n_distinct(date_PD),
+ amount_of_dates_with_CP_in_section =
n_distinct(date_PD[is_CP_Date_in_section == 1]),
+ .groups = 'drop'
+ )
+
+ # Join the summaries
+ summary_cp <- summary_cp_general %>%
+ left_join(summary_cp_by_date, by = c("store", "aisle", "aisle_part",
"orient"))
+
+ # Reorder columns
+ summary_cp <- summary_cp %>%
+ select(
+ date_PD, store, aisle, aisle_part, orient,
+ amount_of_dates_in_section, amount_of_dates_with_CP_in_section,
+ is_CP_Date_in_section, CPs_registered_in_date_section
+ )
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
> summary_changepoints_in_data()
[1] "Reading PD_J501.csv"
[1] "Reading PD_J502.csv"
[1] "Reading PD_J503.csv"
[1] "Reading PD_J504.csv"
[1] "Reading PD_J510.csv"
[1] "Reading PD_J512.csv"
[1] "Reading PD_J513.csv"
[1] "Reading PD_J514.csv"
[1] "Reading PD_J519.csv"
[1] "Reading PD_J624.csv"
[1] "Reading PD_J659.csv"
[1] "Reading PD_J762.csv"
[1] "Reading PD_J770.csv"
[1] "Reading PD_J775.csv"
> setwd("C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_data_fix")
> results_501 <- read.csv("PD_J501.csv")
Error in file(file, "rt") : no se puede abrir la conexión
Además: Warning message:
In file(file, "rt") :
no fue posible abrir el archivo 'PD_J501.csv': No such file or directory
> setwd("C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data")
> results_501 <- read.csv("PD_J501.csv")
> View(results_501)
> setwd("C:/Users/Admin/Downloads/Zippedi/Data/Stage
3/Merged_complete_and_incomplete")
Error in file(out, "wt") : no se puede abrir la conexión
> data_501 <- read.csv("PD_J501.csv")
> View(results_501)
> View(data_501)
> print(paste("Número de valores con linear_px_final = 0:",
sum(data_501$linear_px_final == 0)))
[1] "Número de valores con linear_px_final = 0: 61284"
Error in file(out, "wt") : no se puede abrir la conexión
> print(paste("Porcentaje de valores con linear_px_final = 0:",
100*sum(data_501$linear_px_final == 0)/nrow(data_501), "%"))
[1] "Porcentaje de valores con linear_px_final = 0: 1.35001116858662 %"
Warning message:
In if (match < 0) { :
la condición tiene longitud > 1 y sólo el primer elemento será usado
> setwd("C:/Users/Admin/Downloads/Zippedi/Data/Provided_data/Facings")
Error in file(out, "wt") : no se puede abrir la conexión
> setwd("C:/Users/Admin/Downloads/Zippedi/Data/Stage 2/Predicted_Data")
Error in file(out, "wt") : no se puede abrir la conexión
> facings_501 <- read.csv("PD_J501.csv")
Error in file(out, "wt") : no se puede abrir la conexión
> setwd("C:/Users/Admin/Downloads/Zippedi/Data/Provided_data/Facings")
> facings_501 <- read.csv("Facings_J501.csv")
> View(facings_501)
> facings_501$linear_px <- facings_501$linear_px/1000
Warning message:
In if (match < 0) { :
la condición tiene longitud > 1 y sólo el primer elemento será usado
> # Crear una tabla de frecuencia de los valores en la columna
CPs_registered_in_date_section
> frequencies <- table(results_501$CPs_registered_in_date_section)
>
> # Convertir la tabla de frecuencia en un dataframe
> frequency_df <- as.data.frame(frequencies)
>
> # Renombrar las columnas para mayor claridad
> colnames(frequency_df) <- c("CPs_registered_in_date_section", "Frequency")
>
> # Mostrar el dataframe resultante
> print(frequency_df)
CPs_registered_in_date_section Frequency
1 0 34447
2 1 9162
3 2 3936
4 3 1895
5 4 1057
6 5 588
7 6 405
8 7 263
9 8 183
10 9 125
11 10 105
12 11 89
13 12 52
14 13 62
15 14 30
16 15 44
17 16 40
18 17 21
19 18 22
20 19 21
21 20 20
22 21 15
23 22 17
24 23 17
25 24 10
26 25 14
27 26 12
28 27 5
29 28 6
30 29 12
31 30 11
32 31 5
33 32 4
34 33 11
35 34 9
36 35 7
37 36 6
38 37 8
39 38 3
40 39 3
41 40 8
42 41 5
43 42 4
44 43 8
45 44 1
46 45 5
47 46 4
48 47 4
49 48 5
50 49 3
51 50 5
52 51 3
53 52 5
54 53 5
55 54 1
56 55 4
57 56 3
58 57 3
59 58 4
60 59 1
61 60 1
62 61 4
63 62 3
64 63 3
65 64 1
66 65 7
67 67 2
68 68 2
69 69 1
70 70 1
71 71 3
72 72 4
73 73 5
74 74 5
75 75 6
76 76 3
77 77 1
78 78 2
79 80 1
80 81 3
81 83 5
82 84 3
83 85 2
84 86 1
85 87 2
86 89 3
87 90 4
88 92 2
89 93 5
90 94 2
91 95 1
92 97 1
93 98 1
94 99 3
95 101 1
96 103 1
97 106 1
98 108 1
99 109 2
100 110 1
101 111 1
102 112 2
103 114 2
104 117 1
105 120 1
106 121 1
107 125 2
108 128 1
109 131 1
110 132 1
111 134 1
112 141 1
113 142 3
114 146 1
115 147 1
116 150 1
117 151 2
118 154 1
119 156 1
120 157 2
121 160 1
122 164 1
123 165 1
124 172 1
125 186 1
126 187 1
127 203 1
128 210 1
129 220 1
130 223 2
131 242 1
132 247 1
133 253 1
134 307 1
135 354 1
136 373 1
137 401 1
>
> results_grouped <- results_501 %>%
+ group_by(fecha) %>%
+ summarise(total_CPs_in_date = sum(CPs_registered_in_date_section, na.rm =
TRUE))
Error in `group_by()`:
! Must group by variables found in `.data`.
x Column `fecha` is not found.
Run `rlang::last_trace()` to see where the error occurred.
> results_grouped <- results_501 %>%
+ group_by(date_PD) %>%
+ summarise(total_CPs_in_date = sum(CPs_registered_in_date_section, na.rm =
TRUE))
> View(results_grouped)
> #Revisar cambios estructurales
> #usar fechas con menos cambios
> results_grouped <- results_grouped %>%
+ mutate(date_PD = as.Date(fecha, format = "%Y-%m-%d"))
Error in `mutate()`:
i In argument: `date_PD = as.Date(fecha, format = "%Y-%m-%d")`.
Caused by error in `as.Date()`:
! objeto 'fecha' no encontrado
Run `rlang::last_trace()` to see where the error occurred.
> results_grouped <- results_grouped %>%
+ mutate(date_PD = as.Date(date_PD, format = "%Y-%m-%d"))
> p <- ggplot(results_grouped, aes(x = total_CPs_in_date, y = date_PD)) +
+ geom_histogram(stat = "identity", fill = "blue") +
+ labs(title = "Histograma de CPs Totales por Fecha",
+ x = "Total CPs en la Fecha",
+ y = "Fecha") +
+ theme_minimal()
Warning message:
Ignoring unknown parameters: binwidth, bins, pad
> print(p)
> p <- ggplot(results_grouped, aes(x = date_PD, y = total_CPs_in_date)) +
+ geom_col(fill = "blue") +
+ labs(title = "Histograma de CPs Totales por Fecha",
+ x = "Fecha",
+ y = "Total CPs en la Fecha") +
+ theme_minimal()
> print(p)
> #Revisar pasillos donde solo hay unileiver
> Unileiver_aisles <- c("13B", "13C", "14B", "15B", "15C", "16C", "17B", "17C",
"22B", "23B", "24B")
> results_501_unileiver <- results_501 %>%
Warning messages:
1: In if (match < 0) { :
la condición tiene longitud > 1 y sólo el primer elemento será usado
2: In if (match < 0) { :
la condición tiene longitud > 1 y sólo el primer elemento será usado
+ filter(aisle %in% Unileiver_aisles)
> View(results_501_unileiver)
> results_grouped_501_unileiver <- results_501_unileiver %>% group_by(date_PD) %>%
summarise(total_CPs_in_date = sum(CPs_registered_in_date_section, na.rm = TRUE))
Warning message:
In if (match < 0) { :
la condición tiene longitud > 1 y sólo el primer elemento será usado
> View(results_grouped_501_unileiver)
> p <- ggplot(results_grouped_501_unileiver, aes(x = date_PD, y =
total_CPs_in_date)) + geom_col(fill = "blue") + labs(title = "Histograma de CPs
Totales por Fecha para pasillos con productos Unileiver", x = "Fecha", y = "Total
CPs en la Fecha") + theme_minimal()
> > print(p)
Error: inesperado '>' en ">"
> p <- ggplot(results_grouped_501_unileiver, aes(x = date_PD, y =
total_CPs_in_date)) + geom_col(fill = "blue") + labs(title = "Histograma de CPs
Totales por Fecha para pasillos con productos Unileiver", x = "Fecha", y = "Total
CPs en la Fecha") + theme_minimal()
> print(p)
> results_grouped_501_unileiver <- results_grouped_501_unileiver %>% mutate(date_PD
= as.Date(date_PD, format = "%Y-%m-%d"))
> p <- ggplot(results_grouped_501_unileiver, aes(x = date_PD, y =
total_CPs_in_date)) + geom_col(fill = "blue") + labs(title = "Histograma de CPs
Totales por Fecha para pasillos con productos Unileiver", x = "Fecha", y = "Total
CPs en la Fecha") + theme_minimal()
> print(p)
> View(results_grouped_501_unileiver)
> View(results_501_unileiver)
> # Crear una tabla de frecuencia de los valores en la columna
CPs_registered_in_date_section
Warning messages:
1: In if (match < 0) { :
la condición tiene longitud > 1 y sólo el primer elemento será usado
2: In if (match < 0) { :
la condición tiene longitud > 1 y sólo el primer elemento será usado
> frequencies <- table(results_501_unileiver$CPs_registered_in_date_section)
> # Convertir la tabla de frecuencia en un dataframe
> frequency_501_unileiver <- as.data.frame(frequencies)
> # Renombrar las columnas para mayor claridad
> colnames(frequency_501_unileiver) <- c("CPs_registered_in_date_section",
"Frequency")
> View(frequency_501_unileiver)
> View(results_grouped_501_unileiver)
You might also like
Verzani Answers
PDF
100% (8)
Verzani Answers
94 pages
The Practice of Statistics PDF
PDF
0% (2)
The Practice of Statistics PDF
1 page
ANZ Virtual Internship Module Model Answer For Task 1
PDF
No ratings yet
ANZ Virtual Internship Module Model Answer For Task 1
9 pages
notas
PDF
No ratings yet
notas
19 pages
Chapter 2. Pre-Processing Data
PDF
No ratings yet
Chapter 2. Pre-Processing Data
37 pages
InsideSherpa - Task2 - DraftSolutions - Template - RMD - Notepad - InsideSherpa - Task2 - DraftSolutions - Template
PDF
No ratings yet
InsideSherpa - Task2 - DraftSolutions - Template - RMD - Notepad - InsideSherpa - Task2 - DraftSolutions - Template
18 pages
report
PDF
No ratings yet
report
25 pages
R Functions
PDF
No ratings yet
R Functions
8 pages
Matrix, Dataframes, List
PDF
No ratings yet
Matrix, Dataframes, List
8 pages
Basic R Programming
PDF
No ratings yet
Basic R Programming
37 pages
RLAB KP
PDF
No ratings yet
RLAB KP
16 pages
Bellabeat R Script Template
PDF
No ratings yet
Bellabeat R Script Template
4 pages
R Programming
PDF
No ratings yet
R Programming
11 pages
7 K-Means Clustering
PDF
No ratings yet
7 K-Means Clustering
27 pages
DWM Practical
PDF
No ratings yet
DWM Practical
12 pages
ML Assignment Presentation
PDF
No ratings yet
ML Assignment Presentation
37 pages
Project1 - Cold Storage Case Study
PDF
No ratings yet
Project1 - Cold Storage Case Study
11 pages
PythonForMachineLearning
PDF
No ratings yet
PythonForMachineLearning
66 pages
Task 2 - Experimentation and uplift testing - Jupyter Notebook
PDF
No ratings yet
Task 2 - Experimentation and uplift testing - Jupyter Notebook
41 pages
Praktikum Modul 3
PDF
No ratings yet
Praktikum Modul 3
5 pages
Research File 3
PDF
No ratings yet
Research File 3
10 pages
Analysis Using Statistical: Introduction & Data Exploration
PDF
No ratings yet
Analysis Using Statistical: Introduction & Data Exploration
23 pages
HW 1 Math 380 R Code
PDF
No ratings yet
HW 1 Math 380 R Code
4 pages
Da (22C01156)
PDF
No ratings yet
Da (22C01156)
26 pages
Python For Business Decision Making Asm2
PDF
No ratings yet
Python For Business Decision Making Asm2
21 pages
1data Cleansing Cheklist
PDF
No ratings yet
1data Cleansing Cheklist
2 pages
Big Data - Lab 3
PDF
No ratings yet
Big Data - Lab 3
25 pages
Machine Learning Project Roadmap
PDF
No ratings yet
Machine Learning Project Roadmap
4 pages
DATA MINING EX1
PDF
No ratings yet
DATA MINING EX1
10 pages
Report For Task2
PDF
No ratings yet
Report For Task2
23 pages
Cluster R
PDF
No ratings yet
Cluster R
1 page
Exercise - 6: DS203-2024-S1 Problem1:: Statistics
PDF
No ratings yet
Exercise - 6: DS203-2024-S1 Problem1:: Statistics
10 pages
Report For Task2
PDF
No ratings yet
Report For Task2
23 pages
Experiment 5
PDF
No ratings yet
Experiment 5
13 pages
Customer Segmentation 1683225943
PDF
No ratings yet
Customer Segmentation 1683225943
34 pages
Zhang Haoze 202112 MSC
PDF
No ratings yet
Zhang Haoze 202112 MSC
114 pages
Creating A Single Data Frame From A Collection of Files
PDF
No ratings yet
Creating A Single Data Frame From A Collection of Files
6 pages
saurabh
PDF
No ratings yet
saurabh
22 pages
Semi-Automated Exploratory Data Analysis (EDA) in Python - by Destin Gong - Mar, 2021 - Towards Data
PDF
No ratings yet
Semi-Automated Exploratory Data Analysis (EDA) in Python - by Destin Gong - Mar, 2021 - Towards Data
3 pages
R 5 Marks
PDF
No ratings yet
R 5 Marks
11 pages
Da Lab It
PDF
No ratings yet
Da Lab It
20 pages
BigMart PDF
PDF
100% (1)
BigMart PDF
42 pages
Finalproj Aml
PDF
No ratings yet
Finalproj Aml
69 pages
dv mid internal 1
PDF
No ratings yet
dv mid internal 1
8 pages
FDA_E0323040_20_12_24
PDF
No ratings yet
FDA_E0323040_20_12_24
4 pages
R-Programming Lab Mannual
PDF
No ratings yet
R-Programming Lab Mannual
33 pages
Sta238 Wks - Week1+2
PDF
No ratings yet
Sta238 Wks - Week1+2
35 pages
Intro To Data Science Lecture 4
PDF
No ratings yet
Intro To Data Science Lecture 4
13 pages
Curso Básico de Iniciación A La Programación Con R Álvaro Mauricio Bustamante Lozano
PDF
No ratings yet
Curso Básico de Iniciación A La Programación Con R Álvaro Mauricio Bustamante Lozano
9 pages
DEV RECORD AIDS
PDF
No ratings yet
DEV RECORD AIDS
24 pages
DMBI IAT-2 IMP QUES SOLN
PDF
No ratings yet
DMBI IAT-2 IMP QUES SOLN
43 pages
The Error Code
PDF
No ratings yet
The Error Code
17 pages
lab book
PDF
No ratings yet
lab book
24 pages
Data Manipulation Workshop Handout
PDF
No ratings yet
Data Manipulation Workshop Handout
46 pages
基于Engle-Granger的低频、高频统计套利研究
PDF
No ratings yet
基于Engle-Granger的低频、高频统计套利研究
22 pages
R Program Record Book Iba
PDF
No ratings yet
R Program Record Book Iba
24 pages
Stastistics and Probability With R Programming Language: Lab Report
PDF
50% (2)
Stastistics and Probability With R Programming Language: Lab Report
44 pages
All Codes
PDF
No ratings yet
All Codes
10 pages
Sakhil Assignment 02
PDF
No ratings yet
Sakhil Assignment 02
8 pages
Oracle Certified Professional Java Programmer OCPJP 1Z0 809
From Everand
Oracle Certified Professional Java Programmer OCPJP 1Z0 809
Manish Soni
No ratings yet
Computer Engineering Laboratory Solution Primer
From Everand
Computer Engineering Laboratory Solution Primer
Karan Bhandari
No ratings yet
Sap BPC
PDF
No ratings yet
Sap BPC
6 pages
Waveshaper 1000b 4000b Ds
PDF
No ratings yet
Waveshaper 1000b 4000b Ds
4 pages
Iot Unit6
PDF
No ratings yet
Iot Unit6
35 pages
Node JS IMP Question and Answer SY BBA CA Sem IV
PDF
No ratings yet
Node JS IMP Question and Answer SY BBA CA Sem IV
33 pages
Chapter 11 Arrays1
PDF
No ratings yet
Chapter 11 Arrays1
14 pages
Learn Powerpoint
PDF
No ratings yet
Learn Powerpoint
320 pages
Files in Computer Programming
PDF
No ratings yet
Files in Computer Programming
35 pages
Mayank_resume (1)
PDF
No ratings yet
Mayank_resume (1)
1 page
Guido De Vita - Technical Project Manager CV
PDF
No ratings yet
Guido De Vita - Technical Project Manager CV
4 pages
Ey Erformance Ndicators: 4wire KPI Dashboard K P I
PDF
No ratings yet
Ey Erformance Ndicators: 4wire KPI Dashboard K P I
4 pages
WWW - Slideshare - Net - Rahulkushwaha06 - Computer Science Investigatory Project Class 12
PDF
No ratings yet
WWW - Slideshare - Net - Rahulkushwaha06 - Computer Science Investigatory Project Class 12
35 pages
Infrastructure Pentesting PDF
PDF
No ratings yet
Infrastructure Pentesting PDF
13 pages
File Handling: Prepared By: Sudipta Baladhikary Computer Science Department Hariyana Vidya Mandir
PDF
No ratings yet
File Handling: Prepared By: Sudipta Baladhikary Computer Science Department Hariyana Vidya Mandir
9 pages
Develop Computer Program Assignment
PDF
No ratings yet
Develop Computer Program Assignment
2 pages
Untitled
PDF
No ratings yet
Untitled
10 pages
System Administration Assignment
PDF
No ratings yet
System Administration Assignment
12 pages
JD - PEGA Developer
PDF
No ratings yet
JD - PEGA Developer
1 page
GRNK Guithth Onbt
PDF
No ratings yet
GRNK Guithth Onbt
2 pages
18csl58 Dbms Lab Manual 2022-23
PDF
No ratings yet
18csl58 Dbms Lab Manual 2022-23
72 pages
What Is DevOps? - IBM
PDF
No ratings yet
What Is DevOps? - IBM
20 pages
CRM Requirements Template: Activities General
PDF
No ratings yet
CRM Requirements Template: Activities General
5 pages
Init Setup
PDF
No ratings yet
Init Setup
2 pages
Food Delivery
PDF
No ratings yet
Food Delivery
9 pages
Cloud Security Implications For Finanical Services
PDF
No ratings yet
Cloud Security Implications For Finanical Services
9 pages
Magic Mirror: The Magpi Issue 54
PDF
No ratings yet
Magic Mirror: The Magpi Issue 54
2 pages
Final Term Question Paper Class XI Computer Science
PDF
No ratings yet
Final Term Question Paper Class XI Computer Science
4 pages
PDF (Ebook PDF) Computer Systems: A Programmer's Perspective 3nd Edition Download
PDF
100% (6)
PDF (Ebook PDF) Computer Systems: A Programmer's Perspective 3nd Edition Download
41 pages
Divertidos Ensayos Persuasivos
PDF
100% (1)
Divertidos Ensayos Persuasivos
6 pages
Network Security Record
PDF
No ratings yet
Network Security Record
57 pages