
We managed to fix the problem of extremely high values.

Now we need to set the negative values to 0 (check the percentage of values that are negative: 1.35%).
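A minimal sketch of that check, in R; the file name is a placeholder and the data frame is only an illustration of the clamping done by change_linear_px_neg_values() further down:

# Hypothetical example: load one of the merged store files (file name is a placeholder)
df <- read.csv("C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_data/PD_Jxxx.csv")

# Share of negative linear_px_final values
pct_negative <- 100 * sum(df$linear_px_final < 0, na.rm = TRUE) / nrow(df)
print(paste("Percentage of negative values:", round(pct_negative, 2), "%"))

# Clamp negatives to 0, as change_linear_px_neg_values() does further down
df$linear_px_final[df$linear_px_final < 0] <- 0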

+ for (col in missing_cols) {
+ data2 <- data2 %>%
+ mutate(!!sym(col) := NA)
+ }
+
+ missing_cols <- setdiff(colnames(data2), colnames(data1))
+ for (col in missing_cols) {
+ data1 <- data1 %>%
+ mutate(!!sym(col) := NA)
+ }
+
+ df <- rbind(data1, data2)
+ rm(data1)
+ rm(data2)
+ gc()
+
+ # Sort the data frame so that rows with a non-NA value in VENTA.Un. appear first
+ df <- df[order(!is.na(df$VENTA.Un.), decreasing = TRUE), ]
+
+ # Remove duplicates, keeping the first occurrence (which will have a non-NA VENTA.Un. if one exists)
+ df <- df[!duplicated(df[c("item", "date_PD", "aisle", "aisle_part", "orient")]), ]
+
+ setwd(output_dir)
+ write.csv(df, file2)
+ }
+ }
+
+ }
>
> detect_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_complete_and_incomplete"
+
+ # Check files available
+ setwd(input_dir)
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ detect_changepoints <- function(data, column_name) {
+ # Note: column_name is not actually used; both location columns are processed on every call
+ # Group the data by 'aisle', 'aisle_part', 'orient', 'item'
+ grouped_data <- data %>%
+ group_by(aisle, aisle_part, orient, item)
+
+ # Apply the changepoint analysis to each group
+ results <- grouped_data %>%
+ mutate(CP_Date_X = {
+ # Check the group size before running the analysis
+ if (n() > 2) {
+ cpt_result <- cpt.mean(.data[["Location_x_mean"]], penalty = "AIC", pen.value = 0, method = "PELT", Q = 5,
+ test.stat = "Normal", class = TRUE, param.estimates = TRUE)
+ cp_dates <- rep(0, n())
+ cp_dates[cpts(cpt_result)] <- 1
+ } else {
+ cp_dates <- rep(0, n())
+ }
+ cp_dates
+ },
+ CP_Date_Z = {
+ # Check the group size before running the analysis
+ if (n() > 2) {
+ cpt_result <- cpt.mean(.data[["Location_z_mean"]], penalty = "AIC", pen.value = 0, method = "PELT", Q = 5,
+ test.stat = "Normal", class = TRUE, param.estimates = TRUE)
+ cp_dates <- rep(0, n())
+ cp_dates[cpts(cpt_result)] <- 1
+ } else {
+ cp_dates <- rep(0, n())
+ }
+ cp_dates
+ })
+
+ # Ungroup the data
+ ungrouped_results <- results %>% ungroup()
+
+ return(ungrouped_results)
+ }
+
+ for (file in files) {
+ # Read the CSV file
+ df <- read.csv(file)
+
+ # Detect changepoints for Location_x_mean
+ df <- detect_changepoints(df, "Location_x_mean")
+
+ # Detect changepoints for Location_z_mean (note: the helper already recomputes both columns, so this second call is redundant)
+ df <- detect_changepoints(df, "Location_z_mean")
+
+ # Save the modified CSV file
+ write.csv(df, file, row.names = FALSE)
+
+ # Free memory
+ rm(df)
+ gc()
+ }
+ }
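For reference, a self-contained toy run of the changepoint call used above; this assumes the changepoint package is installed, and the series is invented purely to show how cpts() maps onto the 0/1 flags:

library(changepoint)

# Invented series with a mean shift after index 10
set.seed(1)
x <- c(rnorm(10, mean = 0), rnorm(10, mean = 5))

# Same call pattern as in detect_changepoints()
cpt_result <- cpt.mean(x, penalty = "AIC", pen.value = 0, method = "PELT", Q = 5,
                       test.stat = "Normal", class = TRUE, param.estimates = TRUE)

# cpts() returns the indices of the detected changepoints; flag those positions with 1
flags <- rep(0, length(x))
flags[cpts(cpt_result)] <- 1
print(cpts(cpt_result))
print(flags)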
>
> # REVIEW
>
> # 1) Check negative and extremely high values of linear_px_final (OK)
>
> change_linear_px_neg_values <- function() {
+
+ setwd("C:/Users/Admin/Downloads/Zippedi/Data/Stage 3")
+
+ # Set directories to use
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_data"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_data_fix"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ data <- read.csv(file.path(input_dir, file))
+
+ # Replace negative values in linear_px_final with 0
+ data$linear_px_final[data$linear_px_final < 0] <- 0
+
+ # Write the modified data to the output directory
+ write.csv(data, file.path(output_dir, file), row.names = FALSE)
+
+ # Clean up
+ rm(data)
+ gc()
+ }
+
+ # Set directories to use
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Product_data"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Product_data_fix"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ data <- read.csv(file.path(input_dir, file))
+
+ # Replace negative values in linear_px_final with 0
+ data$linear_px_final[data$linear_px_final < 0] <- 0
+
+ # Write the modified data to the output directory
+ write.csv(data, file.path(output_dir, file), row.names = FALSE)
+
+ # Clean up
+ rm(data)
+ gc()
+ }
+ }
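A quick sanity check one could run after the function above; this is only a sketch and assumes at least one corrected CSV was written to the output directory:

# Verify the clamp on one of the corrected files
output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_data_fix"
fixed_files <- list.files(output_dir, pattern = "\\.csv$", full.names = TRUE)
fixed <- read.csv(fixed_files[1])
stopifnot(all(fixed$linear_px_final >= 0, na.rm = TRUE))
print(paste("Values now equal to 0:", sum(fixed$linear_px_final == 0, na.rm = TRUE)))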
>
> # 2) Build a time series and see how many changes are detected per date (there can be more than one on a given date if it happens for multiple products), and also on which dates
> # changepoints occur
>
> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Y == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in every section of the store
+
+ summary_cp <- data %>%
+ group_by(date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ aisle = first(aisle),
+ aisle_part = first(aisle_part),
+ orient = first(orient),
+ date_PD = first(date_PD),
+ store = first(store),
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date)
+ ) %>%
+ ungroup()
+
+ # 3) How many dates (in total) in the timeline have changepoints?
+
+ summary_cp <- summary_cp %>%
+ group_by(aisle, aisle_part, orient) %>%
+ summarise(
+ amount_of_dates_in_section = n_distinct(date_PD)
+ ) %>%
+ ungroup()
+
+ summary_cp <- summary_cp %>%
+ group_by(aisle, aisle_part, orient) %>%
+ filter(is_CP_Date_in_section == 1) %>%
+ summarise(
+ amount_of_dates_with_CP_in_section = n_distinct(date_PD)
+ ) %>%
+ ungroup()
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
Error in file(out, "wt") : cannot open the connection
> summary_changepoints_in_data()
[1] "Creating directory..."
[1] "Directory successfully created."
[1] "Reading PD_J501.csv"
Error in `$<-.data.frame`(`*tmp*`, "CP_Date", value = logical(0)) :
replacement has 0 rows, data has 4539518
Called from: `$<-.data.frame`(`*tmp*`, "CP_Date", value = logical(0))
Browse[1]> Q
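The "replacement has 0 rows" error above occurs because the data has no CP_Date_Y column (the changepoint step wrote CP_Date_X and CP_Date_Z), so data$CP_Date_Y is NULL and the ifelse() returns logical(0); the revision further down that tests CP_Date_Z instead fixes this. A minimal reproduction of the mechanism:

# Toy data frame with the columns that actually exist
d <- data.frame(CP_Date_X = c(1, 0), CP_Date_Z = c(0, 0))

d$CP_Date_Y                                          # NULL: the column does not exist
ifelse(d$CP_Date_X == 1 | d$CP_Date_Y == 1, 1, 0)    # logical(0), length 0
# d$CP_Date <- ifelse(d$CP_Date_X == 1 | d$CP_Date_Y == 1, 1, 0)  # would raise "replacement has 0 rows"
ifelse(d$CP_Date_X == 1 | d$CP_Date_Z == 1, 1, 0)    # 1 0, as intended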
> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Y == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in every section of the store
+
+ summary_cp <- data %>%
+ group_by(date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ aisle = first(aisle),
+ aisle_part = first(aisle_part),
+ orient = first(orient),
+ date_PD = first(date_PD),
+ store = first(store),
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date),
+ .groups = 'drop'
+ )
+
+ # 3) How many dates (in total) in the timeline have changepoints?
+
+ summary_cp_dates <- summary_cp %>%
+ group_by(aisle, aisle_part, orient) %>%
+ summarise(
+ amount_of_dates_in_section = n_distinct(date_PD),
+ .groups = 'drop'
+ )
+
+ summary_cp_with_cp <- summary_cp %>%
+ group_by(aisle, aisle_part, orient) %>%
+ filter(is_CP_Date_in_section == 1) %>%
+ summarise(
+ amount_of_dates_with_CP_in_section = n_distinct(date_PD),
+ .groups = 'drop'
+ )
+
+ # Check whether the number of rows after filtering is 0
+ if (nrow(summary_cp_with_cp) == 0) {
+ summary_cp_with_cp <- data.frame(
+ aisle = character(0),
+ aisle_part = character(0),
+ orient = character(0),
+ amount_of_dates_with_CP_in_section = integer(0)
+ )
+ }
+
+ # Combine both results
+ summary_cp <- left_join(summary_cp_dates, summary_cp_with_cp, by = c("aisle", "aisle_part", "orient"))
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
Error in file(out, "wt") : cannot open the connection
> summary_changepoints_in_data()
[1] "Reading PD_J501.csv"
Error in `$<-.data.frame`(`*tmp*`, "CP_Date", value = logical(0)) :
replacement has 0 rows, data has 4539518
Called from: `$<-.data.frame`(`*tmp*`, "CP_Date", value = logical(0))
Browse[1]> Q
Error in file(out, "wt") : cannot open the connection
> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Y == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in every section of the store
+
+ summary_cp <- data %>%
+ group_by(date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ store = first(store),
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date),
+ ) %>%
+ ungroup()
+
+ # 3) How many dates (in total) in the timeline have changepoints?
+
+ summary_cp <- summary_cp %>%
+ group_by(aisle, aisle_part, orient) %>%
+ summarise(
+ amount_of_dates_in_section = n_distinct(date_PD),
+ amount_of_dates_with_CP_in_section = n_distinct(date_PD[is_CP_Date_in_section == 1])
+ ) %>%
+ ungroup()
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
> summary_changepoints_in_data()
[1] "Reading PD_J501.csv"
Error in `$<-.data.frame`(`*tmp*`, "CP_Date", value = logical(0)) :
replacement has 0 rows, data has 4539518
Called from: `$<-.data.frame`(`*tmp*`, "CP_Date", value = logical(0))
Browse[1]> Q
> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Z == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in every section of the store
+
+ summary_cp <- data %>%
+ group_by(date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ store = first(store),
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date),
+ ) %>%
+ ungroup()
+
+ # 3) How many dates (in total) in the timeline have changepoints?
+
+ summary_cp <- summary_cp %>%
+ group_by(aisle, aisle_part, orient) %>%
+ summarise(
+ amount_of_dates_in_section = n_distinct(date_PD),
+ amount_of_dates_with_CP_in_section = n_distinct(date_PD[is_CP_Date_in_section == 1])
+ ) %>%
+ ungroup()
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
Error in file(out, "wt") : cannot open the connection
> summary_changepoints_in_data()
[1] "Reading PD_J501.csv"
`summarise()` has grouped output by 'date_PD', 'aisle', 'aisle_part'. You can
override using the `.groups`
argument.
`summarise()` has grouped output by 'aisle', 'aisle_part'. You can override using
the `.groups` argument.
[1] "Reading PD_J502.csv"
`summarise()` has grouped output by 'date_PD', 'aisle', 'aisle_part'. You can
override using the `.groups`
argument.
`summarise()` has grouped output by 'aisle', 'aisle_part'. You can override using
the `.groups` argument.
[1] "Reading PD_J503.csv"
Called from: scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
nmax = nrows, skip = 0, na.strings = na.strings, quiet = TRUE,
fill = fill, strip.white = strip.white, blank.lines.skip = blank.lines.skip,
multi.line = FALSE, comment.char = comment.char, allowEscapes = allowEscapes,
flush = flush, encoding = encoding, skipNul = skipNul)
Browse[1]> Q
> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Ensure CP_Date_X and CP_Date_Z columns exist
+ if (!("CP_Date_X" %in% colnames(data)) | !("CP_Date_Z" %in% colnames(data))) {
+ stop("CP_Date_X or CP_Date_Z column is missing in the data")
+ }
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Z == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in every section of the store
+ summary_cp <- data %>%
+ group_by(date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ store = first(store),
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date),
+ .groups = 'drop'
+ ) %>%
+ ungroup() %>%
+ group_by(aisle, aisle_part, orient) %>%
+ summarise(
+ store = first(store),
+ amount_of_dates_in_section = n_distinct(date_PD),
+ amount_of_dates_with_CP_in_section = n_distinct(date_PD[is_CP_Date_in_section == 1]),
+ .groups = 'drop'
+ ) %>%
+ ungroup()
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
> summary_changepoints_in_data()
[1] "Reading PD_J501.csv"
[1] "Reading PD_J502.csv"
[1] "Reading PD_J503.csv"
[1] "Reading PD_J504.csv"
[1] "Reading PD_J510.csv"
[1] "Reading PD_J512.csv"
[1] "Reading PD_J513.csv"
Called from: scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
nmax = nrows, skip = 0, na.strings = na.strings, quiet = TRUE,
fill = fill, strip.white = strip.white, blank.lines.skip = blank.lines.skip,
multi.line = FALSE, comment.char = comment.char, allowEscapes = allowEscapes,
flush = flush, encoding = encoding, skipNul = skipNul)
Browse[1]> Q
Error in file(out, "wt") : cannot open the connection
Error in file(out, "wt") : cannot open the connection
In addition: Warning messages:
1: In if (match < 0) { :
  the condition has length > 1 and only the first element will be used
2: In if (match < 0) { :
  the condition has length > 1 and only the first element will be used
3: In if (match < 0) { :
  the condition has length > 1 and only the first element will be used
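The scan() errors above surfaced while read.csv was parsing one of the input files; the transcript does not show their cause, and the same files load cleanly in the final run below. If the problem recurs, a generic way to look for ragged or truncated rows is to compare field counts per line (sketch only, not run in this session; the file name is a placeholder):

# Count fields per line of a suspect CSV (placeholder file name)
input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_complete_and_incomplete"
f <- file.path(input_dir, "PD_Jxxx.csv")
n_fields <- count.fields(f, sep = ",", quote = "\"")
table(n_fields, useNA = "ifany")               # a deviating field count points at a ragged or truncated row
which(n_fields != max(n_fields, na.rm = TRUE))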
> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Ensure CP_Date_X and CP_Date_Z columns exist
+ if (!("CP_Date_X" %in% colnames(data)) | !("CP_Date_Z" %in% colnames(data))) {
+ stop("CP_Date_X or CP_Date_Z column is missing in the data")
+ }
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Z == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in every section of the store
+ summary_cp_by_date <- data %>%
+ group_by(store, date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date),
+ .groups = 'drop'
+ )
+
+ # Summarise the general statistics
+ summary_cp_general <- summary_cp_by_date %>%
+ group_by(store, aisle, aisle_part, orient) %>%
+ summarise(
+ amount_of_dates_in_section = n_distinct(date_PD),
+ amount_of_dates_with_CP_in_section = n_distinct(date_PD[is_CP_Date_in_section == 1]),
+ .groups = 'drop'
+ )
+
+ # Since we do not need additional summarising here, the left join is done directly
+ summary_cp <- summary_cp_general %>%
+ left_join(summary_cp_by_date, by = c("store", "aisle", "aisle_part", "orient"))
+
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
> summary_changepoints_in_data()
[1] "Reading PD_J501.csv"
[1] "Reading PD_J502.csv"
[1] "Reading PD_J503.csv"
[1] "Reading PD_J504.csv"
Called from: scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
nmax = nrows, skip = 0, na.strings = na.strings, quiet = TRUE,
fill = fill, strip.white = strip.white, blank.lines.skip = blank.lines.skip,
multi.line = FALSE, comment.char = comment.char, allowEscapes = allowEscapes,
flush = flush, encoding = encoding, skipNul = skipNul)
Browse[1]> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Ensure CP_Date_X and CP_Date_Z columns exist
+ if (!("CP_Date_X" %in% colnames(data)) | !("CP_Date_Z" %in% colnames(data))) {
+ stop("CP_Date_X or CP_Date_Z column is missing in the data")
+ }
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Z == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in every section of the store
+ summary_cp_by_date <- data %>%
+ group_by(store, date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date),
+ .groups = 'drop'
+ )
+
+ # Summarise the general statistics
+ summary_cp_general <- summary_cp_by_date %>%
+ group_by(store, aisle, aisle_part, orient) %>%
+ summarise(
+ amount_of_dates_in_section = n_distinct(date_PD),
+ amount_of_dates_with_CP_in_section = n_distinct(date_PD[is_CP_Date_in_section == 1]),
+ .groups = 'drop'
+ )
+
+ # Join the summaries
+ summary_cp <- summary_cp_general %>%
+ left_join(summary_cp_by_date, by = c("store", "aisle", "aisle_part", "orient"))
+
+ # Reorder columns
+ summary_cp <- summary_cp %>%
+ select(
+ date_PD, store, aisle, aisle_part, orient,
+ amount_of_dates_in_section, amount_of_dates_with_CP_in_section,
+ is_CP_Date_in_section, CPs_registered_in_date_section
+ )
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
Browse[1]> Q
> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Ensure CP_Date_X and CP_Date_Z columns exist
+ if (!("CP_Date_X" %in% colnames(data)) | !("CP_Date_Z" %in% colnames(data))) {
+ stop("CP_Date_X or CP_Date_Z column is missing in the data")
+ }
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Z == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in every section of the store
+ summary_cp_by_date <- data %>%
+ group_by(store, date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date),
+ .groups = 'drop'
+ )
+
+ # Summarise the general statistics
+ summary_cp_general <- summary_cp_by_date %>%
+ group_by(store, aisle, aisle_part, orient) %>%
+ summarise(
+ amount_of_dates_in_section = n_distinct(date_PD),
+ amount_of_dates_with_CP_in_section = n_distinct(date_PD[is_CP_Date_in_section == 1]),
+ .groups = 'drop'
+ )
+
+ # Join the summaries
+ summary_cp <- summary_cp_general %>%
+ left_join(summary_cp_by_date, by = c("store", "aisle", "aisle_part", "orient"))
+
+ # Reorder columns
+ summary_cp <- summary_cp %>%
+ select(
+ date_PD, store, aisle, aisle_part, orient,
+ amount_of_dates_in_section, amount_of_dates_with_CP_in_section,
+ is_CP_Date_in_section, CPs_registered_in_date_section
+ )
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
> summary_changepoints_in_data()
[1] "Reading PD_J501.csv"
[1] "Reading PD_J502.csv"
[1] "Reading PD_J503.csv"
[1] "Reading PD_J504.csv"
[1] "Reading PD_J510.csv"
[1] "Reading PD_J512.csv"
[1] "Reading PD_J513.csv"
[1] "Reading PD_J514.csv"
[1] "Reading PD_J519.csv"
[1] "Reading PD_J624.csv"
[1] "Reading PD_J659.csv"
[1] "Reading PD_J762.csv"
[1] "Reading PD_J770.csv"
[1] "Reading PD_J775.csv"
> setwd("C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_data_fix")
> results_501 <- read.csv("PD_J501.csv")
Error in file(file, "rt") : cannot open the connection
In addition: Warning message:
In file(file, "rt") :
  cannot open file 'PD_J501.csv': No such file or directory
> setwd("C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data")
> results_501 <- read.csv("PD_J501.csv")
> View(results_501)
> setwd("C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_complete_and_incomplete")
Error in file(out, "wt") : cannot open the connection
> data_501 <- read.csv("PD_J501.csv")
> View(results_501)
> View(data_501)
> print(paste("Number of values with linear_px_final = 0:", sum(data_501$linear_px_final == 0)))
[1] "Number of values with linear_px_final = 0: 61284"
Error in file(out, "wt") : cannot open the connection
> print(paste("Percentage of values with linear_px_final = 0:", 100*sum(data_501$linear_px_final == 0)/nrow(data_501), "%"))
[1] "Percentage of values with linear_px_final = 0: 1.35001116858662 %"
Warning message:
In if (match < 0) { :
  the condition has length > 1 and only the first element will be used
> setwd("C:/Users/Admin/Downloads/Zippedi/Data/Provided_data/Facings")
Error in file(out, "wt") : cannot open the connection
> setwd("C:/Users/Admin/Downloads/Zippedi/Data/Stage 2/Predicted_Data")
Error in file(out, "wt") : cannot open the connection
> facings_501 <- read.csv("PD_J501.csv")
Error in file(out, "wt") : cannot open the connection
> setwd("C:/Users/Admin/Downloads/Zippedi/Data/Provided_data/Facings")
> facings_501 <- read.csv("Facings_J501.csv")
> View(facings_501)
> facings_501$linear_px <- facings_501$linear_px/1000
Warning message:
In if (match < 0) { :
  the condition has length > 1 and only the first element will be used
> # Create a frequency table of the values in the CPs_registered_in_date_section column
> frequencies <- table(results_501$CPs_registered_in_date_section)
>
> # Convert the frequency table into a data frame
> frequency_df <- as.data.frame(frequencies)
>
> # Rename the columns for clarity
> colnames(frequency_df) <- c("CPs_registered_in_date_section", "Frequency")
>
> # Display the resulting data frame
> print(frequency_df)
CPs_registered_in_date_section Frequency
1 0 34447
2 1 9162
3 2 3936
4 3 1895
5 4 1057
6 5 588
7 6 405
8 7 263
9 8 183
10 9 125
11 10 105
12 11 89
13 12 52
14 13 62
15 14 30
16 15 44
17 16 40
18 17 21
19 18 22
20 19 21
21 20 20
22 21 15
23 22 17
24 23 17
25 24 10
26 25 14
27 26 12
28 27 5
29 28 6
30 29 12
31 30 11
32 31 5
33 32 4
34 33 11
35 34 9
36 35 7
37 36 6
38 37 8
39 38 3
40 39 3
41 40 8
42 41 5
43 42 4
44 43 8
45 44 1
46 45 5
47 46 4
48 47 4
49 48 5
50 49 3
51 50 5
52 51 3
53 52 5
54 53 5
55 54 1
56 55 4
57 56 3
58 57 3
59 58 4
60 59 1
61 60 1
62 61 4
63 62 3
64 63 3
65 64 1
66 65 7
67 67 2
68 68 2
69 69 1
70 70 1
71 71 3
72 72 4
73 73 5
74 74 5
75 75 6
76 76 3
77 77 1
78 78 2
79 80 1
80 81 3
81 83 5
82 84 3
83 85 2
84 86 1
85 87 2
86 89 3
87 90 4
88 92 2
89 93 5
90 94 2
91 95 1
92 97 1
93 98 1
94 99 3
95 101 1
96 103 1
97 106 1
98 108 1
99 109 2
100 110 1
101 111 1
102 112 2
103 114 2
104 117 1
105 120 1
106 121 1
107 125 2
108 128 1
109 131 1
110 132 1
111 134 1
112 141 1
113 142 3
114 146 1
115 147 1
116 150 1
117 151 2
118 154 1
119 156 1
120 157 2
121 160 1
122 164 1
123 165 1
124 172 1
125 186 1
126 187 1
127 203 1
128 210 1
129 220 1
130 223 2
131 242 1
132 247 1
133 253 1
134 307 1
135 354 1
136 373 1
137 401 1
>
> results_grouped <- results_501 %>%
+ group_by(fecha) %>%
+ summarise(total_CPs_in_date = sum(CPs_registered_in_date_section, na.rm = TRUE))
Error in `group_by()`:
! Must group by variables found in `.data`.
x Column `fecha` is not found.
Run `rlang::last_trace()` to see where the error occurred.
> results_grouped <- results_501 %>%
+ group_by(date_PD) %>%
+ summarise(total_CPs_in_date = sum(CPs_registered_in_date_section, na.rm = TRUE))
> View(results_grouped)
> # Review structural changes
> # use the dates with the fewest changes
> results_grouped <- results_grouped %>%
+ mutate(date_PD = as.Date(fecha, format = "%Y-%m-%d"))
Error in `mutate()`:
i In argument: `date_PD = as.Date(fecha, format = "%Y-%m-%d")`.
Caused by error in `as.Date()`:
! object 'fecha' not found
Run `rlang::last_trace()` to see where the error occurred.
> results_grouped <- results_grouped %>%
+ mutate(date_PD = as.Date(date_PD, format = "%Y-%m-%d"))
> p <- ggplot(results_grouped, aes(x = total_CPs_in_date, y = date_PD)) +
+ geom_histogram(stat = "identity", fill = "blue") +
+ labs(title = "Histogram of total CPs per date",
+ x = "Total CPs on the date",
+ y = "Date") +
+ theme_minimal()
Warning message:
Ignoring unknown parameters: binwidth, bins, pad
> print(p)
> p <- ggplot(results_grouped, aes(x = date_PD, y = total_CPs_in_date)) +
+ geom_col(fill = "blue") +
+ labs(title = "Histogram of total CPs per date",
+ x = "Date",
+ y = "Total CPs on the date") +
+ theme_minimal()
> print(p)
> # Check aisles that carry only Unilever products
> Unileiver_aisles <- c("13B", "13C", "14B", "15B", "15C", "16C", "17B", "17C", "22B", "23B", "24B")
> results_501_unileiver <- results_501 %>%
+ filter(aisle %in% Unileiver_aisles)
Warning messages:
1: In if (match < 0) { :
  the condition has length > 1 and only the first element will be used
2: In if (match < 0) { :
  the condition has length > 1 and only the first element will be used
> View(results_501_unileiver)
> results_grouped_501_unileiver <- results_501_unileiver %>% group_by(date_PD) %>% summarise(total_CPs_in_date = sum(CPs_registered_in_date_section, na.rm = TRUE))
Warning message:
In if (match < 0) { :
  the condition has length > 1 and only the first element will be used
> View(results_grouped_501_unileiver)
> p <- ggplot(results_grouped_501_unileiver, aes(x = date_PD, y = total_CPs_in_date)) + geom_col(fill = "blue") + labs(title = "Histogram of total CPs per date for aisles with Unilever products", x = "Date", y = "Total CPs on the date") + theme_minimal()
> > print(p)
Error: unexpected '>' in ">"
> p <- ggplot(results_grouped_501_unileiver, aes(x = date_PD, y = total_CPs_in_date)) + geom_col(fill = "blue") + labs(title = "Histogram of total CPs per date for aisles with Unilever products", x = "Date", y = "Total CPs on the date") + theme_minimal()
> print(p)
> results_grouped_501_unileiver <- results_grouped_501_unileiver %>% mutate(date_PD = as.Date(date_PD, format = "%Y-%m-%d"))
> p <- ggplot(results_grouped_501_unileiver, aes(x = date_PD, y = total_CPs_in_date)) + geom_col(fill = "blue") + labs(title = "Histogram of total CPs per date for aisles with Unilever products", x = "Date", y = "Total CPs on the date") + theme_minimal()
> print(p)
> View(results_grouped_501_unileiver)
> View(results_501_unileiver)
> # Create a frequency table of the values in the CPs_registered_in_date_section column
Warning messages:
1: In if (match < 0) { :
  the condition has length > 1 and only the first element will be used
2: In if (match < 0) { :
  the condition has length > 1 and only the first element will be used
> frequencies <- table(results_501_unileiver$CPs_registered_in_date_section)
> # Convert the frequency table into a data frame
> frequency_501_unileiver <- as.data.frame(frequencies)
> # Rename the columns for clarity
> colnames(frequency_501_unileiver) <- c("CPs_registered_in_date_section", "Frequency")
> View(frequency_501_unileiver)
> View(results_grouped_501_unileiver)
