
We managed to fix the problem of extremely high values.

Now we need to set the negative values to 0 (check the percentage of values that are negative: 1.35%).
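A minimal sketch of that check, in R; the file name is a placeholder and the data frame is only an illustration of the clamping done by change_linear_px_neg_values() further down:

# Hypothetical example: load one of the merged store files (file name is a placeholder)
df <- read.csv("C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_data/PD_Jxxx.csv")

# Share of negative linear_px_final values
pct_negative <- 100 * sum(df$linear_px_final < 0, na.rm = TRUE) / nrow(df)
print(paste("Percentage of negative values:", round(pct_negative, 2), "%"))

# Clamp negatives to 0, as change_linear_px_neg_values() does further down
df$linear_px_final[df$linear_px_final < 0] <- 0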

+ for (col in missing_cols) {
+ data2 <- data2 %>%
+ mutate(!!sym(col) := NA)
+ }
+
+ missing_cols <- setdiff(colnames(data2), colnames(data1))
+ for (col in missing_cols) {
+ data1 <- data1 %>%
+ mutate(!!sym(col) := NA)
+ }
+
+ df <- rbind(data1, data2)
+ rm(data1)
+ rm(data2)
+ gc()
+
+ # Sort the data frame so that rows with a non-NA value in VENTA.Un. appear first
+ df <- df[order(!is.na(df$VENTA.Un.), decreasing = TRUE), ]
+
+ # Remove duplicates, keeping the first occurrence (which will have a non-NA VENTA.Un. if one exists)
+ df <- df[!duplicated(df[c("item", "date_PD", "aisle", "aisle_part", "orient")]), ]
+
+ setwd(output_dir)
+ write.csv(df, file2)
+ }
+ }
+
+ }
>
> detect_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_complete_and_incomplete"
+
+ # Check files available
+ setwd(input_dir)
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ detect_changepoints <- function(data, column_name) {
+ # Note: column_name is not actually used; both location columns are processed on every call
+ # Group the data by 'aisle', 'aisle_part', 'orient', 'item'
+ grouped_data <- data %>%
+ group_by(aisle, aisle_part, orient, item)
+
+ # Apply the changepoint analysis to each group
+ results <- grouped_data %>%
+ mutate(CP_Date_X = {
+ # Check the group size before running the analysis
+ if (n() > 2) {
+ cpt_result <- cpt.mean(.data[["Location_x_mean"]], penalty = "AIC", pen.value = 0, method = "PELT", Q = 5,
+ test.stat = "Normal", class = TRUE, param.estimates = TRUE)
+ cp_dates <- rep(0, n())
+ cp_dates[cpts(cpt_result)] <- 1
+ } else {
+ cp_dates <- rep(0, n())
+ }
+ cp_dates
+ },
+ CP_Date_Z = {
+ # Check the group size before running the analysis
+ if (n() > 2) {
+ cpt_result <- cpt.mean(.data[["Location_z_mean"]], penalty = "AIC", pen.value = 0, method = "PELT", Q = 5,
+ test.stat = "Normal", class = TRUE, param.estimates = TRUE)
+ cp_dates <- rep(0, n())
+ cp_dates[cpts(cpt_result)] <- 1
+ } else {
+ cp_dates <- rep(0, n())
+ }
+ cp_dates
+ })
+
+ # Ungroup the data
+ ungrouped_results <- results %>% ungroup()
+
+ return(ungrouped_results)
+ }
+
+ for (file in files) {
+ # Read the CSV file
+ df <- read.csv(file)
+
+ # Detect changepoints for Location_x_mean
+ df <- detect_changepoints(df, "Location_x_mean")
+
+ # Detect changepoints for Location_z_mean (note: the helper already recomputes both columns, so this second call is redundant)
+ df <- detect_changepoints(df, "Location_z_mean")
+
+ # Save the modified CSV file
+ write.csv(df, file, row.names = FALSE)
+
+ # Free memory
+ rm(df)
+ gc()
+ }
+ }
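For reference, a self-contained toy run of the changepoint call used above; this assumes the changepoint package is installed, and the series is invented purely to show how cpts() maps onto the 0/1 flags:

library(changepoint)

# Invented series with a mean shift after index 10
set.seed(1)
x <- c(rnorm(10, mean = 0), rnorm(10, mean = 5))

# Same call pattern as in detect_changepoints()
cpt_result <- cpt.mean(x, penalty = "AIC", pen.value = 0, method = "PELT", Q = 5,
                       test.stat = "Normal", class = TRUE, param.estimates = TRUE)

# cpts() returns the indices of the detected changepoints; flag those positions with 1
flags <- rep(0, length(x))
flags[cpts(cpt_result)] <- 1
print(cpts(cpt_result))
print(flags)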
>
> # REVIEW
>
> # 1) Check negative and extremely high values of linear_px_final (OK)
>
> change_linear_px_neg_values <- function() {
+
+ setwd("C:/Users/Admin/Downloads/Zippedi/Data/Stage 3")
+
+ # Set directories to use
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_data"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_data_fix"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ data <- read.csv(file.path(input_dir, file))
+
+ # Replace negative values in linear_px_final with 0
+ data$linear_px_final[data$linear_px_final < 0] <- 0
+
+ # Write the modified data to the output directory
+ write.csv(data, file.path(output_dir, file), row.names = FALSE)
+
+ # Clean up
+ rm(data)
+ gc()
+ }
+
+ # Set directories to use
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Product_data"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Product_data_fix"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ data <- read.csv(file.path(input_dir, file))
+
+ # Replace negative values in linear_px_final with 0
+ data$linear_px_final[data$linear_px_final < 0] <- 0
+
+ # Write the modified data to the output directory
+ write.csv(data, file.path(output_dir, file), row.names = FALSE)
+
+ # Clean up
+ rm(data)
+ gc()
+ }
+ }
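A quick sanity check one could run after the function above; this is only a sketch and assumes at least one corrected CSV was written to the output directory:

# Verify the clamp on one of the corrected files
output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_data_fix"
fixed_files <- list.files(output_dir, pattern = "\\.csv$", full.names = TRUE)
fixed <- read.csv(fixed_files[1])
stopifnot(all(fixed$linear_px_final >= 0, na.rm = TRUE))
print(paste("Values now equal to 0:", sum(fixed$linear_px_final == 0, na.rm = TRUE)))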
>
> # 2) Build a time series and see how many changes are detected per date (there can be more than one on a given date if it happens for multiple products), and also on which dates
> # changepoints occur
>
> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Y == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in every section of the store
+
+ summary_cp <- data %>%
+ group_by(date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ aisle = first(aisle),
+ aisle_part = first(aisle_part),
+ orient = first(orient),
+ date_PD = first(date_PD),
+ store = first(store),
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date)
+ ) %>%
+ ungroup()
+
+ # 3) How many dates (in total) in the timeline have changepoints?
+
+ summary_cp <- summary_cp %>%
+ group_by(aisle, aisle_part, orient) %>%
+ summarise(
+ amount_of_dates_in_section = n_distinct(date_PD)
+ ) %>%
+ ungroup()
+
+ summary_cp <- summary_cp %>%
+ group_by(aisle, aisle_part, orient) %>%
+ filter(is_CP_Date_in_section == 1) %>%
+ summarise(
+ amount_of_dates_with_CP_in_section = n_distinct(date_PD)
+ ) %>%
+ ungroup()
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
Error in file(out, "wt") : cannot open the connection
> summary_changepoints_in_data()
[1] "Creating directory..."
[1] "Directory successfully created."
[1] "Reading PD_J501.csv"
Error in `$<-.data.frame`(`*tmp*`, "CP_Date", value = logical(0)) :
replacement has 0 rows, data has 4539518
Called from: `$<-.data.frame`(`*tmp*`, "CP_Date", value = logical(0))
Browse[1]> Q
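The "replacement has 0 rows" error above occurs because the data has no CP_Date_Y column (the changepoint step wrote CP_Date_X and CP_Date_Z), so data$CP_Date_Y is NULL and the ifelse() returns logical(0); the revision further down that tests CP_Date_Z instead fixes this. A minimal reproduction of the mechanism:

# Toy data frame with the columns that actually exist
d <- data.frame(CP_Date_X = c(1, 0), CP_Date_Z = c(0, 0))

d$CP_Date_Y                                          # NULL: the column does not exist
ifelse(d$CP_Date_X == 1 | d$CP_Date_Y == 1, 1, 0)    # logical(0), length 0
# d$CP_Date <- ifelse(d$CP_Date_X == 1 | d$CP_Date_Y == 1, 1, 0)  # would raise "replacement has 0 rows"
ifelse(d$CP_Date_X == 1 | d$CP_Date_Z == 1, 1, 0)    # 1 0, as intended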
> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Y == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in every section of the store
+
+ summary_cp <- data %>%
+ group_by(date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ aisle = first(aisle),
+ aisle_part = first(aisle_part),
+ orient = first(orient),
+ date_PD = first(date_PD),
+ store = first(store),
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date),
+ .groups = 'drop'
+ )
+
+ # 3) How many dates (in total) in the timeline have changepoints?
+
+ summary_cp_dates <- summary_cp %>%
+ group_by(aisle, aisle_part, orient) %>%
+ summarise(
+ amount_of_dates_in_section = n_distinct(date_PD),
+ .groups = 'drop'
+ )
+
+ summary_cp_with_cp <- summary_cp %>%
+ group_by(aisle, aisle_part, orient) %>%
+ filter(is_CP_Date_in_section == 1) %>%
+ summarise(
+ amount_of_dates_with_CP_in_section = n_distinct(date_PD),
+ .groups = 'drop'
+ )
+
+ # Check whether the number of rows after filtering is 0
+ if (nrow(summary_cp_with_cp) == 0) {
+ summary_cp_with_cp <- data.frame(
+ aisle = character(0),
+ aisle_part = character(0),
+ orient = character(0),
+ amount_of_dates_with_CP_in_section = integer(0)
+ )
+ }
+
+ # Combine both results
+ summary_cp <- left_join(summary_cp_dates, summary_cp_with_cp, by = c("aisle", "aisle_part", "orient"))
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
Error in file(out, "wt") : cannot open the connection
> summary_changepoints_in_data()
[1] "Reading PD_J501.csv"
Error in `$<-.data.frame`(`*tmp*`, "CP_Date", value = logical(0)) :
replacement has 0 rows, data has 4539518
Called from: `$<-.data.frame`(`*tmp*`, "CP_Date", value = logical(0))
Browse[1]> Q
Error in file(out, "wt") : cannot open the connection
> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Y == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in every section of the store
+
+ summary_cp <- data %>%
+ group_by(date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ store = first(store),
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date),
+ ) %>%
+ ungroup()
+
+ # 3) How many dates (in total) in the timeline have changepoints?
+
+ summary_cp <- summary_cp %>%
+ group_by(aisle, aisle_part, orient) %>%
+ summarise(
+ amount_of_dates_in_section = n_distinct(date_PD),
+ amount_of_dates_with_CP_in_section = n_distinct(date_PD[is_CP_Date_in_section == 1])
+ ) %>%
+ ungroup()
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
> summary_changepoints_in_data()
[1] "Reading PD_J501.csv"
Error in `$<-.data.frame`(`*tmp*`, "CP_Date", value = logical(0)) :
replacement has 0 rows, data has 4539518
Called from: `$<-.data.frame`(`*tmp*`, "CP_Date", value = logical(0))
Browse[1]> Q
> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Z == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in every section of the store
+
+ summary_cp <- data %>%
+ group_by(date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ store = first(store),
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date),
+ ) %>%
+ ungroup()
+
+ # 3) How many dates (in total) in the timeline have changepoints?
+
+ summary_cp <- summary_cp %>%
+ group_by(aisle, aisle_part, orient) %>%
+ summarise(
+ amount_of_dates_in_section = n_distinct(date_PD),
+ amount_of_dates_with_CP_in_section = n_distinct(date_PD[is_CP_Date_in_section == 1])
+ ) %>%
+ ungroup()
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
Error in file(out, "wt") : cannot open the connection
> summary_changepoints_in_data()
[1] "Reading PD_J501.csv"
`summarise()` has grouped output by 'date_PD', 'aisle', 'aisle_part'. You can
override using the `.groups`
argument.
`summarise()` has grouped output by 'aisle', 'aisle_part'. You can override using
the `.groups` argument.
[1] "Reading PD_J502.csv"
`summarise()` has grouped output by 'date_PD', 'aisle', 'aisle_part'. You can
override using the `.groups`
argument.
`summarise()` has grouped output by 'aisle', 'aisle_part'. You can override using
the `.groups` argument.
[1] "Reading PD_J503.csv"
Called from: scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
nmax = nrows, skip = 0, na.strings = na.strings, quiet = TRUE,
fill = fill, strip.white = strip.white, blank.lines.skip = blank.lines.skip,
multi.line = FALSE, comment.char = comment.char, allowEscapes = allowEscapes,
flush = flush, encoding = encoding, skipNul = skipNul)
Browse[1]> Q
> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Ensure CP_Date_X and CP_Date_Z columns exist
+ if (!("CP_Date_X" %in% colnames(data)) | !("CP_Date_Z" %in% colnames(data))) {
+ stop("CP_Date_X or CP_Date_Z column is missing in the data")
+ }
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Z == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in every section of the store
+ summary_cp <- data %>%
+ group_by(date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ store = first(store),
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date),
+ .groups = 'drop'
+ ) %>%
+ ungroup() %>%
+ group_by(aisle, aisle_part, orient) %>%
+ summarise(
+ store = first(store),
+ amount_of_dates_in_section = n_distinct(date_PD),
+ amount_of_dates_with_CP_in_section = n_distinct(date_PD[is_CP_Date_in_section == 1]),
+ .groups = 'drop'
+ ) %>%
+ ungroup()
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
> summary_changepoints_in_data()
[1] "Reading PD_J501.csv"
[1] "Reading PD_J502.csv"
[1] "Reading PD_J503.csv"
[1] "Reading PD_J504.csv"
[1] "Reading PD_J510.csv"
[1] "Reading PD_J512.csv"
[1] "Reading PD_J513.csv"
Called from: scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
nmax = nrows, skip = 0, na.strings = na.strings, quiet = TRUE,
fill = fill, strip.white = strip.white, blank.lines.skip = blank.lines.skip,
multi.line = FALSE, comment.char = comment.char, allowEscapes = allowEscapes,
flush = flush, encoding = encoding, skipNul = skipNul)
Browse[1]> Q
Error in file(out, "wt") : cannot open the connection
Error in file(out, "wt") : cannot open the connection
In addition: Warning messages:
1: In if (match < 0) { :
  the condition has length > 1 and only the first element will be used
2: In if (match < 0) { :
  the condition has length > 1 and only the first element will be used
3: In if (match < 0) { :
  the condition has length > 1 and only the first element will be used
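The scan() errors above surfaced while read.csv was parsing one of the input files; the transcript does not show their cause, and the same files load cleanly in the final run below. If the problem recurs, a generic way to look for ragged or truncated rows is to compare field counts per line (sketch only, not run in this session; the file name is a placeholder):

# Count fields per line of a suspect CSV (placeholder file name)
input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_complete_and_incomplete"
f <- file.path(input_dir, "PD_Jxxx.csv")
n_fields <- count.fields(f, sep = ",", quote = "\"")
table(n_fields, useNA = "ifany")               # a deviating field count points at a ragged or truncated row
which(n_fields != max(n_fields, na.rm = TRUE))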
> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Ensure CP_Date_X and CP_Date_Z columns exist
+ if (!("CP_Date_X" %in% colnames(data)) | !("CP_Date_Z" %in% colnames(data))) {
+ stop("CP_Date_X or CP_Date_Z column is missing in the data")
+ }
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Z == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in every section of the store
+ summary_cp_by_date <- data %>%
+ group_by(store, date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date),
+ .groups = 'drop'
+ )
+
+ # Summarise the general statistics
+ summary_cp_general <- summary_cp_by_date %>%
+ group_by(store, aisle, aisle_part, orient) %>%
+ summarise(
+ amount_of_dates_in_section = n_distinct(date_PD),
+ amount_of_dates_with_CP_in_section = n_distinct(date_PD[is_CP_Date_in_section == 1]),
+ .groups = 'drop'
+ )
+
+ # Since we do not need additional summarising here, the left join is done directly
+ summary_cp <- summary_cp_general %>%
+ left_join(summary_cp_by_date, by = c("store", "aisle", "aisle_part", "orient"))
+
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
> summary_changepoints_in_data()
[1] "Reading PD_J501.csv"
[1] "Reading PD_J502.csv"
[1] "Reading PD_J503.csv"
[1] "Reading PD_J504.csv"
Called from: scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
nmax = nrows, skip = 0, na.strings = na.strings, quiet = TRUE,
fill = fill, strip.white = strip.white, blank.lines.skip = blank.lines.skip,
multi.line = FALSE, comment.char = comment.char, allowEscapes = allowEscapes,
flush = flush, encoding = encoding, skipNul = skipNul)
Browse[1]> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Ensure CP_Date_X and CP_Date_Z columns exist
+ if (!("CP_Date_X" %in% colnames(data)) | !("CP_Date_Z" %in% colnames(data))) {
+ stop("CP_Date_X or CP_Date_Z column is missing in the data")
+ }
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Z == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in every section of the store
+ summary_cp_by_date <- data %>%
+ group_by(store, date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date),
+ .groups = 'drop'
+ )
+
+ # Summarise the general statistics
+ summary_cp_general <- summary_cp_by_date %>%
+ group_by(store, aisle, aisle_part, orient) %>%
+ summarise(
+ amount_of_dates_in_section = n_distinct(date_PD),
+ amount_of_dates_with_CP_in_section = n_distinct(date_PD[is_CP_Date_in_section == 1]),
+ .groups = 'drop'
+ )
+
+ # Join the summaries
+ summary_cp <- summary_cp_general %>%
+ left_join(summary_cp_by_date, by = c("store", "aisle", "aisle_part", "orient"))
+
+ # Reorder columns
+ summary_cp <- summary_cp %>%
+ select(
+ date_PD, store, aisle, aisle_part, orient,
+ amount_of_dates_in_section, amount_of_dates_with_CP_in_section,
+ is_CP_Date_in_section, CPs_registered_in_date_section
+ )
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
Browse[1]> Q
> summary_changepoints_in_data <- function() {
+
+ input_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_complete_and_incomplete"
+ output_dir <- "C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data"
+
+ # Create directory in case it does not exist
+ if (!file.exists(output_dir)) {
+ print("Creating directory...")
+ dir.create(output_dir, recursive = TRUE)
+ print("Directory successfully created.")
+ }
+
+ # Check files available
+ files <- list.files(path = input_dir, pattern = "\\.csv$")
+
+ for (file in files) {
+ # Read the CSV file
+ print(paste("Reading", file))
+ data <- read.csv(file.path(input_dir, file))
+
+ # Ensure CP_Date_X and CP_Date_Z columns exist
+ if (!("CP_Date_X" %in% colnames(data)) | !("CP_Date_Z" %in% colnames(data))) {
+ stop("CP_Date_X or CP_Date_Z column is missing in the data")
+ }
+
+ # Create CP_Date column
+ data$CP_Date <- ifelse(data$CP_Date_X == 1 | data$CP_Date_Z == 1, 1, 0)
+
+ # Create timeseries to observe how many changepoints are registered by day in every section of the store
+ summary_cp_by_date <- data %>%
+ group_by(store, date_PD, aisle, aisle_part, orient) %>%
+ summarise(
+ is_CP_Date_in_section = max(CP_Date),
+ CPs_registered_in_date_section = sum(CP_Date),
+ .groups = 'drop'
+ )
+
+ # Summarise the general statistics
+ summary_cp_general <- summary_cp_by_date %>%
+ group_by(store, aisle, aisle_part, orient) %>%
+ summarise(
+ amount_of_dates_in_section = n_distinct(date_PD),
+ amount_of_dates_with_CP_in_section = n_distinct(date_PD[is_CP_Date_in_section == 1]),
+ .groups = 'drop'
+ )
+
+ # Join the summaries
+ summary_cp <- summary_cp_general %>%
+ left_join(summary_cp_by_date, by = c("store", "aisle", "aisle_part", "orient"))
+
+ # Reorder columns
+ summary_cp <- summary_cp %>%
+ select(
+ date_PD, store, aisle, aisle_part, orient,
+ amount_of_dates_in_section, amount_of_dates_with_CP_in_section,
+ is_CP_Date_in_section, CPs_registered_in_date_section
+ )
+
+ setwd(output_dir)
+
+ write.csv(summary_cp, file.path(output_dir, file), row.names = FALSE)
+ rm(summary_cp)
+ gc()
+ }
+ }
> summary_changepoints_in_data()
[1] "Reading PD_J501.csv"
[1] "Reading PD_J502.csv"
[1] "Reading PD_J503.csv"
[1] "Reading PD_J504.csv"
[1] "Reading PD_J510.csv"
[1] "Reading PD_J512.csv"
[1] "Reading PD_J513.csv"
[1] "Reading PD_J514.csv"
[1] "Reading PD_J519.csv"
[1] "Reading PD_J624.csv"
[1] "Reading PD_J659.csv"
[1] "Reading PD_J762.csv"
[1] "Reading PD_J770.csv"
[1] "Reading PD_J775.csv"
> setwd("C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_data_fix")
> results_501 <- read.csv("PD_J501.csv")
Error in file(file, "rt") : cannot open the connection
In addition: Warning message:
In file(file, "rt") :
  cannot open file 'PD_J501.csv': No such file or directory
> setwd("C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Summary_for_data")
> results_501 <- read.csv("PD_J501.csv")
> View(results_501)
> setwd("C:/Users/Admin/Downloads/Zippedi/Data/Stage 3/Merged_complete_and_incomplete")
Error in file(out, "wt") : cannot open the connection
> data_501 <- read.csv("PD_J501.csv")
> View(results_501)
> View(data_501)
> print(paste("Number of values with linear_px_final = 0:", sum(data_501$linear_px_final == 0)))
[1] "Number of values with linear_px_final = 0: 61284"
Error in file(out, "wt") : cannot open the connection
> print(paste("Percentage of values with linear_px_final = 0:", 100*sum(data_501$linear_px_final == 0)/nrow(data_501), "%"))
[1] "Percentage of values with linear_px_final = 0: 1.35001116858662 %"
Warning message:
In if (match < 0) { :
  the condition has length > 1 and only the first element will be used
> setwd("C:/Users/Admin/Downloads/Zippedi/Data/Provided_data/Facings")
Error in file(out, "wt") : cannot open the connection
> setwd("C:/Users/Admin/Downloads/Zippedi/Data/Stage 2/Predicted_Data")
Error in file(out, "wt") : cannot open the connection
> facings_501 <- read.csv("PD_J501.csv")
Error in file(out, "wt") : cannot open the connection
> setwd("C:/Users/Admin/Downloads/Zippedi/Data/Provided_data/Facings")
> facings_501 <- read.csv("Facings_J501.csv")
> View(facings_501)
> facings_501$linear_px <- facings_501$linear_px/1000
Warning message:
In if (match < 0) { :
  the condition has length > 1 and only the first element will be used
> # Create a frequency table of the values in the CPs_registered_in_date_section column
> frequencies <- table(results_501$CPs_registered_in_date_section)
>
> # Convert the frequency table into a data frame
> frequency_df <- as.data.frame(frequencies)
>
> # Rename the columns for clarity
> colnames(frequency_df) <- c("CPs_registered_in_date_section", "Frequency")
>
> # Display the resulting data frame
> print(frequency_df)
CPs_registered_in_date_section Frequency
1 0 34447
2 1 9162
3 2 3936
4 3 1895
5 4 1057
6 5 588
7 6 405
8 7 263
9 8 183
10 9 125
11 10 105
12 11 89
13 12 52
14 13 62
15 14 30
16 15 44
17 16 40
18 17 21
19 18 22
20 19 21
21 20 20
22 21 15
23 22 17
24 23 17
25 24 10
26 25 14
27 26 12
28 27 5
29 28 6
30 29 12
31 30 11
32 31 5
33 32 4
34 33 11
35 34 9
36 35 7
37 36 6
38 37 8
39 38 3
40 39 3
41 40 8
42 41 5
43 42 4
44 43 8
45 44 1
46 45 5
47 46 4
48 47 4
49 48 5
50 49 3
51 50 5
52 51 3
53 52 5
54 53 5
55 54 1
56 55 4
57 56 3
58 57 3
59 58 4
60 59 1
61 60 1
62 61 4
63 62 3
64 63 3
65 64 1
66 65 7
67 67 2
68 68 2
69 69 1
70 70 1
71 71 3
72 72 4
73 73 5
74 74 5
75 75 6
76 76 3
77 77 1
78 78 2
79 80 1
80 81 3
81 83 5
82 84 3
83 85 2
84 86 1
85 87 2
86 89 3
87 90 4
88 92 2
89 93 5
90 94 2
91 95 1
92 97 1
93 98 1
94 99 3
95 101 1
96 103 1
97 106 1
98 108 1
99 109 2
100 110 1
101 111 1
102 112 2
103 114 2
104 117 1
105 120 1
106 121 1
107 125 2
108 128 1
109 131 1
110 132 1
111 134 1
112 141 1
113 142 3
114 146 1
115 147 1
116 150 1
117 151 2
118 154 1
119 156 1
120 157 2
121 160 1
122 164 1
123 165 1
124 172 1
125 186 1
126 187 1
127 203 1
128 210 1
129 220 1
130 223 2
131 242 1
132 247 1
133 253 1
134 307 1
135 354 1
136 373 1
137 401 1
>
> results_grouped <- results_501 %>%
+ group_by(fecha) %>%
+ summarise(total_CPs_in_date = sum(CPs_registered_in_date_section, na.rm = TRUE))
Error in `group_by()`:
! Must group by variables found in `.data`.
x Column `fecha` is not found.
Run `rlang::last_trace()` to see where the error occurred.
> results_grouped <- results_501 %>%
+ group_by(date_PD) %>%
+ summarise(total_CPs_in_date = sum(CPs_registered_in_date_section, na.rm = TRUE))
> View(results_grouped)
> # Review structural changes
> # use the dates with the fewest changes
> results_grouped <- results_grouped %>%
+ mutate(date_PD = as.Date(fecha, format = "%Y-%m-%d"))
Error in `mutate()`:
i In argument: `date_PD = as.Date(fecha, format = "%Y-%m-%d")`.
Caused by error in `as.Date()`:
! object 'fecha' not found
Run `rlang::last_trace()` to see where the error occurred.
> results_grouped <- results_grouped %>%
+ mutate(date_PD = as.Date(date_PD, format = "%Y-%m-%d"))
> p <- ggplot(results_grouped, aes(x = total_CPs_in_date, y = date_PD)) +
+ geom_histogram(stat = "identity", fill = "blue") +
+ labs(title = "Histogram of total CPs per date",
+ x = "Total CPs on the date",
+ y = "Date") +
+ theme_minimal()
Warning message:
Ignoring unknown parameters: binwidth, bins, pad
> print(p)
> p <- ggplot(results_grouped, aes(x = date_PD, y = total_CPs_in_date)) +
+ geom_col(fill = "blue") +
+ labs(title = "Histogram of total CPs per date",
+ x = "Date",
+ y = "Total CPs on the date") +
+ theme_minimal()
> print(p)
> # Check aisles that carry only Unilever products
> Unileiver_aisles <- c("13B", "13C", "14B", "15B", "15C", "16C", "17B", "17C", "22B", "23B", "24B")
> results_501_unileiver <- results_501 %>%
+ filter(aisle %in% Unileiver_aisles)
Warning messages:
1: In if (match < 0) { :
  the condition has length > 1 and only the first element will be used
2: In if (match < 0) { :
  the condition has length > 1 and only the first element will be used
> View(results_501_unileiver)
> results_grouped_501_unileiver <- results_501_unileiver %>% group_by(date_PD) %>% summarise(total_CPs_in_date = sum(CPs_registered_in_date_section, na.rm = TRUE))
Warning message:
In if (match < 0) { :
  the condition has length > 1 and only the first element will be used
> View(results_grouped_501_unileiver)
> p <- ggplot(results_grouped_501_unileiver, aes(x = date_PD, y = total_CPs_in_date)) + geom_col(fill = "blue") + labs(title = "Histogram of total CPs per date for aisles with Unilever products", x = "Date", y = "Total CPs on the date") + theme_minimal()
> > print(p)
Error: unexpected '>' in ">"
> p <- ggplot(results_grouped_501_unileiver, aes(x = date_PD, y = total_CPs_in_date)) + geom_col(fill = "blue") + labs(title = "Histogram of total CPs per date for aisles with Unilever products", x = "Date", y = "Total CPs on the date") + theme_minimal()
> print(p)
> results_grouped_501_unileiver <- results_grouped_501_unileiver %>% mutate(date_PD = as.Date(date_PD, format = "%Y-%m-%d"))
> p <- ggplot(results_grouped_501_unileiver, aes(x = date_PD, y = total_CPs_in_date)) + geom_col(fill = "blue") + labs(title = "Histogram of total CPs per date for aisles with Unilever products", x = "Date", y = "Total CPs on the date") + theme_minimal()
> print(p)
> View(results_grouped_501_unileiver)
> View(results_501_unileiver)
> # Create a frequency table of the values in the CPs_registered_in_date_section column
Warning messages:
1: In if (match < 0) { :
  the condition has length > 1 and only the first element will be used
2: In if (match < 0) { :
  the condition has length > 1 and only the first element will be used
> frequencies <- table(results_501_unileiver$CPs_registered_in_date_section)
> # Convert the frequency table into a data frame
> frequency_501_unileiver <- as.data.frame(frequencies)
> # Rename the columns for clarity
> colnames(frequency_501_unileiver) <- c("CPs_registered_in_date_section", "Frequency")
> View(frequency_501_unileiver)
> View(results_grouped_501_unileiver)
