0% found this document useful (0 votes)
71 views8 pages

ANOVA

xstk

Uploaded by

Tuấn PP
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
71 views8 pages

ANOVA

xstk

Uploaded by

Tuấn PP
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 8

# Load libraries

library(tidyverse)
library(readr)
library(httr)

# Download and unzip the dataset


# Source archive for the LAX -> JFK flight sample. The address must be a
# single-line literal: a line break inside the quotes becomes part of the URL.
url <- "https://dax-cdn.cdn.appdomain.cloud/dax-airline/1.0.1/lax_to_jfk.tar.gz"
archive_path <- "lax_to_jfk.tar.gz"

# Stream the response body straight to disk, replacing any earlier download.
response <- GET(url, write_disk(archive_path, overwrite = TRUE))

# Abort on an HTTP error so a failed download is never handed to untar().
stop_for_status(response)

# Extract the archive into the current working directory.
untar(archive_path)

# Read the CSV file


# Parse the extracted CSV. Only the two diversion columns get an explicit
# numeric parser; every other column is left to readr's type guessing.
# NOTE(review): presumably these columns are forced because they are almost
# entirely NA and would otherwise be guessed as logical -- confirm.
sub_airline <- read_csv(
  "lax_to_jfk/lax_to_jfk.csv",
  col_types = cols(
    DivDistance = col_number(),
    DivArrDelay = col_number()
  )
)

# Display the first few rows

head(sub_airline, n = 6)

Response
[https://dax-cdn.cdn.appdomain.cloud/dax-airline/1.0.1/lax_to_jfk.tar.gz]
Date: 2024-10-15 11:42
Status: 200
Content-Type: application/x-gzip
Size: 58.4 kB
<ON DISK> C:\Users\VUONG\Desktop\R\1\lax_to_jfk.tar.gzNULL

Month DayOfWeek FlightDate Reporting_Airline Origin Dest CRSDepTime


1 3 5 2003-03-28 UA LAX JFK 2210
2 11 4 2018-11-29 AS LAX JFK 1045
3 8 5 2015-08-28 UA LAX JFK 0805
4 4 7 2003-04-20 DL LAX JFK 2205
5 11 3 2005-11-30 UA LAX JFK 0840
6 4 1 1992-04-06 UA LAX JFK 1450
CRSArrTime DepTime ArrTime ⋯ ArrDelayMinutes CarrierDelay
WeatherDelay
1 0615 2209 0617 ⋯ 2 NA NA

2 1912 1049 1851 ⋯ 0 NA NA

3 1634 0757 1620 ⋯ 0 NA NA

4 0619 2212 0616 ⋯ 0 NA NA

5 1653 0836 1640 ⋯ 0 NA NA

6 2308 1452 2248 ⋯ 0 NA NA


NASDelay SecurityDelay LateAircraftDelay DepDelay DepDelayMinutes
DivDistance
1 NA NA NA -1 0 NA

2 NA NA NA 4 4 NA

3 NA NA NA -8 0 NA

4 NA NA NA 7 7 NA

5 NA NA NA -4 0 NA

6 NA NA NA 2 2 NA

DivArrDelay
1 NA
2 NA
3 NA
4 NA
5 NA
6 NA

# Report the number of rows and columns in the loaded data.
dim(sub_airline)

[1] 2855 21

# List the column names of the loaded data.
colnames(sub_airline)

[1] "Month" "DayOfWeek" "FlightDate"


[4] "Reporting_Airline" "Origin" "Dest"
[7] "CRSDepTime" "CRSArrTime" "DepTime"
[10] "ArrTime" "ArrDelay" "ArrDelayMinutes"
[13] "CarrierDelay" "WeatherDelay" "NASDelay"
[16] "SecurityDelay" "LateAircraftDelay" "DepDelay"
[19] "DepDelayMinutes" "DivDistance" "DivArrDelay"

# Box plot of ArrDelay per airline with the individual flights jittered on
# top. The y axis is zoomed (not filtered) to the 0-99th percentile range so
# the most extreme delays do not flatten the boxes.
ggplot(
  data = sub_airline,
  mapping = aes(x = Reporting_Airline, y = ArrDelay)
) +
  geom_boxplot(fill = "bisque", color = "black", alpha = 0.3) +
  # The colour is a fixed setting, so it goes outside aes(): the points are
  # really drawn in blue and no legend is created (which also removes the
  # need for the deprecated `guides(color = FALSE)`).
  geom_jitter(color = "blue", alpha = 0.2) +
  labs(x = "Airline") +
  ggtitle("Arrival Delays by Airline") +
  theme_minimal() +
  # na.rm = TRUE keeps quantile() from erroring if any ArrDelay is missing.
  coord_cartesian(
    ylim = quantile(sub_airline$ArrDelay, c(0, 0.99), na.rm = TRUE)
  )

Warning message:
"The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none"
instead as
of ggplot2 3.3.4."
# Flights of carrier "AS" that have both delay values recorded and a
# departure delay below 40. Conditions passed to a single filter() call are
# combined with AND.
alaska_flights <- sub_airline %>%
  filter(
    Reporting_Airline == "AS",
    !is.na(DepDelay),
    !is.na(ArrDelay),
    DepDelay < 40
  )

# Scatter plot of departure delay against arrival delay for the
# alaska_flights subset.
ggplot(
  data = alaska_flights,
  mapping = aes(x = DepDelay, y = ArrDelay)
) +
  geom_point() +
  # Title spelling corrected ("Depature" -> "Departure").
  ggtitle("Alaska Flight Departure Delays vs Arrival Delays")
# Mean ArrDelayMinutes for every airline / day-of-week pair, plus a `bins`
# factor that buckets the mean for use as a heat-map fill.
avg_delays <- sub_airline %>%
  group_by(Reporting_Airline, DayOfWeek) %>%
  summarize(
    # na.rm = TRUE matches the per-airline summary elsewhere in this script.
    mean_delays = mean(ArrDelayMinutes, na.rm = TRUE),
    # Drop the grouping so the mutate() below works on the whole table.
    .groups = "drop"
  ) %>%
  mutate(
    bins = cut(
      mean_delays,
      # An open-ended top break replaces max(mean_delays): with the grouping
      # kept, that maximum was the row's own value, which made cut() fail
      # with "'breaks' are not unique" whenever a mean equalled a break.
      breaks = c(-0.1, 0.1, 10, 20, 30, 50, Inf),
      labels = c("0", "0-10", "10-20", "20-30", "30-50", ">50")
    ),
    # Reverse the level order so the largest bucket comes first.
    bins = factor(as.character(bins), levels = rev(levels(bins)))
  )

# Heat map of mean arrival delay: airlines on x, day of week on y, tiles
# filled by delay bucket and annotated with the rounded mean.
ggplot(
  avg_delays,
  aes(
    x = Reporting_Airline,
    # DayOfWeek is coded 1 = Monday ... 7 = Sunday in this data (the
    # 2003-03-28 flight, a Friday, has DayOfWeek 5). lubridate::wday() reads
    # a bare number as 1 = Sunday, which shifted every label by one day, so
    # the labels are assigned explicitly instead.
    y = factor(
      DayOfWeek,
      levels = 1:7,
      labels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
    ),
    fill = bins
  )
) +
  # `linewidth` replaces `size` for line/border widths as of ggplot2 3.4.0.
  geom_tile(colour = "white", linewidth = 0.2) +
  geom_text(aes(label = round(mean_delays, 3))) +
  guides(fill = guide_legend(title = "Delays Time Scale")) +
  labs(
    x = "Reporting Airline",
    y = "Day of Week",
    title = "Average Arrival Delays"
  ) +
  # One colour per level of `bins`, in level order.
  scale_fill_manual(
    values = c(
      "#d53e4f", "#f46d43", "#fdae61",
      "#fee08b", "#e6f598", "#abdda4"
    )
  )
# Correlation between CarrierDelay and ArrDelayMinutes, computed only over
# the rows where both values are present.
with(
  sub_airline,
  cor(CarrierDelay, ArrDelayMinutes, use = "complete.obs")
)

[1] 0.7287601

# Scatter plot of CarrierDelay against ArrDelayMinutes with a fitted
# linear-regression line.
ggplot(
  data = sub_airline,
  mapping = aes(x = CarrierDelay, y = ArrDelayMinutes)
) +
  # Rows with a missing value are dropped silently in both layers; before,
  # only geom_smooth() did so and geom_point() emitted a warning.
  geom_point(na.rm = TRUE) +
  # Stating the formula avoids the "using formula = 'y ~ x'" message.
  geom_smooth(method = "lm", formula = y ~ x, na.rm = TRUE)

`geom_smooth()` using formula = 'y ~ x'


Warning message:
"Removed 2486 rows containing missing values or values outside the
scale range
(`geom_point()`)."
library(corrplot)

corrplot 0.95 loaded

# Keep only the delay columns that feed the correlation matrix.
numerics_airline <- select(
  sub_airline,
  ArrDelayMinutes,
  DepDelayMinutes,
  CarrierDelay,
  WeatherDelay,
  NASDelay,
  SecurityDelay,
  LateAircraftDelay
)

# Pearson correlation matrix of the selected columns. Each pair of columns
# uses every row in which both of its values are present.
airlines_cor <- numerics_airline %>%
  cor(use = "pairwise.complete.obs", method = "pearson")

# Colour stops for the correlation plot, running from red through white to
# blue.
palette_stops <- c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA")

# `col` is a function: col(n) returns n colours interpolated along the stops.
col <- colorRampPalette(palette_stops)

# Draw the upper triangle of the correlation matrix as coloured cells,
# ordered by hierarchical clustering, with the coefficients printed in black
# and the text labels rotated by 45 degrees.
# The dangling trailing comma, which passed an empty argument, was removed.
corrplot(
  airlines_cor,
  method = "color",
  col = col(200),
  type = "upper",
  order = "hclust",
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 45
)
# Mean ArrDelayMinutes per airline, ignoring missing values. The result is
# stored under the name the bar chart later in this script reads from;
# previously it was only printed and never assigned.
summary_airline_delays <- sub_airline %>%
  group_by(Reporting_Airline) %>%
  summarize(Average_Delays = mean(ArrDelayMinutes, na.rm = TRUE))

# Show the table, as the unassigned pipeline used to do.
summary_airline_delays

Reporting_Airline Average_Delays
1 AA 10.12226
2 AS 12.91111
3 B6 18.55039
4 DL 13.83650
5 HP 19.21429
6 PA (1) 33.54545
7 TW 15.59459
8 UA 11.73462
9 VX 14.93798

# Arrival delay and carrier code for the two carriers being compared
# ("AA" and "AS").
aa_as_subset <- sub_airline %>%
  select(ArrDelay, Reporting_Airline) %>%
  filter(Reporting_Airline %in% c("AA", "AS"))

# One-way analysis of variance: ArrDelay explained by Reporting_Airline
# within aa_as_subset.
ad_aov <- aov(
  formula = ArrDelay ~ Reporting_Airline,
  data = aa_as_subset
)

# Print the ANOVA table (Df, sums of squares, F value, p-value).
summary(ad_aov)

Df Sum Sq Mean Sq F value Pr(>F)


Reporting_Airline 1 126 125.7 0.13 0.718
Residuals 1139 1097707 963.7
# Bar chart of the per-airline average arrival delay. geom_col() draws bars
# whose heights are the y values themselves.
ggplot(
  summary_airline_delays,
  aes(x = Reporting_Airline, y = Average_Delays)
) +
  geom_col() +
  ggtitle("Average Arrival Delays by Airline")

You might also like