Advanced Statistics and Data Science (ABCD) CHEAT SHEET
Word Equations Summary Tables Simple Statistics
outcome = explanatory + other stuff # compute five-number summary mean(data_set$Y) favstats(~ Y, data = data_set) var(data_set$Y) Y = X + other stuff sd(data_set$Y) # create frequency table Basics tally(data_set$Y) cohensD(Y ~ X, data = data_set) tally(~ Y, data = data_set) cor(Y ~ X, data = data_set) print("Hello world!") # tally by condition b1(Y ~ X, data = data_set) # assign value to object tally(~ Y < 1900, data = data_set) b1(one_model) my_number <- 5 # two-way frequency table pre(Y ~ X, data = data_set) # combine values into vector tally(Y ~ X, data = data_set, margins = TRUE, f(Y ~ X, data = data_set) my_vector <- c(1, 2, 3) format = "proportion") # sample F for X2 # first element in vector f(Y ~ X1 + X2, my_vector[1] data = data_set, predictor = ~X2)
# orders values or cases
sort(my_vector) Data Frame # arithmetic operations # structure of data frame # arrange rows by variable sum(1, 2, 100), +, -, *, / str(data_set) arrange(data_set, Y) sqrt(157) abs(data_set$Y) # view first/last six rows # creates data frame from csv file head(data_set) data_set <- read.csv("file_name", header = TRUE) # logical operations tail(data_set) >, <, >=, <=, ==, !=, |, & # convert quantitative variable # select multiple variables # to categorical # results in a new variable with values select(data_set, Y1, Y2) factor(data_set$Y) # of TRUE or FALSE factor(data_set$Y, data_set$C <- data_set$A > data_set$B # first six rows of selected variables levels = c(1,2), head(select(data_set, Y1, Y2)) labels = c("A", "B"))
Probability Distribution # transform values
# select variable (a column) recode(data_set$Y, "0" = 0, "1" = 50, "2" = 100) # calculate the probability area xpnorm(65.1, data_set$mean, data_set$sd) data_set$Y # creates two equal-sized groups # find rows that meet condition ntile(data_set$Y, 2) zscore(data_set$Y) data_set[data_set$Y > 40, ] # convert categorical variable # returns t at this probability filter(data_set, Y > 300) # to quantitative qt(.975, df = 999) as.numeric(data_set$Y) # returns F at this probability # find rows that do not have NA qf(.95, df1 = 1, df2 = 100) filter(data_set, is.na(Y) == FALSE) filter(data_set, !is.na(Y)) # CI using t distribution confint(empty_model)
# calculate p-value using F-distribution
xpf(sample_f, df1 = 2, df2 = 10)
Page: 1 ▷ Updated: 2024-10 ▷ Learn more about CourseKata @ https://coursekata.org
Advanced Statistics and Data Science (ABCD) CHEAT SHEET Simulation Fitting and Evaluating Models # sample without replacement # randomize sampling distribution of b1s, # empty model sample(data_set, 6) # centered on 0 empty_model <- lm(Y ~ NULL, sdob1 <- do(1000) * data = data_set) # sample with replacement b1(shuffle(Y) ~ X, data = data_set) resample(data_set, 10) # use one explanatory variable # bootstrap sampling distribution of b1s, one_model <- lm(Y ~ X, data = data_set) do(3) * resample(data_set, 10) # centered on sample b1 sdob1_boot <- do(1000) * # use more than one explanatory variable # mixes up values in a variable b1(Y ~ X, data = resample(data_set)) # multivariate model shuffle(data_set$Y) multi_model <- lm(Y ~ X1 + X2, data = data_set) # count the number of b1s at the upper # simulate sampling 10000 Ys # and lower extreme # all the model comparisons that can be # from normal distribution tally(sdob1$b1 > sample_b1 | # made in relation to the multivariate model sim_Y <- rnorm(10000, Y_stats$mean, sdob1$b1 < -sample_b1) generate_models(multi_model) Y_stats$sd) # model predictions and residuals # put simulated Ys into dataframe # return TRUE for middle 95% of distribution data_set$empty_predict <- predict(empty_model) data_set <- data.frame(sim_Y) middle(sdob1$b1, .95) data_set$empty_resid <- resid(empty_model) # randomize sampling distribution of PREs # produce ANOVA table # simulate sdopre <- do(1000) * pre(shuffle(Y) ~ X, anova(empty_model) # sampling distribution of means data = data_set) sdom_sim <- do(10000) * mean(rnorm(157, supernova(one_model) Y_stats$mean, Y_stats$sd)) # randomize sampling distribution of Fs sdof <- do(1000) * # t-test, using pooled variance # bootstrap f(shuffle(Y) ~ X, data = data_set) t.test(Tip ~ Condition, data = data_set, # sampling distribution of means var.equal = TRUE) sdom_boot <- do(10000) * # counts extreme Fs mean(resample(data_set$Y, 157)) sample_f <- f(Y ~ X, data = data_set) # pairwise comparison corrections:
tally(~f > sample_f, data = sdof) # "Tukey", "Bonferroni", "none" pairwise(one_model, correction = "none")
Page: 2 ▷ Updated: 2024-10 ▷ Learn more about CourseKata @ https://coursekata.org
Advanced Statistics and Data Science (ABCD) CHEAT SHEET Visualizations gf_histogram(~ Y, data = data_set) %>% # sampling distribution of b1 gf_point(Y ~ X, data = data_set) # change labels gf_histogram(~b1, data = sdob1, gf_labs(title = "Graph Title", fill = ~middle(b1, .95)) %>% x = "Y_Name", y = "Frequency") # modify the limits on x- and y-axes gf_lims(x = c(-12, 12), y = c(0, 70))
gf_jitter(Y ~ X, data = data_set)
# faceted grid of histograms gf_histogram(~ Y, data = data_set) %>% gf_facet_grid(X ~ .)
gf_point(Y ~ X, data = data_set) %>%
# add model predictions as red points gf_point(Y ~ X, shape = 1, size = 3, color = "firebrick") %>% # add best fitting model as a red line gf_boxplot(Y ~ X, data = data_set) gf_model(one_model, color = "red")
gf_dhistogram(~ Y, data = data_set,
fill = "orange") %>% gf_density()
gf_boxplot(Y ~ X, data = data_set, fill = "orange") %>% gf_jitter(height = 0, alpha = .2, size = 3)
pairwise(one_model, plot = TRUE) gf_bar(~ Y, data = data_set)
Page: 3 ▷ Updated: 2024-10 ▷ Learn more about CourseKata @ https://coursekata.org