0% found this document useful (0 votes)
13 views

r-cheatsheet-ABC

Uploaded by

ikon82870
Copyright
© All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
13 views

r-cheatsheet-ABC

Uploaded by

ikon82870
Copyright
© All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 3

Statistics and Data Science I (ABC) CHEAT SHEET

Word Equations Summary Tables Simple Statistics


outcome = explanatory + other stuff # compute five-number summary mean(data_set$Y)
favstats(~ Y, data = data_set) var(data_set$Y)
Y = X + other stuff sd(data_set$Y)
# create frequency table
tally(data_set$Y) cohensD(Y ~ X, data = data_set)
Basics tally(~ Y, data = data_set) cor(Y ~ X, data = data_set)
print("Hello world!") # tally by condition b1(Y ~ X, data = data_set)
tally(~ Y < 1900, data = data_set) b1(my_model)
# assign value to object
my_number <- 5 # two-way frequency table pre(Y ~ X, data = data_set)
tally(Y ~ X, data = data_set, margins = TRUE, f(Y ~ X, data = data_set)
# combine values into vector format = "proportion")
my_vector <- c(1, 2, 3)

# first element in vector


Data Frame
my_vector[1] # structure of data frame # arrange rows by variable
str(data_set) arrange(data_set, Y)
# orders values or cases
sort(my_vector)
# view first/last six rows # creates data frame from csv file
head(data_set) data_set <- read.csv("file_name", header = TRUE)
# arithmetic operations
sum(1, 2, 100), +, -, *, / tail(data_set)
# convert quantitative variable
sqrt(157)
# select multiple variables # to categorical
abs(data_set$Y)
select(data_set, Y1, Y2) factor(data_set$Y)
# logical operations factor(data_set$Y,
>, <, >=, <=, ==, !=, |, & # first six rows of selected variables levels = c(1,2),
head(select(data_set, Y1, Y2)) labels = c("A", "B"))
# results in a new variable with values
# of TRUE or FALSE # transform values
data_set$C <- data_set$A > data_set$B # select variable (a column) recode(data_set$Y, "0" = 0, "1" = 50, "2" = 100)
data_set$Y
Probability Distribution # creates two equal sized groups
# find rows that meet some condition ntile(data_set$Y, 2)
# calculate the probability area data_set[data_set$Y > 40, ]
xpnorm(65.1, data_set$mean, data_set$sd) filter(data_set, Y > 300) # convert categorical variable
# to quantitative
zscore(data_set$Y) as.numeric(data_set$Y)
# find rows that do not have NA
filter(data_set, is.na(Y) == FALSE)
# returns t at this probability filter(data_set, !is.na(Y))
qt(.975, df = 999)
# returns F at this probability
qf(.95, df1 = 1, df2 = 100)

# CI using t distribution
confint(empty_model)

# calculate p-value using F-distribution


xpf(sample_f, df1 = 2, df2 = 10)
Page: 1 ▷ Updated: 2024-10 ▷ Learn more about CourseKata @ https://coursekata.org
Statistics and Data Science I (ABC) CHEAT SHEET
Simulation Fitting and Evaluating Models
# sample without replacement # randomize sampling distribution of b1s, # empty model
sample(data_set, 6) # centered on 0 empty_model <- lm(Y ~ NULL, data = data_set)
sdob1 <- do(1000) *
# sample with replacement b1(shuffle(Y) ~ X, data = data_set) # use one explanatory variable
resample(data_set, 10) my_model <- lm(Y ~ X, data = data_set)
# bootstrap sampling distribution of b1s,
# centered on sample b1 # model predictions and residuals
do(3) * resample(data_set, 10) data_set$empty_predict <- predict(empty_model)
sdob1_boot <- do(1000) *
b1(Y ~ X, data = resample(data_set)) data_set$empty_resid <- resid(empty_model)
# mixes up values in a variable
shuffle(data_set$Y) # produce ANOVA table
# count the number of b1s at the upper
anova(empty_model)
# simulate sampling 10000 Ys # and lower extreme
tally(sdob1$b1 > sample_b1 | supernova(my_model)
# from normal distribution
sim_Y <- rnorm(10000, Y_stats$mean, sdob1$b1 < -sample_b1)
# t-test, using pooled variance
Y_stats$sd) t.test(Tip ~ Condition, data = data_set,
# return TRUE for middle 95% of distribution var.equal=TRUE)
# put simulated Ys into dataframe
data_set <- data.frame(sim_Y) middle(sdob1$b1, .95)
# pairwise comparison
# randomize sampling distribution of PREs # corrections: "Tukey","Bonferroni","none"
# simulate
sdopre <- do(1000) * pre(shuffle(Y) ~ X, pairwise(my_model, correction = "Tukey")
# sampling distribution of means data = data_set)
sdom_sim <- do(10000) * mean(rnorm(157,
Y_stats$mean, Y_stats$sd)) # randomize sampling distribution of Fs
sdof <- do(1000) *
# bootstrap f(shuffle(Y) ~ X, data = data_set)
# sampling distribution of means
sdom_boot <- do(10000) * # counts extreme Fs
mean(resample(data_set$Y, 157)) sample_f <- f(Y ~ X, data = data_set)
tally(~f > sample_f, data = sdof)

Page: 2 ▷ Updated: 2024-10 ▷ Learn more about CourseKata @ https://coursekata.org


Statistics and Data Science I (ABC) CHEAT SHEET
Visualizations
gf_histogram(~ Y, data = data_set) %>% # sampling distribution of b1
gf_point(Y ~ X, data = data_set) # change labels gf_histogram(~b1, data = sdob1,
gf_labs(title = "Graph Title", fill = ~middle(b1, .95)) %>%
x = "Y_Name", y = "Frequency") # modify the limits on x- and y-axes
gf_lims(x = c(-12, 12), y = c(0, 70))

gf_jitter(Y ~ X, data = data_set)


# faceted grid of histograms
gf_histogram(~ Y, data = data_set) %>%
gf_facet_grid(X ~ .)

gf_point(Y ~ X, data = data_set) %>%


# add model predictions as red points
gf_point(Y ~ X , shape = 1, size = 3,
color = "firebrick") %>%
# add best fitting model as a red line
gf_boxplot(Y ~ X, data = data_set) gf_model(my_model, color = "red")

gf_dhistogram(~ Y, data = data_set,


fill = "orange") %>%
gf_density()

gf_boxplot(Y ~ X, data = data_set, fill =


"orange") %>% pairwise(my_model, plot = TRUE)
gf_jitter(height = 0, alpha = .2, size = 3)
gf_bar( ~ Y, data = data_set)

Page: 3 ▷ Updated: 2024-10 ▷ Learn more about CourseKata @ https://coursekata.org

You might also like