This cheat sheet provides a comprehensive overview of key concepts and commands in Statistics and Data Science, including basic statistics, data manipulation, model fitting, simulation, and visualizations. It includes syntax for R programming, such as calculating means, creating frequency tables, and generating plots. The document serves as a quick reference for performing statistical analyses and visualizations using R.
This cheat sheet provides a comprehensive overview of key concepts and commands in Statistics and Data Science, including basic statistics, data manipulation, model fitting, simulation, and visualizations. It includes syntax for R programming, such as calculating means, creating frequency tables, and generating plots. The document serves as a quick reference for performing statistical analyses and visualizations using R.
outcome = explanatory + other stuff # compute five-number summary mean(data_set$Y) Y = X + other stuff favstats(~ Y, data = data_set) var(data_set$Y) sd(data_set$Y) # create frequency table tally(data_set$Y) cohensD(Y ~ X, data = data_set) Basics tally(~ Y, data = data_set) cor(Y ~ X, data = data_set) print("Hello world!") # tally by condition b1(Y ~ X, data = data_set) # assign value to object tally(~ Y < 1900, data = data_set) b1(one_model) myNumber <- 5 # two-way frequency table pre(Y ~ X, data = data_set) # combine values into vector tally(Y ~ X, data = data_set, margin = TRUE, f(Y ~ X, data = data_set) myVector <- c(1, 2, 3) format = "proportion")
# first element in vector Data Frame
myVector[1] # structure of data frame # arrange rows by variable # orders values or cases str(data_set) arrange(data_set, Y) sort(myVector) # view first/last six rows # creates data frame from csv file # arithmetic operations head(data_set) data_set <- read.csv("file_name", header = TRUE) sum(1, 2, 100), +, -, *, / tail(data_set) sqrt(157) # convert quantitative variable abs(data_set$Y) # select multiple variables # to categorical select(data_set, Y1, Y2) factor(data_set$Y) # logical operations factor(data_set$Y, levels = c(1, 2), labels = >, <, >=, <=, ==, !=, |, & # first six rows of selected variables c("A", "B")) head(select(data_set, Y1, Y2)) # results in a variable with values # transform values # of TRUE or FALSE recode(data_set$Y, "0" = 0, "1" = 50, "2" = 100) data_set$C <- data_set$A > data_set$B # select variable (a column) data_set$Y # creates two equal-sized groups Probability Distribution ntile(data_set$Y, 2) # calculate the probability area # find rows that meet condition # convert categorical variable xpnorm(65.1, data_set$mean, data_set$sd) data_set[data_set$Y > 40, ] # to quantitative filter(data_set, Y > 300) as.numeric(data_set$Y) zscore(data_set$Y) filter(data_set, Y != "NA")
# returns t at this probability
qt(.975, df = 999) # returns F at this probability qf(.95, df1 = 1, df2 = 100)
# CI using t distribution confint(empty_model)
# calculate p-value using F-distribution
xpf(sample_F, df1 = 2 , df2 = 10)
Page: 1 ▷ Updated: 2023-04 ▷ Learn more about CourseKata @ https://coursekata.org
Statistics and Data Science I (ABC) CHEAT SHEET Simulation Fitting and Evaluating Models # sample without replacement # bootstrap sampling distribution of b1s, # empty model sample(data_set, 6) # centered on sample b1 empty_model <- lm(Y ~ NULL, sdob1_boot <- do(1000) * data = data_set) # sample with replacement b1(Y ~ X, data = resample(data_set)) resample(data_set, 10) # use one explanatory variable # count the number of b1s at the upper one_model <- lm(Y ~ X, data = data_set) do(3) * resample(data_set, 10) # and lower extreme tally(sdob1$b1 > sample_b1 | # create a function from a formula # mixes up values in a variable sdob1$b1 < -sample_b1) one_model_fun <- makeFun(one_model) shuffle(data_set$Y) one_model_fun(x_level_1) # simulate sampling 10000 Ys # return TRUE for middle 95% of distribution # from normal distribution middle(sdob1$b1, .95) # model predictions and residuals sim_Y <- rnorm(10000, Y_stats$mean, data_set$empty_predict <- predict(empty_model) Y_stats$sd) # randomize sampling distribution of PREs data_set$empty_resid <- resid(empty_model) sdoPRE <- do(1000) * PRE(shuffle(Y) ~ X, # put simulated Ys into dataframe data = data_set) # produce ANOVA table data_set <- data.frame(sim_Y) anova(empty_model) # randomize sampling distribution of Fs supernova(one_model) # simulate sampling distribution of sdoF <- do(1000) * means fVal(shuffle(Y) ~ X, data = data_set) # t-test, using pooled variance sim_SDoM <- do(10000) * mean(rnorm(157, t.test(Tip ~ Condition, data = data_set, Y_stats$mean, Y_stats$sd)) # counts extreme Fs var.equal = TRUE) tally(~fVal > sample_F, data = sdoF) # bootstrap sampling distribution of # pairwise comparison means # corrections: "Bonferroni" or "none" bootSDoM <- do(10000) * pairwise(one_model, correction = "none") mean(resample(data_set$Y, 157))
# randomize sampling distribution
# of b1s, centered on 0 sdob1 <- do(1000) * b1(shuffle(Y) ~ X, data = data_set)
Page: 2 ▷ Updated: 2023-04 ▷ Learn more about CourseKata @ https://coursekata.org
Statistics and Data Science I (ABC) CHEAT SHEET Visualizations gf_boxplot(Y ~ X, data = data_set) # sampling distribution of b1 gf_histogram(~ Y, data = data_set) %>% gf_histogram(~b1, data = sdob1, # change labels fill = ~middle(b1, .95)) %>% gf_labs(title = "Graph Title", x = "Y_Name", # modify the limits on x- and y-axes y = "Frequency") gf_lims(x = c(-12, 12), y = c(0, 70))
gf_point(Y ~ X, data = data_set)
# faceted grid of histograms
gf_histogram(~ Y, data = data_set) %>% gf_facet_grid(X ~ .)
gf_point(Y ~ X, data = data_set) %>%
# add model predictions as red points gf_point(Y ~ X, shape = 1, size = 3, color = "firebrick") %>% gf_jitter(Y ~ X, data = data_set) # add best fitting model as a red line gf_model(one_model, color = "red")
Instant Download The Handbook of Financial Modeling: A Practical Approach to Creating and Implementing Valuation Projection Models 2nd Edition Jack Avon PDF All Chapters