# Statistics and R Programming Lab Exercise
# Ascending run 1, 2, ..., 20.
v1 <- 1:20
v1
# Descending run 20, 19, ..., 1.
v2 <- seq(from = 20, to = 1, by = -1)
v2
# Up to 20 and back down to 1, with the peak appearing only once.
v3 <- c(1:20, seq(from = 19, to = 1, by = -1))
v3
# Small vector reused by the repetition exercises.
temp <- c(4, 6, 3)
temp
# Answer (2): repetition patterns built from the values 4, 6, 3.
temp <- c(4, 6, 3)

# 2(e): the triple repeated ten times (4 6 3 4 6 3 ...).
ve <- rep(temp, times = 10)
ve

# 2(f): the same cycle cut off after 31 elements, so it ends on an extra 4.
rep(temp, length.out = 31)

# 2(g): ten 4s, then twenty 6s, then thirty 3s.
vg <- rep(temp, times = c(10, 20, 30))
vg
# Alternative spelling of the same vector, repeating each value separately.
c(rep(4, 10), rep(6, 20), rep(3, 30))
# Answer (3): evaluate exp(x) * cos(x) on the grid 3, 3.1, ..., 6.
x <- seq(from = 3, to = 6, by = 0.1)
# Note: this overwrites the v2 created for the descending sequence above.
v2 <- cos(x) * exp(x)
v2
# Answer (4): body-mass index, BMI = weight [kg] / (height [m])^2.
# The height values (160-193) are taken to be centimetres.
heights <- c(180, 165, 160, 193)
# NOTE(review): these values are identical to `heights`, which looks like a
# copy-paste slip -- confirm the intended weights (kg) against the exercise sheet.
weights <- c(180, 165, 160, 193)
# Convert centimetres to metres before squaring; squaring the raw centimetre
# values makes every BMI 10,000 times too small.
bmi <- weights / (heights / 100)^2
print(bmi)
# Answer (5): convert Celsius readings to Fahrenheit, F = (9/5) * C + 32.
temp <- c(23, 27, 19)
f <- 32 + (9 / 5) * temp
print(f)
# Answer (6): a 6 x 10 matrix of random integers drawn from 1..10.
set.seed(75)  # fixed seed so the draws are reproducible
# TRUE is spelled out: T is an ordinary variable that can be reassigned.
aMat <- matrix(sample(10, size = 60, replace = TRUE), nrow = 6)
aMat
#Answer 6)a
# Per row, the number of entries greater than 4.
rowSums(aMat > 4)
#Answer 6)b
# Rows that contain exactly two 7s.
which(rowSums(aMat == 7) == 2)
#Answer 6)c
# Pairs of columns whose combined total exceeds 75.
aMatColSums <- colSums(aMat)
aMatColSums
# outer() builds every pairwise sum, so each column paired with itself and
# both orderings (j, k) and (k, j) appear in the result.
which(outer(aMatColSums, aMatColSums, "+") > 75, arr.ind = TRUE)
#or
#Answer (7)
# One vector per variable of a four-observation data set.
Number      <- as.numeric(1:4)
Diet        <- rep(c("Poor", "Good"), each = 2)
Sex         <- rep(c("M", "F"), times = 2)
Weight      <- c(156, 180, 167, 190)
fat.content <- c(34, 43, 40, 50)
Morph       <- c("Winged", "Winged", "Wingless", "Intermediate")
#Answer (8)
# class()  : the class an object belongs to
# mode()   : a mutually exclusive classification of objects by their basic structure
# typeof() : the data type of an object
# is.*()   : predicates that test whether an object is of a given kind
# rm()     : deletes objects from memory
v <- cbind(c(1, 2, 3), c(4, 5, 6))
v
class(v)      # matrix and array
mode(v)       # numeric
typeof(v)     # double
is.array(v)   # TRUE
is.matrix(v)  # TRUE
rm(v)
# `v` no longer exists. try() still shows the "object not found" error but
# lets the rest of the script run instead of aborting here.
try(v)
#Answer (9)
# Three fiscal years for each of three companies, stored company by company.
fy <- rep(c(2010, 2011, 2012), times = 3)
fy
company <- rep(c("Apple", "Google", "Microsoft"), each = 3)
company
revenue <- c(
  65225, 108249, 156508,
  29321, 37905, 50175,
  62484, 69943, 73723
)
revenue
profit <- c(
  14013, 25922, 41733,
  8505, 9737, 10737,
  18760, 23150, 16978
)
profit
# Assemble the columns into one data frame and save it as CSV.
companiesData <- data.frame(fy, company, revenue, profit)
companiesData
write.csv(companiesData, file = "ray.csv")
# Basic inspection of the data frame.
head(companiesData)
dim(companiesData)
nrow(companiesData)
ncol(companiesData)
str(companiesData)
# Smallest and largest profit.
min(companiesData$profit)
max(companiesData$profit)
# Year and profit for fiscal years 2011 onwards.
subset(companiesData, fy >= 2011, select = c(fy, profit))
#Answer (10)
# 91 "female" entries followed by 92 "male" entries.
gender <- factor(c(rep("female", 91), rep("male", 92)))
table(gender)
# Supplying the levels explicitly only changes the order in which they are
# listed; the count for each label stays the same.
gender <- factor(gender, levels = c("male", "female"))
table(gender)
# Levels are matched case-sensitively: no value equals "Male", so every
# "male" entry becomes NA and table() shows a count of zero for "Male".
gender <- factor(gender, levels = c("Male", "female"))
table(gender)
rm(gender)
# `gender` no longer exists after rm(). try() still shows the
# "object not found" error but lets the rest of the script run.
try(gender)
#Answer (11)
# Print the cube of each integer from 1 to 7, one value per line.
for (i in seq_len(7)) {
  print(i^3)
}
#Answer (12)
#Answer (13)
# 3 x 3 test matrix, filled row by row.
mx <- matrix(
  c(1, 1, 3,
    5, 2, 6,
    -2, -1, -3),
  nrow = 3,
  byrow = TRUE
)
# Doubles every odd entry of `mx` and leaves the even entries unchanged.
fun <- function(mx) {
  is_even <- mx %% 2 == 0
  ifelse(is_even, mx, 2 * mx)
}
res <- fun(mx)
res
#Answer (14)
num <- c(19)
#num=as.numeric(readline(prompt = "Enter The number: "))

# Prints whether `num` is a prime number.
#
# num: a single finite number.
# Returns (invisibly, via print) the message that was printed.
is.prime <- function(num) {
  stopifnot(is.numeric(num), length(num) == 1L, is.finite(num))
  if (num < 2 || num != floor(num)) {
    # Primes are whole numbers >= 2; fractions such as 2.5 are rejected here.
    print(" Number is not Prime")
  } else if (num < 4) {
    # 2 and 3: handled separately so the divisor range below never runs backwards.
    print(" Number is Prime")
  } else if (any(num %% 2:floor(sqrt(num)) == 0)) {
    # A composite number always has a divisor no larger than its square root.
    print(" Number is not Prime")
  } else {
    print(" Number is Prime")
  }
}
is.prime(num)
#Answer (15)
# Work on a copy of the built-in airquality data set.
data <- airquality
print(class(data))
# Sort by the first column and break ties with the second, as the message
# below states; order() places NA values last.
result <- data[order(data[, 1], data[, 2]), ]
print("Order the entire data frame by the first and second column:")
print(result)
#Answer (16)
data(mtcars)
head(mtcars)
#16)a
print("Original dataframe:")
print(mtcars)
print("Structure of the said data frame:")
# str() prints the structure itself and returns NULL, so wrapping it in
# print() would add a stray "NULL" line after the listing.
str(mtcars)
#16)b
print("Original dataframe:")
print(mtcars)
print("Statistical summary and nature of the data of the said dataframe:")
print(summary(mtcars))
head(mtcars)
#16)c
print(" to extract specific column from a data frame using column name:")
data.frame(mtcars$mpg)
data.frame(mtcars$mpg, mtcars$gear)
#16)d
print("Original dataframe:")
print(mtcars)
print("Extract first two rows:")
result <- mtcars[1:2, ]
print(result)
#16)e
print("Original dataframe:")
print(mtcars)
print("Extract 3rd and 5th rows with 1st and 3rd columns :")
result <- mtcars[c(3, 5), c(1, 3)]
print(result)
#16)f
print("Original dataframe:")
print(mtcars)
print("New data frame after adding the 'country' column:")
# One label per row, 32 in total. Every string is kept on a single line: a
# literal that wraps onto the next line embeds a newline in the value
# (e.g. "In\ndia").
mtcars$country <- c(
  "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA",
  "India", "USA", "India", "USA", "India", "USA", "India", "USA", "USA", "USA",
  "India", "USA", "India", "USA", "India", "USA", "India", "USA", "India", "USA",
  "India", "USA"
)
print(mtcars)
#16)g
# NOTE(review): the statement for this part is cut off after its opening
# parenthesis, which makes the rest of the script unparseable. It is kept as
# a comment until the intended contents of `Alto` are restored.
# Alto = data.frame(
#16)h
print("Original dataframe:")
print(mtcars)
print("After removing col(s) to an existing data frame:")
# A negated selection keeps every column except cyl and qsec.
mtcars <- subset(mtcars, select = -c(cyl, qsec))
print(mtcars)
#16)i
print("Original dataframe:")
print(mtcars)
print("After removing row(s) to an existing data frame:")
# Negative indices drop rows 33 and 34; indices beyond the last row are
# ignored, so nothing is dropped if the data frame has fewer rows.
print(mtcars[c(-33, -34), ])
#16)j
print("to sort a given data frame by multiple column(s):")
#16)k
print("Original dataframe:")
print(mtcars)
print("Change column-name 'mpg' to 'MPG' of the said dataframe:")
# Rename through a logical mask over the column names.
names(mtcars)[names(mtcars) == "mpg"] <- "MPG"
print(mtcars)
#Answer 17)
library(tidyverse)
# NOTE(review): df1 and df2 are not created in the visible part of this
# file -- confirm where they are loaded.
df1
# Rows 1-5 and columns 2-4 of df1.
subset1 <- df1[1:5, 2:4]
subset1
# Rows 6-10 and columns 1-3 of df2.
subset2 <- df2[6:10, 1:3]
subset2
subset1
names(subset1) <- c("caste", "10k", "20k")
subset1
subset2
names(subset2) <- c("ID", "caste", "10k")
subset2
# all = TRUE keeps the rows of both inputs, matched or not.
md <- merge(subset1, subset2, all = TRUE)
md
# Drop the ID column from the merged result.
md$ID <- NULL
head(md)
write.csv(md, file = "myd.csv")
#Answer 18)
# Element-wise evaluation of (1/3) * pi * R^2 * H, which matches the
# cone-volume formula.
# NOTE(review): R and H are not defined in the visible part of this file;
# presumably radius and height vectors loaded earlier -- confirm.
volumes<-c(1/3*pi*R^2*H)
volumes
#Answer 19)
# Mean, median and standard deviation of the computed values.
mean(volumes)
median(volumes)
sd(volumes)
# Keep only the entries of `volumes` whose matching H value is below 8.5.
vol<- subset(volumes,H<8.5)
vol
mean(vol)
#Answer 20)
# Read the data from rainforest.csv in the working directory.
rainforest <- read.csv(file = "rainforest.csv", header = TRUE)
rainforest
# Rows whose species is "Acmena smithii".
Acmena <- subset(rainforest, species == "Acmena smithii")
Acmena
# Row permutation that sorts those rows by ascending dbh.
order1 <- order(Acmena$dbh)
order1
Statistics Assignment in R
Q1. Consider the data from Brendon, Jason, Melissa, Paula, and McGuirk. For each
answer, indicate how you know, when appropriate, by reporting the values of the statistic you
are using or other information you used.
install.packages("psych")
library(psych)
install.packages("PerformanceAnalytics")
library(PerformanceAnalytics)
install.packages("DescTools")
library(DescTools)
install.packages("ggplot2")
library(ggplot2)
install.packages("rcompanion")
library(rcompanion)
# Whitespace-separated table of the Q1 data; instructor names contain a
# space and are therefore quoted.
Input <- "
Instructor Grade Weight Calories Sodium Score
'Brendon Small' 6 43 2069 1287 77
'Brendon Small' 6 41 1990 1164 76
'Brendon Small' 6 40 1975 1177 76
'Brendon Small' 6 44 2116 1262 84
'Brendon Small' 6 45 2161 1271 86
'Brendon Small' 6 44 2091 1222 87
'Brendon Small' 6 48 2236 1377 90
'Brendon Small' 6 47 2198 1288 78
'Brendon Small' 6 46 2190 1284 89
'Jason Penopolis' 7 45 2134 1262 76
'Jason Penopolis' 7 45 2128 1281 80
'Jason Penopolis' 7 46 2190 1305 84
'Jason Penopolis' 7 43 2070 1199 68
'Jason Penopolis' 7 48 2266 1368 85
'Jason Penopolis' 7 47 2216 1340 76
'Jason Penopolis' 7 47 2203 1273 69
'Jason Penopolis' 7 43 2040 1277 86
'Jason Penopolis' 7 48 2248 1329 81
'Melissa Robins' 8 48 2265 1361 67
'Melissa Robins' 8 46 2184 1268 68
'Melissa Robins' 8 53 2441 1380 66
'Melissa Robins' 8 48 2234 1386 65
'Melissa Robins' 8 52 2403 1408 70
'Melissa Robins' 8 53 2438 1380 83
'Melissa Robins' 8 52 2360 1378 74
'Melissa Robins' 8 51 2344 1413 65
'Melissa Robins' 8 51 2351 1400 68
'Paula Small' 9 52 2390 1412 78
'Paula Small' 9 54 2470 1422 62
'Paula Small' 9 49 2280 1382 61
'Paula Small' 9 50 2308 1410 72
'Paula Small' 9 55 2505 1410 80
'Paula Small' 9 52 2409 1382 60
'Paula Small' 9 53 2431 1422 70
'Paula Small' 9 56 2523 1388 79
'Paula Small' 9 50 2315 1404 71
'Coach McGuirk' 10 52 2406 1420 68
'Coach McGuirk' 10 58 2699 1405 65
'Coach McGuirk' 10 57 2571 1400 64
'Coach McGuirk' 10 52 2394 1420 69
'Coach McGuirk' 10 55 2518 1379 70
'Coach McGuirk' 10 52 2379 1393 61
'Coach McGuirk' 10 59 2636 1417 70
'Coach McGuirk' 10 54 2465 1414 59
'Coach McGuirk' 10 54 2479 1383 61
"
# read.table(text = ) parses the string directly. Passing an explicit
# textConnection() instead would leave that connection open afterwards.
Data <- read.table(text = Input, header = TRUE)
Data
head(Data)
output:-
> head(Data)
Instructor Grade Weight Calories Sodium Score
1 Brendon Small 6 43 2069 1287 77
2 Brendon Small 6 41 1990 1164 76
3 Brendon Small 6 40 1975 1177 76
4 Brendon Small 6 44 2116 1262 84
5 Brendon Small 6 45 2161 1271 86
6 Brendon Small 6 44 2091 1222 87
# Keep the instructor levels in their order of first appearance rather than
# sorting them alphabetically.
instructor_levels <- unique(Data$Instructor)
Data$Instructor <- factor(Data$Instructor, levels = instructor_levels)
# Scatterplot matrix of the five numeric variables.
pairs(~ Grade + Weight + Calories + Sodium + Score, data = Data)
Output:-
# Numeric columns only, for the correlation analysis.
Data.num <- Data[c("Grade", "Weight", "Calories", "Sodium", "Score")]
# Pairwise Pearson correlations with unadjusted p-values.
corr.test(Data.num, use = "pairwise", method = "pearson", adjust = "none")
output:-
> Data.num = Data[c("Grade", "Weight", "Calories", "Sodium", "Score")]
> corr.test(Data.num,
+ use = "pairwise",
+ method = "pearson",
+ adjust = "none")
Call:corr.test(x = Data.num, use = "pairwise", method = "pearson",
adjust = "none")
Correlation matrix
Grade Weight Calories Sodium Score
Grade 1.00 0.85 0.85 0.79 -0.70
Weight 0.85 1.00 0.99 0.87 -0.48
Calories 0.85 0.99 1.00 0.85 -0.48
Sodium 0.79 0.87 0.85 1.00 -0.45
Score -0.70 -0.48 -0.48 -0.45 1.00
Sample Size
[1] 45
Probability values (Entries above the diagonal are adjusted for multiple
tests.)
Grade Weight Calories Sodium Score
Grade 0 0 0 0 0
Weight 0 0 0 0 0
Calories 0 0 0 0 0
Sodium 0 0 0 0 0
Score 0 0 0 0 0
library(PerformanceAnalytics)
# Correlation chart of the numeric columns (Pearson), with histograms enabled.
chart.Correlation(Data.num, method = "pearson", histogram = TRUE, pch = 16)
output:-
# Scatterplot of Sodium against Calories, drawn with filled points.
plot(Sodium ~ Calories, data = Data, pch = 16, xlab = "Calories", ylab = "Sodium")
output:-
# `model` is not created anywhere before this point, so it is fitted here;
# the formula and data match the lm() call echoed in the regression output
# further down (Sodium ~ Calories on Data).
model <- lm(Sodium ~ Calories,
            data = Data)
x <- residuals(model)
library(rcompanion)
# Inspect the distribution of the residuals.
plotNormalHistogram(x)
output:-
output:-
# Kendall rank correlation between Sodium and Calories.
cor.test(~ Sodium + Calories, data = Data, method = "kendall")
sample estimates:
tau
0.6490902
Call:
lm(formula = Sodium ~ Calories, data = Data)
Residuals:
Min 1Q Median 3Q Max
-83.263 -26.263 -0.486 29.973 64.714
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 519.07547 78.78211 6.589 5.09e-08 ***
Calories 0.35909 0.03409 10.534 1.74e-13 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Scatterplot of Sodium against Calories with the fitted line drawn on top.
plot(Sodium ~ Calories, data = Data, pch = 16, xlab = "Calories", ylab = "Sodium")
abline(model, col = "blue", lwd = 2)
output:-
# Inspect the distribution of the model residuals.
x <- residuals(model)
library(rcompanion)
plotNormalHistogram(x)
output:-
# Residuals plotted against fitted values.
plot(fitted(model), residuals(model))
output:-
a. Which two variables are the most strongly correlated?
Ans- Weight and Calories
Correlation: r = 0.99
c. Are there any pairs of variables that are statistically uncorrelated? Which?
Ans-There are no pairs of variables that are statistically uncorrelated
h. Does the quadratic polynomial model fit the Sodium vs. Calories data better than the
linear model? Consider the p-value, the r-squared value, the range of values for each
of Sodium and Calories, and your practical conclusions
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 519.07547 78.78211 6.589 5.09e-08 ***
Calories 0.35909 0.03409 10.534 1.74e-13 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Q2. As part of a professional skills program, a 4-H club tests its members for typing
proficiency (Words.per.minute), Proofreading skill, proficiency with using a Spreadsheet,
and acumen in Statistics.
# NOTE(review): the opening of this string literal and the call that parsed
# it were missing, leaving bare CSV text followed by a stray `")`. Both are
# reconstructed here; confirm against the original assignment.
Input <- "
Instructor,Grade,Words.per.minute,Proofreading,Spreadsheet,Statistics
'Dr. Katz',6,35,53,75,61
'Dr. Katz',6,50,77,24,51
'Dr. Katz',6,55,71,62,55
'Dr. Katz',6,60,78,27,91
'Dr. Katz',6,65,84,44,95
'Dr. Katz',6,60,79,38,50
'Dr. Katz',6,70,96,12,94
'Dr. Katz',6,55,61,55,76
'Dr. Katz',6,45,73,59,75
'Dr. Katz',6,55,75,55,80
'Dr. Katz',6,60,85,35,84
'Dr. Katz',6,45,61,49,80
'Laura',7,55,59,79,57
'Laura',7,60,60,60,60
'Laura',7,75,90,19,64
'Laura',7,65,87,32,65
'Laura',7,60,70,33,94
'Laura',7,70,84,27,54
'Laura',7,75,87,24,59
'Laura',7,70,97,38,74
'Laura',7,65,86,30,52
'Laura',7,72,91,36,66
'Laura',7,73,88,20,57
'Laura',7,65,86,19,71
'Ben Katz',8,55,84,20,76
'Ben Katz',8,55,63,44,94
'Ben Katz',8,70,95,31,88
'Ben Katz',8,55,63,69,93
'Ben Katz',8,65,65,47,70
'Ben Katz',8,60,61,63,92
'Ben Katz',8,70,80,35,60
'Ben Katz',8,60,88,38,58
'Ben Katz',8,60,71,65,99
'Ben Katz',8,62,78,46,54
'Ben Katz',8,63,89,17,60
'Ben Katz',8,65,75,33,77
"
# Comma-separated fields; read.table()'s default quote set includes the
# single quote, so the quotes around the instructor names are stripped.
data <- read.table(text = Input, header = TRUE, sep = ",")
data
head(data)
# Keep the instructor levels in their order of first appearance.
instructor_levels <- unique(data$Instructor)
data$Instructor <- factor(data$Instructor, levels = instructor_levels)
# Scatterplot matrix of the five numeric variables.
pairs(
  ~ Grade + Words.per.minute + Proofreading + Spreadsheet + Statistics,
  data = data
)
Output:-
# Rebuild Data.num from the Q2 data set: without this it still holds the Q1
# columns, while the output recorded below lists the Q2 variables.
Data.num <- data[c("Grade", "Words.per.minute", "Proofreading",
                   "Spreadsheet", "Statistics")]
# Pairwise Pearson correlations (the dangling trailing comma is removed).
corr.test(Data.num,
          use = "pairwise",
          method = "pearson")
Output:-
Call:corr.test(x = Data.num, use = "pairwise", method = "pearson")
Correlation matrix
Grade Words.per.minute Proofreading Spreadsheet Statistics
Grade 1.00 0.24 0.05 -0.05 0.07
Words.per.minute 0.24 1.00 0.52 -0.45 -0.24
Proofreading 0.05 0.52 1.00 -0.79 -0.13
Spreadsheet -0.05 -0.45 -0.79 1.00 0.12
Statistics 0.07 -0.24 -0.13 0.12 1.00
Sample Size
[1] 36
# Correlation chart of the numeric columns (Pearson), with histograms enabled.
chart.Correlation(Data.num, method = "pearson", histogram = TRUE, pch = 16)
output:-
(i). What is the value of the correlation coefficient r for this correlation?
data=Data.num,
method = "pearson")
output:-
Pearson's product-moment correlation
Ans- r=(-0.793)
data=Data.num,
method = "kendall")
output:-
Kendall's rank correlation tau
Ans- tau=(-0.579)
data=Data.num,
method = "spearman")
output:-
Ans- rho=(-0.755)
# The first line of this call was missing, leaving only its last argument.
# It is restored from the lm() call echoed in the output that follows.
model <- lm(Proofreading ~ Words.per.minute,
            data = Data.num)
summary(model)
output:-
Call:
lm(formula = Proofreading ~ Words.per.minute, data = Data.num)
Residuals:
Min 1Q Median 3Q Max
-20.932 -8.960 1.568 7.913 17.561
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 64.4820 4.0977 15.736 < 2e-16 ***
Words.per.minute 2.4928 0.7103 3.509 0.00129 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Ans- 0.2659
(iii). Do the residuals suggest that the linear regression model is an appropriate model?
pch=16,
xlab = "Proofreading",
ylab = "Words.per.minute")
# Draw the fitted line on the current plot.
abline(model, col = "blue", lwd = 2)
# Inspect the distribution of the model residuals.
x <- residuals(model)
library(rcompanion)
plotNormalHistogram(x)
# Residuals plotted against fitted values.
plot(fitted(model), residuals(model))
iv. What can you conclude about the results of the linear regression? Consider the
p-value, the r-squared value, the range of values for each
of Proofreading and Words.per.minute, and your practical conclusions.
# The first line of this call was missing, leaving only its last argument.
# It is restored from the lm() call echoed in the output that follows.
model <- lm(Proofreading ~ Words.per.minute,
            data = Data.num)
summary(model)
output:-
Call:
lm(formula = Proofreading ~ Words.per.minute, data = Data.num)
Residuals:
Min 1Q Median 3Q Max
-20.932 -8.960 1.568 7.913 17.561
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 64.4820 4.0977 15.736 < 2e-16 ***
Words.per.minute 2.4928 0.7103 3.509 0.00129 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1