Analysis Using Statistical: Introduction & Data Exploration
Analysis Using Statistical: Introduction & Data Exploration
PART I: Analysis
Introduction & using
Data Exploration
5th December 2020
Nur Saadah Binti Abd Majid 10am-1.00pm
10am-11.30am Via Microsoft Teams
Course Outline
Importing Dataset
Learn how to import data of different types, such as csv, text, Stata and Excel.
Understanding the dataset
Use the basic built-in functions to get knowledge of the data.
setwd("<working path>")
OR
File → Change dir… → <Find your folder>
Importing the dataset
Dataset in text file
data<-read.table("mydata.txt",sep=";")
Dataset in csv file
data<-read.csv("mydata.csv", header=T, sep=",")
Dataset in Stata file
library(foreign)
data <- read.dta("mydata.dta")
Built-in dataset
Example of datasets
data(mtcars)
help(mtcars)
data(iris)
help(iris)
data()
Generating normally
distributed data
#1. density function
dnorm(0.5,mean = 0, sd = 1, log =
FALSE)
CHECKING THE
DUPLICATES
class(c)
class(data)
class(try1)
#to make try1 become data.frame
try1<-as.data.frame(try1)
class(try1)
Create data with Missing Values
#Find the MISSING VALUES
try1[c(3,6),1]<-NA
try1[c(10,15),2]<-NA
try1
complete.cases(try1)
mis.rownum<-which(complete.cases(try1)=="FALSE")
try1[mis.rownum,]
Check the
rows without
the missing
values
Sorting and Ordering the data
Sorting the vector automatically
Give the order number for each row
sort(b)
sort(b,decreasing=T)
order(b)
order(b,decreasing=T)
try1<-cbind(a,b)
Extract the data by order
try1<-as.data.frame(try1)
try1[order(b),]
If else
Dataset in a vector type
avr<-mean(data[,2])
if (data[5,2]>avr) {
print("Larger than mean")
} else {
print("Smaller than mean")
}
Is the value assigned larger than the mean of the vector?
Yes: "Larger than mean"
No: "Smaller than mean"
Looping
i-th observation of dataset in a vector type
Is the i-th value larger than the mean of the vector?
Yes: "Larger than mean"
No: "Smaller than mean"
for(i in 1:nrow(data)){
if (data[i,2]>avr) {
print("Larger than mean")
} else {
print("Smaller than mean")
}
}
The i value is increased by 1, creating a new value of i
Is the i value larger than the max value?
No: (the loop continues)
Yes: Terminate
Data Exploration &
Plottings
SCATTER PLOT
help(mtcars)
summary(mtcars)
dim(mtcars)
plot(mtcars[,1],mtcars[,3],
ylim=c(min(mtcars[,3]),max(mtcars[,3])),
xlim=c(min(mtcars[,1]),max(mtcars[,1])),
ylab="Displacement (cu.in.)",xlab="Miles/US gallon",
main="The Displacement against Miles/US gallon")
400
300
abline(v=mean(mtcars[,1]),
lty=2, col="Red")
200
abline(h=mean(mtcars[,3]),
lty=5, col="Blue")
100
points(mtcars[1:5,1],mtcars[1:5,3],col="Purple")
10 15 20 25 30
Miles/US gallon
Save as metafile
Save as bitmap
Data Exploration &
Plottings
LINE PLOT
plot(sunspots[1:200],main="Monthly averaged sunspots
from 1749–1983",xaxt="n",xlab="Month order",
ylab="Sunspots number")
lines(sunspots[1:200],lty=1,col="Purple")
lines(sunspots[1:200]+20,lty=1,col="Blue")
axis(1,1:200,1:200,las=2)
HISTOGRAM
hist(mtcars[,1],xlab = "Miles/US gallon",col = "pink",border = "red",
main="Miles/US gallon", breaks = 10)
Data Exploration &
Plottings
BOX PLOT
boxplot(mtcars[,1],main="Miles/US gallon", horizontal=T)
boxplot(mtcars[,3],main=
"Displacement (cu.in.)")
par(mfrow=c(2,1))
boxplot(mtcars[,1],main=
"Miles/US gallon", horizontal=T)
boxplot(mtcars[,3],main=
"Displacement (cu.in.)",
horizontal=T)
par(mfrow=c(1,2))
boxplot(mtcars[,1],main=
"Miles/US gallon" )
boxplot(mtcars[,3],main=
"Displacement (cu.in.)")
Data Exploration &
Plottings
QQPLOT
qqnorm(mtcars[,1],
pch = 1, frame =
FALSE)
qqline(mtcars[,1],
col = "blue", lwd =
2)
Data Exploration &
Plottings
CORRPLOT
library(corrplot)
corrplot(cor(
mtcars[,1:5]),method
="circle")
corrplot(cor(
mtcars[,1:5]),method
="numbers") #errors make you learn
corrplot(cor(
mtcars[,1:5]),method
="square")
Save figure as Pdf
pdf("myplot.pdf")
boxplot(mtcars[,3],
main="Displacement (cu.in.)")
dev.off()
Dataset in csv file
write.csv(mydata, "mydata.csv")
Dataset in Stata file
library(foreign)
write.dta(mydata, "mydata.dta")
EXERCISE
1. Please import the data named “yourdata” using read.csv
command.