# FE418_RLectureNotes1
# Setting the working directory
# setwd("C:\Users\Erhan\Desktop\FE418")
# Error! Inside an R string the backslash starts an escape sequence, so a
# Windows path pasted as-is is rejected ("\U" is read as a Unicode escape).
# You can copy and paste a path to a directory from your computer:
#   \path\to\a directory\...
# For example:
#   C:\Desktop\newdirectory
#   C:\Users\Erhan\Desktop\FE418
# but replace every backslash with a forward slash (or double it: "\\").
course_dir <- "C:/Users/Erhan/Desktop/FE418"
if (dir.exists(course_dir)) {
  setwd(course_dir)
} else {
  # Keep the script runnable on machines where this folder does not exist
  message("Working directory left unchanged; folder not found: ", course_dir)
}
# setwd("/../FE418")
# Basic arithmetic: R evaluates an expression and prints the result
10 * 10

# Calling a function: the argument goes inside the parentheses
sqrt(4)

# Operator precedence follows PEMDAS:
# Parentheses, Exponents, Multiplication, Division, Addition, Subtraction
4 * 6 + 5
4 * (6 + 5)
# Variable assignment
# The preferred assignment operator is the left arrow
x <- 2
x

# Unpreferred way: the equals sign also assigns, but <- is the convention
# y=5
y = 5
y

# The arrow can point the other way; the value always flows towards the name.
# (Writing `3 <- z` is an error: a number cannot be the assignment target.)
3 -> z
z

# Several variables can receive the same value in one statement
a <- b <- 7
a
b

# assign() takes the variable name as a string
assign("j", 4)
j

# rm() removes a variable; using it afterwards is an error
rm(j)
# j  # Error: object 'j' not found

# ls() lists the objects currently defined in the workspace
ls()

# rm(list = ls()) removes every object on that list (can NOT be undone!)
rm(list = ls())
# Integers: the L suffix makes a whole number an integer.
# Dividing one integer by another gives a numeric (decimal) result.
class(5L)
class(2L)
5L / 2L
class(5L / 2L)
# Character data
# R stores text in two ways: as character or as factor

# a plain character string
x <- "data"
x

# the same text stored as a factor
y <- factor("data")
y
# (factors are covered in more detail in the vector section)
# Dates: Date stores just a date, while POSIXct stores a date and a time.
# Both objects are represented as a number counted from January 1, 1970:
# the number of days (Date) or seconds (POSIXct).
Sys.Date()

# date() returns the current date and time as a plain character string
date()
x <- date()
x
class(x)
nchar(x)

# NOTE(review): the assignments that created date1, date2 and date3 are
# missing from these notes, so every line using them failed with
# "object not found". The values below are placeholders - confirm them
# against the original lecture.
date1 <- as.Date("2019-02-26")
date2 <- as.POSIXct("2019-02-26 17:42", tz = "UTC")
date3 <- as.POSIXct("2019-02-26 17:42:30", tz = "UTC")
class(date1)
as.numeric(date1)
date3
as.numeric(date2)
as.numeric(date3)

# To get the Date for a number no extra package (such as zoo) is needed:
# base R converts a day count once it is told which origin the count starts
# from (the origin argument may be omitted only in R >= 4.3).
date4 <- as.Date(17953, origin = "1970-01-01")
date4

# A number written as YYYYMMDD is converted via a string and a format
as.Date(as.character(20200303), format = "%Y%m%d")
# Logicals: logicals are a way of representing data that can be either TRUE
# or FALSE. Numerically, TRUE is the same as 1 and FALSE the same as 0.
TRUE * 5
FALSE * 5

k <- TRUE
k
class(k)
is.logical(k)

# A quoted "TRUE" is a character string, not a logical
l <- "TRUE"
class(l)
is.logical(l)

# Comparisons produce logicals
n <- (2 == 3)
n
# NOTE(review): the assignment that created m is missing from these notes, so
# the arithmetic below failed with "object 'm' not found". The value here is
# a placeholder - confirm against the original lecture.
m <- (2 != 3)
m

# Arithmetic on logicals treats them as 1 and 0
n + m
n * m
n - m
# Vectors
# A vector is a collection of elements, all of the SAME type, for example
# c(1, 3, 2, 5, 1) or c("R", "Excel", "SAS", "minitab").
# They are different from mathematical vectors (no column or row vector).
# "c" stands for combine.

# Mixing types coerces every element to one common type (here: character)
y <- c(1, "A", 2)
y
class(y)
# y + 3  # Error: non-numeric argument to binary operator (y is character)
# The : operator is a shortcut for consecutive numbers
1:10

# seq() is the general form: start, end, step
seq(1, 10, 1)
seq(1, 10, 2)
seq(1, 10, 0.5)
# seq(10, 1, 1)  # counting down with a positive step is an error
seq(from = 1, to = 10, by = 1)
seq(from = 10, to = 1, by = -1)
seq(10, 1, -1)

# : also counts downwards and through negative numbers
10:1
-2:3
5:-7

# Two vectors of equal length
x <- 1:10
y <- -5:4
x
y

# Arithmetic is applied element by element
x + y
x - y
x * y
x / y
x^y

# Lengths of the operands and of the result
length(x)
length(y)
length(x + y)

# any() / all(): is the comparison TRUE for at least one / for every element?
any(x < y)
all(x < y)
any(x > y)
all(x > y)
# Character vectors
# NOTE(review): the assignment that created q is missing at this point in the
# notes (without it, q refers to base R's quit function and nchar(q) fails).
# It is restored here with the same values the notes assign to q in the
# data.frame section - confirm against the original lecture.
q <- c("Hockey", "Football", "Baseball", "Curling", "Rugby", "Lacrosse",
       "Basketball", "Tennis", "Cricket", "Soccer")
class(q)
q
# nchar counts the characters of each element; it also accepts numbers
nchar(q)
y
nchar(y)
length(q)

# A vector definition may span several lines
a <- c("a",
       "b",
       "c")

# Accessing individual elements of a vector is done using square brackets
# ([ ]). The first element of x is retrieved by typing x[1], the first two
# elements by x[1:2] and nonconsecutive elements by x[c(1, 4)].
x[1]
q[1]
x[1:2]
x[5:7]
x[c(1, 4)]
# x[1, 4]  # Error: incorrect number of dimensions (a vector has one index)
q[1:2]
q[c(1, 4)]
q[8:10]
q[c(1, 5, 7, 9)]

# It is possible to give names to a vector either during creation or after
# the fact.
# provide a name for each element of a vector using a name-value pair
c(One = "a", Two = "y", Last = "r")
# create a vector
w <- 1:3
w
# name the elements
names(w) <- c("a", "b", "c")
w
class(w)

# Factor vectors
# Factors are an important concept in R, especially when building models
# (such as statistical, machine learning, etc.).
# Let's create a simple vector of text data that has a few repeats. We will
# start with the vector q we created earlier and add some elements to it.
# NOTE(review): the lines creating q2 and q2Factor are missing from these
# notes; the repeated values below are placeholders - confirm against the
# original lecture.
q2 <- c(q, "Hockey", "Lacrosse", "Hockey", "Water Polo", "Hockey", "Lacrosse")
q2Factor <- as.factor(q2)
q2Factor
# Notice that after printing out every element of q2Factor, R also prints the
# levels of q2Factor. The levels of a factor are the unique values of that
# factor variable. Technically, R is giving each unique value of a factor a
# unique integer tying it back to the character representation. This can be
# seen with as.numeric.
as.numeric(q2Factor)
# Calling functions
mean(x)

# Function documentation
# ? opens the help page of a function; operators must be wrapped in backticks
?`+`
?`*`
?`==`

# There are occasions when we have only a sense of the function we want to
# use. In that case we can look up the function by using part of the name
# with apropos.
apropos("mea")
## Missing data
## Missing data plays a critical role in both statistics and computing, and R
## has two types of missing data, NA and NULL. While they are similar, they
## behave differently and that difference needs attention.

# NA concept
# Often we will have data that has missing values for any number of reasons.
# Statistical programs use varying techniques to represent missing data such
# as a dash, a period or even the number 99.
# R uses NA. NA will often be seen as just another element of a vector.
# is.na tests each element of a vector for missingness.
# NOTE(review): the assignment that created z is missing from these notes, so
# is.na(z) failed with "object 'z' not found". The values below are
# placeholders - confirm against the original lecture.
z <- c(1, 2, NA, 8, 3, NA, 3)
z
is.na(z)
zChar <- c("Hockey", NA, "Lacrosse")
zChar
is.na(zChar)

# NULL concept
# NULL is the absence of anything.
# It is not exactly missingness, it is nothingness. Functions can sometimes
# return NULL and their arguments can be NULL.
# An important difference between NA and NULL is that NULL is atomical and
# cannot exist within a vector. If used inside a vector it simply disappears.
# NOTE(review): this example is missing from the notes; it is restored from
# the description that follows it - confirm against the original lecture.
z <- c(1, NULL, 3)
z
length(z)
# Even though it was entered into the vector z, it did not get stored in z.
# In fact, z is only two elements long.

# is.null tests for NULL
d <- NULL
d
is.null(d)
is.null(7)
# DATA.FRAMES
# Perhaps one of the most useful features of R is the data.frame. It is one of
# the most often cited reasons for R's ease of use.
# On the surface a data.frame is just like an Excel spreadsheet in that it has
# columns and rows. In statistical terms, each column is a variable and each
# row is an observation.
# In terms of how R organizes data.frames, each column is actually a vector,
# each of which has the same length.
# That is very important because it lets each column hold a different type of
# data (see Section 4.3).
# This also implies that within a column each element must be of the same
# type, just like with vectors.
# There are numerous ways to construct a data.frame, the simplest being to use
# the data.frame function.
# Let's create a basic data.frame using some of the vectors we have already
# introduced, namely x, y and q.
# The three vectors that become the columns
x <- 10:1
y <- -4:5
q <- c("Hockey", "Football", "Baseball", "Curling", "Rugby", "Lacrosse",
       "Basketball", "Tennis", "Cricket", "Soccer")

# NOTE(review): the data.frame() call that produced the printout below is
# missing from these notes; it is restored from the surrounding text -
# confirm against the original lecture.
theDF <- data.frame(x, y, q)
theDF
#>     x  y          q
#> 1  10 -4     Hockey
#> 2   9 -3   Football
#> 3   8 -2   Baseball
#> 4   7 -1    Curling
#> 5   6  0      Rugby
#> 6   5  1   Lacrosse
#> 7   4  2 Basketball
#> 8   3  3     Tennis
#> 9   2  4    Cricket
#> 10  1  5     Soccer

# This creates a 10x3 data.frame consisting of those three vectors. Notice the
# names of theDF are simply the variables.
# We could have assigned names during the creation process, which is generally
# a good idea.
# NOTE(review): this call is also missing from the notes; the column names
# First, Second and Sport are the ones the later examples use - confirm.
theDF <- data.frame(First = x, Second = y, Sport = q)
class(theDF)
theDF

# A second example; NA marks a missing value inside a column
a <- 1:5
b <- -5:-1
c <- c(1, 2, 3, NA, 5)
df2 <- data.frame(a, b, c)
df2

# data.frames are complex objects with many attributes. The most frequently
# checked attributes are the number of rows and columns. Of course there are
# functions to do this for us: nrow and ncol.
# And in case both are wanted at the same time there is the dim function.
nrow(theDF)
ncol(theDF)
dim(theDF)

# names returns the column names; square brackets pick a single one
names(theDF)
names(theDF)[3]

# We can also check and assign the row names of a data.frame.
rownames(theDF)
#>  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"

# Usually a data.frame has far too many rows to print them all to the screen,
# so thankfully the head function prints out only the first few rows.
head(theDF, n = 7)
tail(theDF)
tail(theDF, 2)
# NOTE(review): in this section the notes contained pasted console output and
# ">" prompts as if they were code, and several commands were missing while
# their output remained. Output is kept below as "#>" comments; commands
# marked "restored" were inferred from that output - confirm against the
# original lecture.

# As we can with other variables, we can check the class of a data.frame using
# the class function.
class(theDF)
#> [1] "data.frame"

# The $ operator extracts a single column by name
theDF$Sport
class(theDF$Sport)

# Square brackets take the row position first and the column position second
theDF[3, 2]
#> [1] -2
theDF[10, 3]
#> [1] "Soccer"

# To specify more than one row or column use a vector of indices.
# row 3, columns 2 through 3
theDF[3, 2:3]
#>   Second    Sport
#> 3     -2 Baseball
theDF[10, 1:2]
#>    First Second
#> 10     1      5

# rows 3 and 5, column 2
# since only one column was selected it was returned as a vector
theDF[c(3, 5), 2]  # restored
#> [1] -2  0

# rows 3 and 5, columns 2 through 3
theDF[c(3, 5), 2:3]
#>   Second    Sport
#> 3     -2 Baseball
#> 5      0    Rugby

# To access an entire row, specify that row while not specifying any column.
# Likewise, to access an entire column, specify that column while not
# specifying any row.
# all of column 3
theDF[, 3]

# all of columns 2 through 3
theDF[, 2:3]  # restored
#>    Second      Sport
#> 1      -4     Hockey
#> 2      -3   Football
#> 3      -2   Baseball
#> 4      -1    Curling
#> 5       0      Rugby
#> 6       1   Lacrosse
#> 7       2 Basketball
#> 8       3     Tennis
#> 9       4    Cricket
#> 10      5     Soccer

# all of row 2
theDF[2, ]

# all of rows 2 through 4
theDF[2:4, ]
class(theDF[2:4, ])

# To access multiple columns by name, make the column argument a character
# vector of the names.
theDF[, c("First", "Sport")]  # restored
#>    First      Sport
#> 1     10     Hockey
#> 2      9   Football
#> 3      8   Baseball
#> 4      7    Curling
#> 5      6      Rugby
#> 6      5   Lacrosse
#> 7      4 Basketball
#> 8      3     Tennis
#> 9      2    Cricket
#> 10     1     Soccer

# Yet another way to access a specific column is to use its column name (or
# its number) either as second argument to the square brackets or as the only
# argument to either single or double square brackets.
class(theDF[, "Sport"])
#> [1] "character"

# just the "Sport" column
# this returns a one column data.frame
theDF["Sport"]
#>         Sport
#> 1      Hockey
#> 2    Football
#> 3    Baseball
#> 4     Curling
#> 5       Rugby
#> 6    Lacrosse
#> 7  Basketball
#> 8      Tennis
#> 9     Cricket
#> 10     Soccer
class(theDF["Sport"])
#> [1] "data.frame"

# just the "Sport" column
# this also returns a character vector
theDF[["Sport"]]
class(theDF[["Sport"]])
#> [1] "character"

# All of these methods have differing outputs. Some return a vector, some
# return a single-column data.frame.
# To ensure a single-column data.frame while using single-square brackets,
# there is a third argument: drop = FALSE.
# This also works when specifying a single column by number.
theDF[, "Sport", drop = FALSE]  # restored
#>         Sport
#> 1      Hockey
#> 2    Football
#> 3    Baseball
#> 4     Curling
#> 5       Rugby
#> 6    Lacrosse
#> 7  Basketball
#> 8      Tennis
#> 9     Cricket
#> 10     Soccer
class(theDF[, "Sport", drop = FALSE])  # restored
#> [1] "data.frame"

theDF[, 3, drop = FALSE]
#>         Sport
#> 1      Hockey
#> 2    Football
#> 3    Baseball
#> 4     Curling
#> 5       Rugby
#> 6    Lacrosse
#> 7  Basketball
#> 8      Tennis
#> 9     Cricket
#> 10     Soccer
class(theDF[, 3, drop = FALSE])  # restored
#> [1] "data.frame"