Data Visualization Analysis with R
Data Visualization Analysis with R
(v. 2.0)
Oscar Torres-Reyna
[email protected]
DSS/OTR 2
While different fields of study have
developed their own way of visualizing data,
the common goal across all types of visuals
for data analysis is to find meaningful
patterns through trends, relationships, or
distribution.
See the following site
https://www.r-graph-gallery.com/
DSS/OTR 3
The data on the right shows a snapshot
of the unemployment rate and
presidential approval in the United
States from 1948 to 2019.
In table form it is hard to discern
anything from the data, except that
it represents indicators measured over
time, which makes it an ideal
candidate for a time series graph.
DSS/OTR 4
We will be using RStudio. Please see the
following document for an introduction to
its interface:
https://dss.princeton.edu/training/RStudio101.pdf
DSS/OTR 5
We shall start by activating some of the R
packages we will need for this document:
library(zoo)
library(ggplot2)
library(stargazer)
DSS/OTR 6
The data is in *.csv format; we can use the read.csv() function to
import it into R:
# Download the CSV and load it into a data frame.
# The first row supplies the column names (header = TRUE) and text columns
# are kept as character vectors rather than factors.
# The address is the plain site URL; the proxy prefix that had been wrapped
# around it pointed the request at a different host.
mydata <- read.csv(
  "http://www.princeton.edu/~otorres/mydataviz.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
DSS/OTR 7
# Parse the month/day/four-digit-year strings in `date` into Date values,
# then derive a year-month index (`month`) from them with as.yearmon().
mydata[["date"]] <- as.Date(mydata[["date"]], format = "%m/%d/%Y")
mydata[["month"]] <- as.yearmon(mydata[["date"]])
DSS/OTR 9
# Line chart of the unemployment rate (`unemp`) against `month`, via ggplot2.
# The y axis is limited to 0-12 with a break at every whole number.
ggplot(mydata, aes(x = month, y = unemp)) +
  geom_line() +
  scale_y_continuous(breaks = 0:12, limits = c(0, 12)) +
  labs(
    x = "Monthly",
    y = "Unemployment rate (%)",
    title = "US Unemployment rate, 1948-2019 (monthly)",
    caption = "Source: Unemployment data from the BLS"
  )
DSS/OTR 10
We can add an analytic component to the visual by adding context to the
trends. In the next slides we will see the unemployment trends by presidential
term.
# Begin date
# End date
# NOTE(review): no `start` / `end` definitions are visible under the two
# headers above, although the later plots read both -- confirm they exist.
# Order
# Fill colours consumed by the annotate("rect", fill = order) layer of the
# later plots, one entry per rectangle. Presumably blue/red mirror the
# Democrat/Republican line colours -- confirm.
# The variable shares its name with base::order(); calls to order() still
# resolve to the function.
order <- c(
  "blue", "red", "blue", "blue", "red", "red", "blue",
  "red", "red", "blue", "red", "blue", "red"
)
DSS/OTR 11
# Split unemployment into one column per party: each column carries `unemp`
# on rows where `party` matches and NA on every other row.
mydata[["unempd"]] <- ifelse(
  mydata[["party"]] == "Democrat", mydata[["unemp"]], NA
)
mydata[["unempr"]] <- ifelse(
  mydata[["party"]] == "Republican", mydata[["unemp"]], NA
)
# Unemployment by party with context layers:
#   * blue line = `unempd`, red line = `unempr`
#   * a solid vertical rule and an x-axis break at every value of `start`
#   * translucent rectangles from `start` to `end`, filled with `order`
#   * rotated bold text labels taken from `terms` (columns month, y, name)
# `start`, `end`, `order` and `terms` must exist before this runs.
# NOTE(review): `start`, `end` and `terms` are not defined in the visible
# part of this document -- confirm.
# The stray slide text that used to sit in the middle of this chain has been
# removed so the expression parses as a single ggplot call.
ggplot(data = mydata, aes(x = month)) +
  geom_line(aes(y = unempd), color = "blue", size = 1) +
  geom_line(aes(y = unempr), color = "red", size = 1) +
  theme_bw() +
  labs(
    title = "US Unemployment rate, 1948-2019 (monthly)",
    y = "Unemployment rate (%)",
    x = "Monthly",
    caption = "Source: Unemployment data from the BLS"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_y_continuous(limits = c(0, 12), breaks = seq(0, 12)) +
  scale_x_yearmon(breaks = as.yearmon(start)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_vline(
    xintercept = as.yearmon(start),
    linetype = "solid",
    size = 0.5,
    color = "black"
  ) +
  # ymin/ymax of -Inf/Inf stretch each rectangle over the full panel height.
  annotate(
    "rect",
    xmin = as.yearmon(start),
    xmax = as.yearmon(end),
    ymin = -Inf, ymax = Inf,
    fill = order,
    alpha = 0.2
  ) +
  geom_text(
    data = terms,
    aes(x = month, y = y, label = name),
    angle = 270, hjust = 0, fontface = "bold"
  )
DSS/OTR 12
DSS/OTR 13
Adding presidential
approval
See next page
# Unemployment by party plus presidential approval on a secondary y axis.
# Approval is divided by 7.997222 so it can share the 0-12 primary axis; the
# secondary axis multiplies by the same factor to show approval percentages.
# NOTE(review): 7.997222 looks like a data-derived ratio between the two
# scales -- confirm how it was obtained.
# The horizontal rule at 50 / 7.997222 marks 50 on the approval axis.
# `start`, `end`, `order` and `terms` must exist before this runs.
# NOTE(review): `start`, `end` and `terms` are not defined in the visible
# part of this document -- confirm.
ggplot(data = mydata, aes(x = month)) +
  geom_line(aes(y = unempd), color = "blue", size = 1) +
  geom_line(aes(y = unempr), color = "red", size = 1) +
  theme_bw() +
  labs(
    title = "US Unemployment rate and Presidential Approval, 1948-2019 (monthly)",
    y = "Unemployment rate (%)",
    x = "Monthly",
    caption = "Source: Unemployment data from the BLS, Presidential Approval from ROPER"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_yearmon(breaks = as.yearmon(start)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_vline(
    xintercept = as.yearmon(start),
    linetype = "solid",
    size = 0.5,
    color = "black"
  ) +
  # ymin/ymax of -Inf/Inf stretch each rectangle over the full panel height.
  annotate(
    "rect",
    xmin = as.yearmon(start),
    xmax = as.yearmon(end),
    ymin = -Inf, ymax = Inf,
    fill = order,
    alpha = 0.2
  ) +
  geom_hline(yintercept = 50 / 7.997222) +
  geom_text(
    data = terms,
    aes(x = month, y = y, label = name),
    angle = 270, hjust = 0, fontface = "bold"
  ) +
  geom_line(aes(y = approve / 7.997222)) +
  # Single y scale: adding a second scale_y_continuous() would only replace
  # the first one. `name` is passed as a named argument so no stray `name`
  # variable is created in the calling environment.
  scale_y_continuous(
    limits = c(0, 12),
    breaks = seq(0, 12),
    sec.axis = sec_axis(~ . * 7.997222, name = "Presidential Approval (%)")
  )
DSS/OTR 14
DSS/OTR 15
The previous visual does not provide a clear idea of the relationship between unemployment
rates and presidential approval. Scatterplots are ideal plots to find relationships between
variables. The following code produces a scatterplot using base R:
# Base-graphics scatterplot: unemployment on x, presidential approval on y.
with(
  mydata,
  plot(
    x = unemp,
    y = approve,
    xlab = "Unemployment rate (%)",
    ylab = "Presidential Approval (%)",
    main = "US Unemployment rate and Presidential Approval, 1948-2019 (monthly)"
  )
)
DSS/OTR 16
The car package provides an informative scatterplot including a linear and loess fit with
boxplots
# Scatterplot of approval against unemployment drawn with car's scatterplot().
# The call is namespace-qualified because the packages attached earlier in
# this document (zoo, ggplot2, stargazer) do not include car.
car::scatterplot(
  approve ~ unemp,
  data = mydata,
  main = "US Unemployment rate and Presidential Approval, 1948-2019 (monthly)",
  xlab = "Unemployment rate (%)",
  ylab = "Presidential Approval (%)"
)
DSS/OTR 17
Per group
# Same scatterplot, conditioned on `party` (the `| party` term in the
# formula), with one colour per group and a titled legend at the top right.
# The call is namespace-qualified because the packages attached earlier in
# this document (zoo, ggplot2, stargazer) do not include car.
car::scatterplot(
  approve ~ unemp | party,
  data = mydata,
  main = "US Unemployment rate and Presidential Approval, 1948-2019 (monthly)",
  xlab = "Unemployment rate (%)",
  ylab = "Presidential Approval (%)",
  col = c("blue", "red"),
  legend = c(title = "Party", coords = "topright")
)
DSS/OTR 18
Scatterplots using ggplot()
# ggplot2 scatterplot of approval against unemployment; points are grouped
# and coloured by party, with an untitled legend placed under the panel.
ggplot(
  mydata,
  aes(x = unemp, y = approve, color = factor(party), group = factor(party))
) +
  geom_point(size = 3) +
  labs(
    x = "Unemployment rate (%)",
    y = "Presidential Approval (%)",
    title = "US Unemployment rate and Presidential Approval, 1948-2019 (monthly)",
    caption = "Source: Unemployment data from the BLS, Presidential Approval from ROPER"
  ) +
  scale_color_manual(values = c("blue", "red"), name = "") +
  theme(legend.position = "bottom")
DSS/OTR 19
Adding a linear fit per party
# Party-coloured scatterplot with a fitted lm() smoother; because the points
# are grouped by party, stat_smooth() draws one fit per party.
ggplot(
  mydata,
  aes(x = unemp, y = approve, color = factor(party), group = factor(party))
) +
  geom_point(size = 3) +
  labs(
    x = "Unemployment rate (%)",
    y = "Presidential Approval (%)",
    title = "US Unemployment rate and Presidential Approval, 1948-2019 (monthly)",
    caption = "Source: Unemployment data from the BLS, Presidential Approval from ROPER"
  ) +
  scale_color_manual(values = c("blue", "red"), name = "") +
  theme(legend.position = "bottom") +
  stat_smooth(method = lm)
DSS/OTR 20
We can estimate the equation of the linear fit per party by using the lm()
function and produce a nice presentation using stargazer().
The file reg.html is saved in the working directory and can be opened with Word.