
Assignment-1

manpreetkaur

2024-01-23
# Load required libraries
library(dplyr)

##
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
##
##     filter, lag

## The following objects are masked from 'package:base':
##
##     intersect, setdiff, setequal, union

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.3.2

# Elephant population data
elephant_data <- data.frame(
  Country  = c("Kenya", "Tanzania", "Uganda", "Zimbabwe", "South Africa"),
  Pop_1979 = c(20000, 75000, 8000, 70000, 12000),
  Pop_1989 = c(18000, 60000, 7000, 50000, 10000),
  Pop_2007 = c(25000, 45000, 6000, 40000, 8000),
  Pop_2012 = c(28000, 40000, 5000, 35000, 7000)
)

# Function to calculate geometric mean
geom_mean <- function(x) exp(mean(log(x)))
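
As a quick sanity check, geom_mean() can be applied to a pair of hypothetical growth ratios; the values below are illustrative only and are not taken from the elephant data.

# Illustrative check: a 10% rise followed by a 10% fall (hypothetical ratios)
geom_mean(c(1.10, 0.90)) # ~0.995, i.e. a slight net decline per period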

# Calculate geometric mean for each period
mean_change_79_89 <- elephant_data %>%
  summarize(across(starts_with("Pop"), ~geom_mean(c(1, .[-1]/.[1])) - 1))

mean_change_89_07 <- elephant_data %>%
  summarize(across(starts_with("Pop"), ~geom_mean(c(1, .[-1]/.[1])) - 1))

mean_change_07_12 <- elephant_data %>%
  summarize(across(starts_with("Pop"), ~geom_mean(c(1, .[-1]/.[1])) - 1))

# Display results
print("Mean annual change from 1979 to 1989:")
## [1] "Mean annual change from 1979 to 1989:"

print(mean_change_79_89)

##    Pop_1979  Pop_1989   Pop_2007   Pop_2012
## 1 0.2579463 0.1487509 -0.2604786 -0.3970063

print("Mean annual change from 1989 to 2007:")

## [1] "Mean annual change from 1989 to 2007:"

print(mean_change_89_07)

##    Pop_1979  Pop_1989   Pop_2007   Pop_2012
## 1 0.2579463 0.1487509 -0.2604786 -0.3970063

print("Mean annual change from 2007 to 2012:")

## [1] "Mean annual change from 2007 to 2012:"

print(mean_change_07_12)

##    Pop_1979  Pop_1989   Pop_2007   Pop_2012
## 1 0.2579463 0.1487509 -0.2604786 -0.3970063
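
All three summaries above print the same row because the helper inside across() divides every population column by its first element (Kenya's value) rather than comparing consecutive census years. A minimal sketch of a per-period calculation, assuming the intended quantity is the geometric mean, across countries, of the population ratio between the two census years of each period:

# Sketch (assumption: "mean change" = geometric mean across countries of the
# ratio between the two census years of a period, minus 1)
period_change <- elephant_data %>%
  summarize(
    change_79_89 = geom_mean(Pop_1989 / Pop_1979) - 1,
    change_89_07 = geom_mean(Pop_2007 / Pop_1989) - 1,
    change_07_12 = geom_mean(Pop_2012 / Pop_2007) - 1
  )
print(period_change)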

# Load tidyr for pivot_longer()
library(tidyr)

# Create multiple boxplot graph
elephant_data_long <- elephant_data %>%
  pivot_longer(cols = starts_with("Pop"), names_to = "Year", values_to = "Population")

# Plot
ggplot(elephant_data_long, aes(x = Year, y = Population, fill = Country)) +
  geom_boxplot() +
  labs(title = "Elephant Populations Over Time",
       x = "Year", y = "Population") +
  theme_minimal()
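
Because each country contributes only one value per census year, the country-by-year boxes each collapse to a single point. A per-country trend line is an alternative view; a minimal sketch reusing elephant_data_long from above:

# Alternative view: one line per country across the four census years
ggplot(elephant_data_long, aes(x = Year, y = Population,
                               colour = Country, group = Country)) +
  geom_line() +
  geom_point() +
  labs(title = "Elephant Population Trends by Country",
       x = "Census year", y = "Population") +
  theme_minimal()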
# Q2
# Mean and standard deviation for web traffic
mean_traffic <- 4.5e6
std_dev_traffic <- 820000

# Probability calculations
prob_less_than_5m <- pnorm(5e6, mean = mean_traffic, sd = std_dev_traffic)
prob_3m_or_more <- 1 - pnorm(3e6, mean = mean_traffic, sd = std_dev_traffic)
prob_between_3m_4m <- pnorm(4e6, mean = mean_traffic, sd = std_dev_traffic) -
  pnorm(3e6, mean = mean_traffic, sd = std_dev_traffic)

# Web traffic requiring additional server capacity (85th percentile)
additional_capacity <- qnorm(0.85, mean = mean_traffic, sd = std_dev_traffic)

# Display results
print("Probability of fewer than 5 million visitors:")

## [1] "Probability of fewer than 5 million visitors:"

print(prob_less_than_5m)

## [1] 0.7289883

print("Probability of 3 million or more visitors:")

## [1] "Probability of 3 million or more visitors:"

print(prob_3m_or_more)
## [1] 0.9663203

print("Probability of between 3 million and 4 million visitors:")

## [1] "Probability of between 3 million and 4 million visitors:"

print(prob_between_3m_4m)

## [1] 0.237332

print("Web traffic requiring additional server capacity:")

## [1] "Web traffic requiring additional server capacity:"

print(additional_capacity)

## [1] 5349875
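
The same quantities can be cross-checked with standard-normal z-scores, using the mean and standard deviation defined above (pure arithmetic, no new assumptions):

# z-score cross-check of the results above
z_5m <- (5e6 - mean_traffic) / std_dev_traffic # ~0.61
pnorm(z_5m)                                    # matches prob_less_than_5m (~0.729)
mean_traffic + qnorm(0.85) * std_dev_traffic   # matches additional_capacity (~5,349,875)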

# Q3
# Load required libraries
library(dplyr)
library(cluster)
library(readxl) # For read_excel()

## Warning: package 'readxl' was built under R version 4.3.2

# Load dataset
bigblue <- read_excel("C:/Users/jagta/Downloads/BigBlue.xlsx")

# Print column names to check for case sensitivity
print(colnames(bigblue))

## [1] "EmployeeID" "UsageRate" "Recognition" "Leader"

# Normalize variables
normalized_data <- bigblue %>%
  mutate(
    UsageRate = (UsageRate - mean(UsageRate)) / sd(UsageRate),
    Recognition = (Recognition - mean(Recognition)) / sd(Recognition),
    Leader = (Leader - mean(Leader)) / sd(Leader)
  )
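
The three mutate() calls above are z-score standardizations; base R's scale() does the same thing. A minimal sketch of an equivalent transformation, using the column names from the printout above:

# Equivalent z-score standardization with scale() (sketch)
normalized_alt <- bigblue %>%
  mutate(across(c(UsageRate, Recognition, Leader), ~ as.numeric(scale(.))))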
# Determine number of clusters using the elbow method
wss <- numeric(10)
for (i in 1:10) {
  kmeans_model <- kmeans(normalized_data[, c("UsageRate", "Recognition", "Leader")],
                         centers = i)
  wss[i] <- sum(kmeans_model$withinss)
}

# Plot the elbow graph
plot(1:10, wss, type = "b",
     xlab = "Number of Clusters", ylab = "Within-cluster Sum of Squares")
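
kmeans() starts from randomly chosen centers, so repeated runs of the elbow loop can give different within-cluster sums of squares and different cluster labels. A minimal sketch of a reproducible version, assuming an arbitrary seed and multiple random starts are acceptable:

# Reproducible elbow loop (seed 123 and nstart = 25 are arbitrary choices,
# not part of the original assignment)
set.seed(123)
wss_repro <- sapply(1:10, function(k) {
  kmeans(normalized_data[, c("UsageRate", "Recognition", "Leader")],
         centers = k, nstart = 25)$tot.withinss
})
plot(1:10, wss_repro, type = "b",
     xlab = "Number of Clusters", ylab = "Within-cluster Sum of Squares")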

# Based on the elbow plot, let's assume the optimal number of clusters is 3
optimal_clusters <- 3

# Apply k-means clustering
kmeans_model <- kmeans(normalized_data[, c("UsageRate", "Recognition", "Leader")],
                       centers = optimal_clusters)

# Describe each cluster
cluster_summary <- as.data.frame(t(aggregate(
  . ~ kmeans_model$cluster,
  data = cbind(normalized_data, Cluster = kmeans_model$cluster),
  FUN = function(x) c(mean(x), min(x), max(x))
)))

# Display results
print("Number of Clusters:")

## [1] "Number of Clusters:"

print(optimal_clusters)

## [1] 3

print("Cluster Summary:")

## [1] "Cluster Summary:"


print(cluster_summary)

## V1 V2 V3
## kmeans_model$cluster 1.0000000 2.0000000 3.0000000
## EmployeeID.1 66.2962963 91.5000000 47.6578947
## EmployeeID.2 2.0000000 78.0000000 1.0000000
## EmployeeID.3 105.0000000 104.0000000 107.0000000
## UsageRate.1 1.3726029 1.5793215 -0.5707574
## UsageRate.2 0.5215694 0.7150272 -0.9831708
## UsageRate.3 1.9846656 2.1550943 0.6844744
## Recognition.1 0.8108851 2.7516181 -0.4328996
## Recognition.2 -0.4328996 1.8831133 -0.4328996
## Recognition.3 4.1991262 3.0411198 -0.4328996
## Leader.1 -0.1857172 4.7822168 -0.1857172
## Leader.2 -0.1857172 3.1262388 -0.1857172
## Leader.3 -0.1857172 6.4381947 -0.1857172
## Cluster.1 1.0000000 2.0000000 3.0000000
## Cluster.2 1.0000000 2.0000000 3.0000000
## Cluster.3 1.0000000 2.0000000 3.0000000
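
The transposed aggregate() table above is hard to read because every variable is split across three unnamed rows (mean, min, max). A dplyr summary of per-cluster means is an easier-to-read alternative; a minimal sketch, assuming the same kmeans_model fit:

# Per-cluster means on the normalized scale, plus cluster sizes (sketch)
normalized_data %>%
  mutate(Cluster = kmeans_model$cluster) %>%
  group_by(Cluster) %>%
  summarize(across(c(UsageRate, Recognition, Leader), mean), n = n())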
