Homework 1
import kagglehub
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
Part 1
1.1
• a) Load the dataset and display the first 5 rows.
# Download the dataset from Kaggle, then read the CSV (assumed to be in the working directory)
path = kagglehub.dataset_download("yasserh/titanic-dataset")
print("Path to dataset files:", path)

df = pd.read_csv('Titanic-Dataset.csv')  # read_csv already returns a DataFrame
print(df.head(5))
print(f"Number of rows in dataset: {df.shape[0]}, number of columns: {df.shape[1]}")
print(df.dtypes)
Number of rows in dataset: 891, number of columns: 12
PassengerId int64
Survived int64
Pclass int64
Name object
Sex object
Age float64
SibSp int64
Parch int64
Ticket object
Fare float64
Cabin object
Embarked object
dtype: object
1.2
• a) Generate summary statistics (mean, median, standard deviation, min, and max) for the
following columns:
– Age
– Fare
# Summary statistics for Age (kept numeric; format only when printing)
age_mean = df['Age'].mean()
age_median = df['Age'].median()
age_std = df['Age'].std()
age_min = df['Age'].min()
age_max = df['Age'].max()

# Summary statistics for Fare
fare_mean = df['Fare'].mean()
fare_median = df['Fare'].median()
fare_std = df['Fare'].std()
fare_min = df['Fare'].min()
fare_max = df['Fare'].max()
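As a cross-check, pandas can produce all five statistics in one call; a minimal sketch using the same df:

summary = df[["Age", "Fare"]].agg(["mean", "median", "std", "min", "max"])
print(summary.round(2))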
• b) What is the range of the Age and Fare columns? (Range = Max - Min)
age_range = age_max - age_min
fare_range = fare_max - fare_min
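A quick print of the computed ranges (the values come from the loaded data):

print(f"Age range: {age_range:.2f}")
print(f"Fare range: {fare_range:.2f}")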
• c) Identify which class (Pclass) had the highest average Fare. Provide the mean Fare for
each class.
mean_fare_by_class = df.groupby("Pclass")["Fare"].mean()
highest_fare_class = mean_fare_by_class.idxmax()
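To display the per-class means alongside the winning class, one might add:

print(mean_fare_by_class.round(2))
print(f"Pclass with the highest average Fare: {highest_fare_class}")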
1.3
• a) Count the number of passengers in each Pclass (1st, 2nd, 3rd).
pclass_counts = df["Pclass"].value_counts()

# survival_table is not defined in the export; assuming a crosstab of
# Pclass vs. Survived with margins, which supplies the "All" column used below
survival_table = pd.crosstab(df["Pclass"], df["Survived"], margins=True)
survival_percentage = survival_table.div(survival_table["All"], axis=0) * 100
Part 2
2.4
• a) Calculate the correlation coefficient between Age and Fare. Is the correlation positive,
negative, or negligible?
• b) Calculate the correlation between Pclass and Fare. Interpret the result—do higher
classes pay more on average?
correlation = df["Age"].corr(df["Fare"])
correlation_type = ("positive" if correlation > 0 else "negative" if
correlation < 0 else "negligible")
print(f"Correlation Coefficient between Age and Fare:
{correlation:.2f}")
print(f"The correlation is {correlation_type}.")
correlation = df["Pclass"].corr(df["Fare"])
if correlation < 0:
interpretation = "negative correlation, meaning higher classes
(lower Pclass values) pay more on average."
elif correlation > 0:
interpretation = "positive correlation, meaning higher classes
(higher Pclass values) pay more on average."
else:
interpretation = "no correlation."
# "outliers" is undefined in the export; assuming the 1.5*IQR rule, consistent with the boxplot below
Q1 = df["Fare"].quantile(0.25)
Q3 = df["Fare"].quantile(0.75)
IQR = Q3 - Q1
outliers = df[(df["Fare"] < Q1 - 1.5 * IQR) | (df["Fare"] > Q3 + 1.5 * IQR)]
num_outliers = outliers.shape[0]

plt.figure(figsize=(8, 5))
sns.boxplot(x=df["Fare"])
plt.title("Boxplot of Fare Column")
plt.xlabel("Fare")
plt.show()
• Based on the graph, the distribution is roughly unimodal but skewed: the presence of a long right tail suggests positive skewness.
plt.figure(figsize=(8, 5))
sns.histplot(df["Fare"], bins=10, kde=True, color="green")
plt.title("Histogram of Fare")
plt.xlabel("Fare")
plt.ylabel("Frequency")
plt.show()
age_skewness = df["Age"].skew()
fare_skewness = df["Fare"].skew()
if fare_skewness > 0:
fare_interpretation = "positively skewed (right-skewed)"
elif fare_skewness < 0:
fare_interpretation = "negatively skewed (left-skewed)"
else:
fare_interpretation = "approximately symmetric"
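The outputs that follow belong to a separate exercise on a list of ages whose defining code is missing from the export. An assumed reconstruction, consistent with every statistic printed below (mean 29.96, median 25, modes 25 and 35, midrange 41.5), would be:

# Assumed 27-value age list; not in the original export
ages = np.array([13, 15, 16, 16, 19, 20, 20, 21, 22, 22, 25, 25, 25, 25,
                 30, 33, 33, 35, 35, 35, 35, 36, 40, 45, 46, 52, 70])
print("Mean:", ages.mean())
print("Median:", np.median(ages))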
Mean: 29.962962962962962
Median: 25.0
mode_age = stats.mode(ages)
print(mode_age)
print(f"Mode: {mode_age[0]} with count: {mode_age[1]}")
ModeResult(mode=np.int64(25), count=np.int64(4))
Mode: 25 with count: 4
The modes of the ages are 25 and 35, but stats.mode() returns only the smallest of the values that appear most frequently. Logically, therefore, the data is bimodal.
Midrange: 41.5
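The midrange printed above is the average of the minimum and maximum; computed from the assumed ages array:

midrange = (ages.min() + ages.max()) / 2  # (13 + 70) / 2 = 41.5 for the assumed data
print("Midrange:", midrange)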
# (d) First Quartile (Q1) and Third Quartile (Q3)
Q1 = np.percentile(ages, 25)
Q3 = np.percentile(ages, 75)
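Part (e) is not shown in the export; assuming it asks for the five-number summary (min, Q1, median, Q3, max), a minimal sketch:

# (e) Five-number summary (assumed part): min, Q1, median, Q3, max
five_number = [ages.min(), Q1, np.median(ages), Q3, ages.max()]
print("Five-number summary:", five_number)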
# (f) Boxplot
plt.boxplot(ages, vert=False)
plt.title('Boxplot of Ages')
plt.xlabel('Age')
plt.show()
Given the following grouped frequency distribution of ages, approximate the median using the median-class interpolation formula:

Age interval   Frequency
1–5            200
6–15           450
16–20          300
21–50          1500
51–80          700
81–110         44
intervals = [(1, 5), (6, 15), (16, 20), (21, 50), (51, 80), (81, 110)]
frequencies = [200, 450, 300, 1500, 700, 44]
N = sum(frequencies)

# Locate the median class: the first interval whose cumulative frequency reaches N/2
cumulative_frequency = 0
median_class_index = -1
for i, freq in enumerate(frequencies):
    cumulative_frequency += freq
    if cumulative_frequency >= N / 2:
        median_class_index = i
        break

L = intervals[median_class_index][0]  # lower boundary of the median class
F = cumulative_frequency - frequencies[median_class_index]  # cumulative frequency before it
f = frequencies[median_class_index]  # frequency of the median class
w = intervals[median_class_index][1] - intervals[median_class_index][0]  # class width

median = L + (((N / 2) - F) / f) * w
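With these figures, N = 3194, so N/2 = 1597 falls in the 21–50 class (cumulative frequency 950 before it), and the formula gives median = 21 + (1597 - 950)/1500 * 29, approximately 33.51.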
• 2.4 Suppose that a hospital tested the age and body fat data for 18 randomly selected
adults with the following results:
age 23 23 27 27 39 41 47 49 50
%fat 9.5 26.5 7.8 17.8 31.4 25.9 27.4 27.2 31.2
age 52 54 54 56 57 58 58 60 61
%fat 34.6 42.5 28.8 33.4 30.2 34.1 32.9 41.2 35.7
• (a) Calculate the mean, median, and standard deviation of age and %fat.
import numpy as np

ages = np.array([23, 23, 27, 27, 39, 41, 47, 49, 50,
                 52, 54, 54, 56, 57, 58, 58, 60, 61])
body_fat = np.array([9.5, 26.5, 7.8, 17.8, 31.4, 25.9, 27.4, 27.2, 31.2,
                     34.6, 42.5, 28.8, 33.4, 30.2, 34.1, 32.9, 41.2, 35.7])

mean_age = np.mean(ages)
mean_fat = np.mean(body_fat)
median_age = np.median(ages)
median_fat = np.median(body_fat)
# Standard deviations, which the prompt asks for but the export omits
# (np.std defaults to the population form; pass ddof=1 for the sample form)
std_age = np.std(ages)
std_fat = np.std(body_fat)
• (b) Draw the boxplots for age and %fat.
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.boxplot(ages, vert=False)
plt.title('Boxplot of Age')
plt.xlabel('Age')
plt.subplot(1, 2, 2)
plt.boxplot(body_fat, vert=False)
plt.title('Boxplot of %Fat')
plt.xlabel('%Fat')
plt.tight_layout()
plt.show()
• (c) Draw a scatter plot and a q-q plot based on these two variables.
plt.figure(figsize=(10, 5))
plt.scatter(ages, body_fat, color='blue')
plt.title('Scatter Plot of Age vs %Fat')
plt.xlabel('Age')
plt.ylabel('%Fat')
plt.grid(True)
plt.show()
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
stats.probplot(ages, dist="norm", plot=plt)
plt.title('Q-Q Plot of Age')
plt.subplot(1, 2, 2)
stats.probplot(body_fat, dist="norm", plot=plt)
plt.title('Q-Q Plot of %Fat')
plt.tight_layout()
plt.show()
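Points that fall along the reference line in a q-q plot indicate the sample quantiles match the normal quantiles, i.e. approximate normality; systematic curvature away from the line suggests skewness or heavy tails.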
• 2.8 It is important to define or select similarity measures in data analysis. However, there
is no commonly accepted subjective similarity measure. Results can vary depending on
the similarity measures used. Nonetheless, seemingly different similarity measures may
be equivalent after some transformation. Suppose we have the following 2-D data set:
A1 A2
x1 1.5 1.7
x2 2 1.9
x3 1.6 1.8
x4 1.2 1.5
x5 1.5 1.0
• (a) Consider the data as 2-D data points. Given a new data point, x = (1.4,1.6) as a query,
rank the database points based on similarity with the query using Euclidean distance,
Manhattan distance, supremum distance, and cosine similarity.
import numpy as np

data_points = np.array([
    [1.5, 1.7],
    [2.0, 1.9],
    [1.6, 1.8],
    [1.2, 1.5],
    [1.5, 1.0]
])
# The query point and the distance computations are missing from the export;
# reconstructed here from the prompt
query_point = np.array([1.4, 1.6])

diffs = data_points - query_point
euclidean_distances = np.linalg.norm(diffs, axis=1)  # L2
manhattan_distances = np.abs(diffs).sum(axis=1)      # L1
supremum_distances = np.abs(diffs).max(axis=1)       # L-infinity (Chebyshev)
cosine_similarities = (data_points @ query_point) / (
    np.linalg.norm(data_points, axis=1) * np.linalg.norm(query_point))

# argsort ranks most to least similar (ascending distance; descending similarity via the sign flip)
euclidean_rank = np.argsort(euclidean_distances)
manhattan_rank = np.argsort(manhattan_distances)
supremum_rank = np.argsort(supremum_distances)
cosine_rank = np.argsort(-cosine_similarities)
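For this query, the code above should yield the ordering x1, x4, x3, x5, x2 under both Euclidean and Manhattan distance, and x1, x3, x4, x2, x5 under cosine similarity; the supremum distance ties x3 with x4 (0.2) and x2 with x5 (0.6).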
• (b) Normalize the data set to make the norm of each data point equal to 1. Use Euclidean
distance on the transformed data to rank the data points.
# Scale each point to unit Euclidean norm, then rank by Euclidean distance
normalized_data_points = data_points / np.linalg.norm(data_points, axis=1, keepdims=True)
normalized_query_point = query_point / np.linalg.norm(query_point)
normalized_euclidean_distances = np.linalg.norm(
    normalized_data_points - normalized_query_point, axis=1)
normalized_euclidean_rank = np.argsort(normalized_euclidean_distances)
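Note that once all vectors are scaled to unit norm, ranking by Euclidean distance is equivalent to ranking by cosine similarity, since for unit vectors ||a - b||^2 = 2(1 - cos(a, b)); the resulting order therefore matches the cosine ranking from part (a): x1, x3, x4, x2, x5.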