Data Science Practicals with Solutions — B.Sc. CS, Semester 6
PRACTICAL NO:-01
A. Read data from CSV and JSON files into a data frame.
# Read data from a CSV file into a DataFrame and display it.
import pandas as pd

df = pd.read_csv('Student_Marks.csv')
print(df)  # show the loaded data (matches the "Output:-" section below)
Output:-
# Load a JSON file into a DataFrame and print its contents.
import pandas as pd

df = pd.read_json('dataset.json')
print(df)
Output:-
B. Perform basic data pre-processing tasks such as handling missing values and
outliers.
Code:-
# Handle missing values by replacing every NaN with 0.
import pandas as pd

df = pd.read_csv('titanic.csv')
print(df)
print(df.head(10))  # original had `print(df) df.head(10)` fused on one line (syntax error)
df2 = df.fillna(value=0)  # new frame with NaNs replaced by 0
print(df2)
Output:-
# Handle missing values by dropping every row that contains a NaN.
import pandas as pd

df = pd.read_csv('titanic.csv')
print(df)
print(df.head(10))  # original had `print(df) df.head(10)` fused on one line (syntax error)
df.dropna(inplace=True)  # remove rows with any missing value, in place
print(df)
Output:-
C. Manipulate and transform data using functions like filtering, sorting, and grouping
Code:-
# Manipulate and transform data: sorting and grouping the iris dataset.
import pandas as pd

# Load iris dataset
iris = pd.read_csv('Iris.csv')

# Sorting data: longest sepals first
sorted_iris = iris.sort_values(by='SepalLengthCm', ascending=False)
print("\nSorted iris dataset:")
print(sorted_iris.head())

# Grouping data: mean of each numeric column per species.
# numeric_only=True keeps this working on pandas >= 2.0, where .mean()
# raises on non-numeric columns instead of silently dropping them.
grouped_species = iris.groupby('Species').mean(numeric_only=True)
print("\nMean measurements for each species:")
print(grouped_species)
Output:-
PRACTICAL NO:-02
Label Encoding
Code:-
# Encode the categorical Species column as integer codes.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

iris = pd.read_csv("Iris.csv")
print(iris)

encoder = LabelEncoder()
iris['code'] = encoder.fit_transform(iris.Species)  # 0..n_classes-1
print(iris)
Output:-
PRACTICAL NO:-03
Hypothesis Testing
Conduct a hypothesis test using appropriate statistical tests (e.g., t-test, chi-square test).
# t-test
# Independent two-sample t-test on two synthetic samples.
# NOTE(review): the original extraction lost the sample-generation and test
# lines (`p_value` was printed but never computed, and two bare `else:`
# branches remained); this reconstructs a minimal working example.
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

np.random.seed(42)  # reproducible samples
group1 = np.random.normal(loc=50, scale=5, size=100)
group2 = np.random.normal(loc=53, scale=5, size=100)

# Perform the t-test
t_statistic, p_value = stats.ttest_ind(group1, group2)
print(f'T-statistic: {t_statistic}')
print(f'P-value: {p_value}')

# Visualise the two samples
plt.figure(figsize=(10, 6))
plt.hist(group1, alpha=0.5, label='Group 1')
plt.hist(group2, alpha=0.5, label='Group 2')
plt.legend()
plt.show()

# Draw Conclusions
alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis: the group means differ significantly.")
else:
    print("Fail to reject the null hypothesis: no significant difference found.")
Output:-
#chi-test
# Chi-square test of independence between binned horsepower and model year.
import pandas as pd
import numpy as np
import seaborn as sb
from scipy import stats  # was missing: stats.chi2_contingency is used below
import warnings
warnings.filterwarnings('ignore')

df = sb.load_dataset('mpg')
print(df)
print(df['horsepower'].describe())   # original fused these statements on one line
print(df['model_year'].describe())

# Bin horsepower into low / medium / high
bins = [0, 75, 150, 240]
df['horsepower_new'] = pd.cut(df['horsepower'], bins=bins, labels=['l', 'm', 'h'])
c = df['horsepower_new']
print(c)

# Bin model year into three eras
ybins = [69, 72, 74, 84]
label = ['t1', 't2', 't3']
df['modelyear_new'] = pd.cut(df['model_year'], bins=ybins, labels=label)
newyear = df['modelyear_new']
print(newyear)

# Contingency table and chi-square test
df_chi = pd.crosstab(df['horsepower_new'], df['modelyear_new'])
print(df_chi)
print(stats.chi2_contingency(df_chi))  # original was missing the closing paren
Output:
PRACTICAL NO:-04
# One-way ANOVA across three groups, followed by Tukey's HSD post-hoc test.
# NOTE(review): the original extraction lost the group data and the test
# calls (`f_statistics`, `p_value`, `tukey_results` were printed but never
# defined); this reconstructs a minimal working example with scipy.
import pandas as pd
import numpy as np
from scipy import stats

np.random.seed(0)  # reproducible groups
group_a = np.random.normal(60, 5, 30)
group_b = np.random.normal(65, 5, 30)
group_c = np.random.normal(70, 5, 30)

# One-way ANOVA: are the three group means equal?
f_statistics, p_value = stats.f_oneway(group_a, group_b, group_c)
print("one-way ANOVA:")
print("F-statistics:", f_statistics)
print("p-value", p_value)

# Tukey's HSD: which specific pairs of groups differ?
tukey_results = stats.tukey_hsd(group_a, group_b, group_c)
print(tukey_results)
Output:-
PRACTICAL NO:-05
Regression and its Types.
# Simple and multiple linear regression on the California Housing dataset.
# NOTE(review): the original extraction lost the sklearn imports, the
# construction of housing_df, the first train/test split, and the
# prediction/metric lines; reconstructed below so every printed name exists.
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

housing = fetch_california_housing()
housing_df = pd.DataFrame(housing.data, columns=housing.feature_names)
print(housing_df)
housing_df['PRICE'] = housing.target

# --- Simple linear regression: one predictor (average rooms) ---
X = housing_df[['AveRooms']]
y = housing_df['PRICE']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean squared error:", mse)
print("R-squared:", r2)
print("Intercept:", model.intercept_)
print("Coefficient:", model.coef_)  # original "CoefÏcient" was mojibake

# --- Multiple linear regression: all predictors ---
X = housing_df.drop('PRICE', axis=1)
y = housing_df['PRICE']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean squared error:", mse)
print("R-squared:", r2)
print("Intercept:", model.intercept_)
print("Coefficient:", model.coef_)
PRACTICAL NO:-06
# Binary classification on Iris (classes 0 and 1 only) with logistic
# regression and a decision tree.
# NOTE(review): the original extraction lost the sklearn imports, the
# construction of iris_df, and the train/test split; reconstructed below.
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['target'] = iris.target

# Keep only classes 0 and 1 so the problem is binary.
binary_df = iris_df[iris_df['target'] != 2]
X = binary_df.drop('target', axis=1)
y = binary_df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)  # original had typo `ogistic_model.fit(...)`
y_pred_logistic = logistic_model.predict(X_test)
print("\nClassification Report")
print(classification_report(y_test, y_pred_logistic))

decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(X_train, y_train)
y_pred_tree = decision_tree_model.predict(X_test)
print("\nClassification Report")
print(classification_report(y_test, y_pred_tree))
Output:-
PRACTICAL NO:-07
K-Means clustering
# Elbow method for choosing k in K-Means clustering.
# NOTE(review): the original extraction lost the data loading, the imports,
# and the plt.plot call (`data` and `continuous_features` were undefined);
# reconstructed below assuming the Iris.csv used in earlier practicals.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

data = pd.read_csv('Iris.csv')
continuous_features = data.select_dtypes(include='number').columns
print(data[continuous_features].describe())
print(data.head())
data = data[continuous_features]  # cluster on numeric columns only

# Scale every feature to [0, 1] so no feature dominates the distances.
mms = MinMaxScaler()
mms.fit(data)
data_transformed = mms.transform(data)

# Fit K-Means for each k and record the inertia (sum of squared distances).
sum_of_squared_distances = []
K = range(1, 15)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(data_transformed)
    sum_of_squared_distances.append(km.inertia_)

plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('sum_of_squared_distances')
plt.show()
PRACTICAL NO:-08
# Principal Component Analysis on the iris data: explained variance plot,
# then projection onto enough components for 95% of the variance.
# NOTE(review): the original extraction lost the imports, the definition of
# X/y and n_components, and the plotting calls; reconstructed below.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

iris = load_iris()
X, y = iris.data, iris.target

# Standardise features: PCA is sensitive to scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA()
X_pca = pca.fit_transform(X_scaled)
explained_variance_ratio = pca.explained_variance_ratio_

# Variance explained by each component
plt.figure(figsize=(8, 6))
plt.bar(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio)
plt.xlabel('Principal component')
plt.ylabel('Explained variance ratio')
plt.grid(True)
plt.show()

# Keep the fewest components that explain at least 95% of the variance.
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)
n_components = int(np.argmax(cumulative_variance_ratio >= 0.95)) + 1
pca = PCA(n_components=n_components)
X_reduced = pca.fit_transform(X_scaled)

# Scatter of the first two principal components, coloured by class.
plt.figure(figsize=(8, 6))
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y)
plt.colorbar(label='Target')
plt.show()
Output:-
PRACTICAL NO:-09
# Visualise the frequency of a random categorical variable as a bar chart.
# NOTE(review): the original extraction lost the numpy/matplotlib imports,
# the plotting call, and left stray extra plt.show() lines; reconstructed.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)  # reproducible sample
data = pd.DataFrame({
    'category': pd.Series(
        np.random.choice(['A', 'B', 'C', 'D'], size=1000, p=[0.4, 0.3, 0.2, 0.1]),
        dtype='category'),
})

plt.figure(figsize=(10, 6))
data['category'].value_counts().plot(kind='bar')
plt.ylabel('Count', fontsize=14)
plt.xticks(rotation=45)
plt.show()
Output:-
# Data Storytelling: narrative conclusions printed alongside the figures.
# The original extraction broke each long string literal across physical
# lines (a syntax error); rejoined here with implicit string concatenation,
# text unchanged.
print("\nThe scatter plot (Figure 1) shows the relationship between Variable 1 and "
      "Variable 2. We can observe a positive correlation, indicating that as Variable 1 increases, "
      "Variable 2 tends to increase as well. However, there is a considerable amount of scatter, "
      "suggesting that other factors may influence this relationship.")
print("\nScatter Plot")
print("\nBar Chart")
print("\nHeatmap")
print("\nIn summary, the visualizations and analysis provide insights into the "
      "relationships between variables, the distribution of categories, and the correlations "
      "between numerical variables. These findings can be used to inform further analysis, "
      "decision-making, or to generate new hypotheses for investigation.")
Output:-