ML 1-10

The document contains code to analyze Bollywood movie data with pandas and perform descriptive analysis. It calculates genre counts, groups the data by genre and release time to get movie counts, computes return on investment (ROI) for each movie, compares ROI across release times and genres, analyzes correlations between variables, and creates box plots to compare YouTube likes by genre. Further exercises cover exploratory analysis, feature engineering, feature relevance and redundancy, classification, regression, clustering, association rules, neural networks, independent component analysis, and ensemble learning.


1.DESCRIPTIVE ANALYSIS

import pandas as pd
df=pd.read_csv('bollywood.csv')
df.shape

OP:

df.info()

OP:

df.Genre.value_counts()

OP:
print(df.groupby('Genre')['MovieName'].count().sort_values(ascending=False))

OP:

df.groupby(['Genre','ReleaseTime'])['MovieName'].count()

OP:

dft = df.copy()  # copy so that adding the ROI column does not silently modify df
dft['ROI'] = (dft['BoxOfficeCollection'] - dft['Budget'])/dft['Budget']
dft.sort_values('ROI',ascending=False)['MovieName'].head(10)

OP:
print(dft.groupby(['ReleaseTime'])['ROI'].mean())
dft.groupby(['ReleaseTime'])['ROI'].mean().plot()
OP:

df['Budget'].plot.hist()

OP:
import matplotlib.pyplot as plt
dft.loc[dft.Genre == "Comedy"]['ROI'].plot.hist(alpha=0.6, label='Comedy')
dft.loc[dft.Genre == "Drama"]['ROI'].plot.hist(alpha=0.6, label='Drama')
plt.legend()

print("Comedy has higher ROI")

OP:

boc = df['BoxOfficeCollection']
yl = df['YoutubeLikes']
boc.corr(yl)

OP:

df.groupby('Genre')['YoutubeLikes'].sum().sort_values(ascending=False).head(1)

OP:

df.loc[df.Genre == 'Action']['YoutubeLikes'].plot.box()

OP:
df.loc[df.Genre == 'Comedy']['YoutubeLikes'].plot.box()

OP:

df.loc[df.Genre == 'Drama']['YoutubeLikes'].plot.box()

OP:
df.loc[df.Genre == 'Romance']['YoutubeLikes'].plot.box()

OP:

df.loc[df.Genre == 'Thriller']['YoutubeLikes'].plot.box()

OP:
import seaborn as sns
btod = df[['Budget','BoxOfficeCollection','YoutubeViews','YoutubeLikes','YoutubeDislikes']]
correlation = btod.corr()
sns.pairplot(correlation)  # note: pairplot is usually applied to the raw columns (btod); a heatmap better suits a correlation matrix

OP:
2.EXPLORATORY ANALYSIS

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
data=pd.read_csv('Data.csv')
df=pd.DataFrame(data)

df.head()
OP:

df.shape
df.info()

OP:
df.describe()
type(df)
list(df.columns)
df.iloc[0:9,0:4]
df.describe(include=['O'])
OP:

corr_df = df[['sbp', 'obesity', 'age', 'ldl']]
correlation_matrix = corr_df.corr()
sn.pairplot(correlation_matrix)
plt.show()

OP:

chd_stats = df.groupby('chd')['tobacco'].describe()
print(chd_stats)
sn.regplot(x="age", y="sbp", data=df)
plt.show()

OP:
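An optional follow-up sketch (not part of the original listing): the same correlation matrix can be shown as an annotated heatmap, which is often easier to read than a pairplot of the matrix.

# assumes corr_df and correlation_matrix from the cell above
sn.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation between sbp, obesity, age and ldl')
plt.show()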
bins = [0, 15, 25, 35, float('inf')]  # assumed edges; the original listing repeated 15
labels = ['young', 'adult', 'mid', 'old']
df['agegroup'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)
print(df)

OP:
3.FEATURE ENGINEERING

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('iris.csv')
df.head()
OP:

df.info()
df.tail()
df.describe()

OP:
df.isnull().sum()

OP:

df.loc[df.isnull().any(axis=1)]

OP:

df = df.dropna()
df.duplicated().sum()
df.loc[df.duplicated()]
df['sepal_length'] = df['sepal_length'].astype(int)

OP:

df.rename(columns={'old_column_name': 'new_column_name'}, inplace=True)


df['sum_lengths'] = df['sepal_length'] + df['petal_length']
df.head()
from scipy import stats
z_scores = stats.zscore(df['sepal_length'])
OP:

z_scores
df[(z_scores > 3)]

OP:
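Note that `z_scores > 3` only flags the upper tail; a hedged variant (not part of the original listing) that catches outliers in both tails:

import numpy as np
outliers = df[np.abs(z_scores) > 3]  # |z| > 3 also flags unusually small values
print(outliers)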

df.to_csv("C:\\Users\\STUDENT\\Downloads\\22128018\\iris_data.csv", index=False)

OP:
4.MEASURE OF FEATURE RELEVANCE & REDUNDANCY

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Load the dataset


df = pd.read_csv("DAD.csv")

# Task 2: Build correlation matrix


numeric_cols = df.select_dtypes(include=np.number).columns
correlation_matrix = df[numeric_cols].corr().abs()

# Identify highly correlated features


high_correlation_vars = np.where(correlation_matrix > 0.7)
high_correlation_vars = [(correlation_matrix.index[x], correlation_matrix.columns[y])
                         for x, y in zip(*high_correlation_vars) if x != y and x < y]
print("Highly correlated features:", high_correlation_vars)

OP:

# Task 3: Create BMI feature


df['BMI'] = df['Body Weight'] / ((df['Body Height'] / 100) ** 2)

# Task 4: Impute 'None' for missing values in Past Medical History Code
df['Past Medical History Code'].fillna('None', inplace=True)

# Display column names in the DataFrame


print("Column names in the DataFrame:")
print(df.columns)

OP:
# Task 5: Select features for model building
selected_features = ['Age', 'Gender', 'Marital Status', 'Key Complaints Code', 'HR Pulse',
                     'BP High', 'BP Low', 'RR', 'Past Medical History Code', 'HB', 'Urea',
                     'Creatinine', 'Mode of Arrival', 'Ambulance',
                     'Walked Instate at the Time of Arrival', 'Type of Admission', 'Elective',
                     'Emergency', 'BMI', 'Cost of Implant']

# Task 6: Encode categorical features


categorical_features = ['Gender', 'Marital Status', 'Key Complaints Code',
                        'Past Medical History Code', 'Mode of Arrival', 'Type of Admission',
                        'Elective', 'Emergency', 'Ambulance',
                        'Walked Instate at the Time of Arrival', 'Implant Used Yes/no']

# Filter categorical features for encoding


df_categorical = df[categorical_features]
for col in df_categorical.columns:
    df[col] = LabelEncoder().fit_transform(df[col])

# Create dummy variables for categorical features


df = pd.get_dummies(df, columns=categorical_features, drop_first=True)

# Display column names after encoding


print("\nColumn names after encoding:")
print(df.columns)

# Update selected_features based on column names after encoding


selected_features = [col for col in selected_features if col in df.columns]
OP:

# Task 7: Check for multicollinearity


X = df[selected_features]
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
print("\nVIF:")
print(vif_data)

OP:
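The table above lists one VIF per feature; a common next step is to prune collinear features. A hedged sketch of that loop (the threshold of 10 and the pruning strategy are assumptions, not part of the original listing):

# iteratively drop the feature with the largest VIF until every VIF is below 10
X_reduced = X.copy()
while True:
    vifs = pd.Series(
        [variance_inflation_factor(X_reduced.values.astype(float), i) for i in range(X_reduced.shape[1])],
        index=X_reduced.columns)
    if vifs.max() <= 10:
        break
    X_reduced = X_reduced.drop(columns=[vifs.idxmax()])
print("Retained features:", list(X_reduced.columns))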

# Task 8: Find outliers using Z-score


z_scores = np.abs((df[selected_features] - df[selected_features].mean()) / df[selected_features].std())
outliers_zscore = df[(z_scores > 3).any(axis=1)]
print("Outliers using Z-score:")
print(outliers_zscore)

OP:

# Task 9: Split the data


X_train, X_test, y_train, y_test = train_test_split(X, df['Total Cost to Hospital'], test_size=0.2, random_state=42)

# Task 10: Build regression model with statsmodels


X_train_sm = sm.add_constant(X_train)
model = sm.OLS(y_train, X_train_sm).fit()
print("Regression Model Summary:")
print(model.summary())
# Task 11: Identify statistically significant features
significant_features = model.pvalues[model.pvalues < 0.05].index.tolist()
if 'const' in significant_features:
    significant_features.remove('const')  # 'const' is the intercept term, not a column of X
print("Significant features:", significant_features)

OP:

# Task 12: Build linear regression model with significant features


X_train_sm = sm.add_constant(X_train[significant_features])
model_significant = sm.OLS(y_train, X_train_sm).fit()
print("Regression Model with Significant Features Summary:")
print(model_significant.summary())
OP:

# Task 13: Residual analysis


import matplotlib.pyplot as plt
sm.qqplot(model_significant.resid, line='s')
plt.title("Q-Q Plot of Residuals")
plt.show()

OP:
# Task 14: Predict and evaluate RMSE
X_test_sm = sm.add_constant(X_test[significant_features])
y_pred = model_significant.predict(X_test_sm)

from sklearn.metrics import mean_squared_error


rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)

OP:
5.SUPERVISED LEARNING TECHNIQUE - CLASSIFICATION

A)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
dataset = pd.read_csv("loan_data_set.csv")
dataset.head()

OP:

X = dataset.drop('Loan_Status', axis=1)
y = dataset['Loan_Status']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=38, stratify=y)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train, y_train)

OP:
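If the raw loan dataset still contains object-typed or missing values, the `knn.fit` call above will raise an error. A minimal preprocessing sketch (an assumption about the data, not part of the original listing) that could be applied to X before splitting:

from sklearn.preprocessing import LabelEncoder
X_enc = X.copy()
for col in X_enc.select_dtypes(include='object').columns:
    X_enc[col] = LabelEncoder().fit_transform(X_enc[col].astype(str))  # encode text columns
X_enc = X_enc.fillna(X_enc.median(numeric_only=True))                  # impute remaining gaps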

#Prediction of test set


prediction_knn = knn.predict(X_test)
#Print the predicted values
print("Prediction for test set: {}".format(prediction_knn))

OP:

a = pd.DataFrame({'Actual value': y_test, 'Predicted value': prediction_knn})


a.head()

OP:

from sklearn import metrics


from sklearn.metrics import classification_report, confusion_matrix
matrix = confusion_matrix(y_test, prediction_knn)
sns.heatmap(matrix, annot=True, fmt="d")
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
print(classification_report(y_test, prediction_knn))

OP:
5-B

import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree  # Import Decision Tree Classifier and plot_tree
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics
col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
# Load dataset
pima = pd.read_csv("diabetes.csv", header=None, names=col_names, skiprows=1)

# Split dataset into features and target variable


feature_cols = ['pregnant', 'insulin', 'bmi', 'age','glucose','bp','pedigree']
X = pima[feature_cols] # Features
y = pima.label # Target variable

# Split dataset into training set and test set


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)  # 70% training and 30% test

# Create Decision Tree classifer object


clf = DecisionTreeClassifier()

# Train Decision Tree Classifer


clf = clf.fit(X_train,y_train)

# Predict the response for test dataset


y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?


print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Visualize the decision tree


import matplotlib.pyplot as plt
plt.figure(figsize=(20,10))
plot_tree(clf, feature_names=feature_cols, class_names=['0','1'], filled=True)
plt.show()

OP:
5-C

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv("titanic.csv")
df.head()

OP:

df.drop(['Cabin','PassengerId','Name','Ticket'],axis=1,inplace=True)
df = df.fillna(0)
from sklearn.preprocessing import LabelEncoder
#Initialize LabelEncoder
le = LabelEncoder()
# Apply label encoding to 'Sex' column
df['Sex'] = le.fit_transform(df['Sex'].astype(str))
# Apply label encoding to 'Embarked' column
df['Embarked'] = le.fit_transform(df['Embarked'].astype(str))

# Putting feature variable to X


X = df.drop('Survived',axis=1)
# Putting response variable to y
y = df['Survived']

# Splitting the data into train and test


from sklearn.model_selection import train_test_split
# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

#Import Random Forest Model


from sklearn.ensemble import RandomForestClassifier
#Create a Random Forest classifier
clf=RandomForestClassifier(n_estimators=100)
#Train the model using the training sets
clf.fit(X_train,y_train)

OP:

# Predicting the test set results


Pred = clf.predict(X_test)
print(Pred)

OP:
from sklearn.metrics import classification_report
rand_score=clf.score(X_test, y_test)
classification_report_rf=classification_report(y_test,Pred)
print("Accuracy score:",rand_score)
print(classification_report_rf)

OP:

5-D

class Dog:
    attr1 = "mammal"
    attr2 = "dog"

    def fun(self):
        print("I'm a", self.attr1)
        print("I'm a", self.attr2)

# Object instantiation
Rodger = Dog()

# Accessing class attributes and methods through the object
print(Rodger.attr1)
Rodger.fun()

OP:
6.SUPERVISED LEARNING METHOD - REGRESSION

A)
import pandas as pd
import numpy as np
import matplotlib.pyplot as mtplt

def estimate_coeff(p, q):
    # Here, we will estimate the total number of points or observations
    n1 = np.size(p)
    # Now, we will calculate the mean of the p and q vectors
    m_p = np.mean(p)
    m_q = np.mean(q)

    # here, we will calculate the cross deviation and deviation about p
    SS_pq = np.sum(q * p) - n1 * m_q * m_p
    SS_pp = np.sum(p * p) - n1 * m_p * m_p

    # here, we will calculate the regression coefficients
    b_1 = SS_pq / SS_pp
    b_0 = m_q - b_1 * m_p

    return (b_0, b_1)


def plot_regression_line(p, q, b):
    # Now, we will plot the actual points or observations as a scatter plot
    mtplt.scatter(p, q, color="m", marker="o", s=30)

    # here, we will calculate the predicted response vector
    q_pred = b[0] + b[1] * p

    # here, we will plot the regression line
    mtplt.plot(p, q_pred, color="g")

    # here, we will put the labels
    mtplt.xlabel('p')
    mtplt.ylabel('q')

    # here, we will show the plot
    mtplt.show()
def main():
    # entering the observation points or data
    p = np.array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19])
    q = np.array([11, 13, 12, 15, 17, 18, 18, 19, 20, 22])

    # now, we will estimate the coefficients
    b = estimate_coeff(p, q)
    print("Estimated coefficients are:\nb_0 = {}\nb_1 = {}".format(b[0], b[1]))

    # Now, we will plot the regression line
    plot_regression_line(p, q, b)

if __name__ == "__main__":
    main()

OP:
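As a quick sanity check (a sketch, not part of the original listing), NumPy's polyfit should reproduce the same intercept and slope on the sample data from main():

import numpy as np
p = np.array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19])
q = np.array([11, 13, 12, 15, 17, 18, 18, 19, 20, 22])
b_1, b_0 = np.polyfit(p, q, 1)  # polyfit returns [slope, intercept] for degree 1
print("b_0 =", b_0, "b_1 =", b_1)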
6-B

import matplotlib.pyplot as mtpplt
import pandas as pd
import numpy as np
from sklearn import datasets as DS
from sklearn import linear_model as LM
from sklearn import metrics as mts
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]
H = data
f = target
from sklearn.model_selection import train_test_split as tts
H_train, H_test, f_train, f_test = tts(H, f, test_size=0.4, random_state=1)

# Here, we will create linear regression object


reg1 = LM.LinearRegression()

# Now, we will train the model by using the training sets


reg1.fit(H_train, f_train)

# here, we will print the regression coefficients


print('Regression Coefficients are: ', reg1.coef_)

# Here, we will print the variance score: 1 means perfect prediction


print('Variance score is: {}'.format(reg1.score(H_test, f_test)))

# Here, we will plot for residual error

# here, we will set the plot style


mtpplt.style.use('fivethirtyeight')

# here we will plot the residual errors in training data


mtpplt.scatter(reg1.predict(H_train), reg1.predict(H_train) - f_train, color="green", s=10, label='Train data')

# Here, we will plot the residual errors in test data
mtpplt.scatter(reg1.predict(H_test), reg1.predict(H_test) - f_test, color="blue", s=10, label='Test data')

# Here, we will plot the line for zero residual error

mtpplt.hlines(y=0, xmin=0, xmax=50, linewidth=2)

# here, we will plot the legend


mtpplt.legend(loc='upper right')

# now, we will plot the title


mtpplt.title("Residual errors")

# here, we will define the method call for showing the plot
mtpplt.show()

OP:
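A small follow-up sketch (not in the original listing): besides the variance score, the fitted model's test-set error can be summarized with RMSE.

import numpy as np
from sklearn.metrics import mean_squared_error
f_pred = reg1.predict(H_test)
rmse = np.sqrt(mean_squared_error(f_test, f_pred))
print("Test RMSE: {:.3f}".format(rmse))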
6-C

# Import necessary libraries


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc

# Load the diabetes dataset


diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target

# Convert the target variable to binary (1 for diabetes, 0 for no diabetes)


y_binary = (y > np.median(y)).astype(int)

# Split the data into training and testing sets


X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the Logistic Regression model


model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model


y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

# evaluate the model


print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Visualize the decision boundary with accuracy information


plt.figure(figsize=(8, 6))
sns.scatterplot(x=X_test[:, 2], y=X_test[:, 0], hue=y_test, palette={0: 'blue', 1: 'red'}, marker='o')  # column 2 = bmi, column 0 = age in the sklearn diabetes data (the original plotted column 8)
plt.xlabel("BMI")
plt.ylabel("Age")
plt.title("Logistic Regression Decision Boundary\nAccuracy: {:.2f}%".format(accuracy * 100))
plt.legend(title="Diabetes", loc="upper right")
plt.show()

y_prob = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2,label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve\nAccuracy: {:.2f}%'.format(accuracy * 100))
plt.legend(loc="lower right")
plt.show()

OP:
7.UNSUPERVISED LEARNING METHODS

A)
import pandas as pd
df = pd.read_csv("Mall_Customers.csv")
print(df.head())

from sklearn.cluster import KMeans

#define the inputs we will use for our K-means clustering algorithm
X = df[['Age', 'Spending Score (1-100)']].copy()

#fit K-means for k = 1..10 and append each within-cluster sum of squares (WCSS) to a list
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

print(wcss)

import matplotlib.pyplot as plt


import seaborn as sns

#style the plots using Seaborn


sns.set()

#plot the WCSS versus the clusters


plt.plot(range(1, 11), wcss)

#add a title and label the axes


plt.title('Selecting the Number of Clusters using the Elbow Method')
plt.xlabel('Clusters')
plt.ylabel('WCSS')
plt.show()

OP:
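A hedged follow-up sketch (not part of the original listing): fit the final model with a cluster count read off the elbow plot; k=5 is an assumption here, not a value reported in the output.

kmeans = KMeans(n_clusters=5, random_state=0)
cluster_labels = kmeans.fit_predict(X)
plt.scatter(X['Age'], X['Spending Score (1-100)'], c=cluster_labels)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c='red', marker='x')
plt.xlabel('Age')
plt.ylabel('Spending Score (1-100)')
plt.show()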


7-B

import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
raw_data= pd.read_excel(r"C:\Users\STUDENT\Downloads\OnlineRetail.xlsx")
print(raw_data)

def prepare_retail(dataframe):
    # preparing dataset
    dataframe.dropna(inplace=True)
    dataframe = dataframe[~dataframe["InvoiceNo"].str.contains("C", na=False)]
    dataframe = dataframe[dataframe["Quantity"] > 0]
    dataframe = dataframe[dataframe["UnitPrice"] > 0]
    return dataframe

df = prepare_retail(raw_data)

def create_apriori_datastructure(dataframe, id=False):
    # build a one-hot basket matrix: one row per invoice, one column per product
    if id:
        grouped = dataframe.groupby(['InvoiceNo', 'StockCode'], as_index=False).agg({'Quantity': 'sum'})
        apriori_datastructure = pd.pivot(data=grouped, index='InvoiceNo', columns='StockCode',
                                         values='Quantity').fillna(0).applymap(lambda x: 1 if x > 0 else 0)
        return apriori_datastructure
    else:
        grouped = dataframe.groupby(['InvoiceNo', 'Description'], as_index=False).agg({'Quantity': 'sum'})
        apriori_datastructure = pd.pivot(data=grouped, index='InvoiceNo', columns='Description',
                                         values='Quantity').fillna(0).applymap(lambda x: 1 if x > 0 else 0)
        return apriori_datastructure

germany_df = df[df['Country'] == 'Germany']


germany_df.head()

germany_apriori_df = create_apriori_datastructure(germany_df,True)
germany_apriori_df.head()

def get_rules(apriori_df, min_support=0.01):
    frequent_itemsets = apriori(apriori_df, min_support=min_support, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="support", min_threshold=min_support)
    return rules

germany_rules = get_rules(germany_apriori_df)
germany_rules.head()

def get_item_name(dataframe, stock_code):
    # look up the Description(s) for a single stock code or a list of stock codes
    if type(stock_code) != list:
        product_name = dataframe[dataframe["StockCode"] == stock_code][["Description"]].values[0].tolist()
        return product_name
    else:
        product_names = [dataframe[dataframe["StockCode"] == product][["Description"]].values[0].tolist()[0]
                         for product in stock_code]
        return product_names

get_item_name(germany_df, 10125)

OP:

def recommend_products(rules_df, product_id, rec_count=5):
    # we are sorting the rules dataframe by using the "lift" metric
    # (reset_index so that iloc[i] below lines up with the sorted order)
    sorted_rules = rules_df.sort_values('lift', ascending=False).reset_index(drop=True)
    recommended_products = []

    for i, product in sorted_rules["antecedents"].items():
        for j in list(product):
            if j == product_id:
                recommended_products.append(list(sorted_rules.iloc[i]["consequents"]))

    recommended_products = list({item for item_list in recommended_products for item in item_list})
    return recommended_products[:rec_count]

# simulating some products like they are in cart
TARGET_PRODUCT_ID_1 = 21987
TARGET_PRODUCT_ID_2 = 23235
TARGET_PRODUCT_ID_3 = 22747

get_item_name(germany_df, [TARGET_PRODUCT_ID_1, TARGET_PRODUCT_ID_2, TARGET_PRODUCT_ID_3])

OP:
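A usage sketch for the function above (assuming the rules were mined on StockCode columns, as with id=True earlier): recommend items for one of the simulated cart products and translate the codes back to names.

recs = recommend_products(germany_rules, TARGET_PRODUCT_ID_1, rec_count=5)
print(recs)
print(get_item_name(germany_df, recs))  # map recommended stock codes back to descriptions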
8.TRAINING ARTIFICIAL NEURAL NETWORK

import numpy as np
import pandas as pd
import tensorflow as tf

tf.__version__

dataset = pd.read_csv(r"C:\Users\STUDENT\Downloads\Churn_Modelling.csv")
X = dataset.iloc[:, 3:-1].values
y = dataset.iloc[:, -1].values

print(X)

print(y)

from sklearn.preprocessing import LabelEncoder


le = LabelEncoder()
X[:, 2] = le.fit_transform(X[:, 2])
print(X)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
print(X)

from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

from sklearn.preprocessing import StandardScaler


sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

ann = tf.keras.models.Sequential()
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
ann.compile(optimizer = 'adam', loss='binary_crossentropy', metrics = ['accuracy'])

ann.fit(X_train, y_train, batch_size = 32, epochs = 100)

OP:
y_pred = ann.predict(X_test)
y_pred = (y_pred > 0.5)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

from sklearn.metrics import confusion_matrix, accuracy_score


cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

OP:
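An illustrative single-customer prediction (a sketch with made-up values, not part of the original listing; the 12-column order is an assumption matching the one-hot Geography block followed by the remaining encoded features):

# [Geography one-hot (3), CreditScore, Gender, Age, Tenure, Balance,
#  NumOfProducts, HasCrCard, IsActiveMember, EstimatedSalary] -- assumed order
sample = sc.transform([[1, 0, 0, 600, 1, 40, 3, 60000, 2, 1, 1, 50000]])
print(ann.predict(sample) > 0.5)  # True would mean the customer is predicted to churn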
9.INDEPENDENT COMPONENT ANALYSIS

!pip install librosa
!pip install pydub
from pydub import AudioSegment
import IPython
import numpy as np
import wave
mix_1_wave = wave.open(r"C:\Users\aashw\Downloads\ICA mix 1.wav", 'r')

mix_1_wave.getparams()
OP:
_wave_params(nchannels=1, sampwidth=2, framerate=44100, nframes=264515, comptype='NONE', compname='not compressed')

# Extract Raw Audio from Wav File


signal_1_raw = mix_1_wave.readframes(-1)
signal_1 = np.frombuffer(signal_1_raw, dtype='int16')  # np.fromstring is deprecated for binary data
'length: ', len(signal_1), 'first 100 elements: ', signal_1[:100]
OP:

('length: ', 264515, 'first 100 elements: ', array([  879,  1268,  1460,  1756,  1943,  2216,  2407,  2668,  2866,
        3106,  3308,  3546,  3752,  3981,  4175,  4395,  4588,  4790,
        4966,  5146,  5292,  5436,  5550,  5643,  5717,  5759,  5790,
        5798,  5789,  5756,  5713,  5649,  5576,  5478,  5381,  5267,
        5146,  4999,  4856,  4682,  4502,  4308,  4097,  3875,  3637,
        3380,  3107,  2825,  2514,  2194,  1847,  1472,  1087,   671,
         227,  -219,  -691, -1176, -1666, -2167, -2669, -3179, -3668,
       -4170, -4643, -5116, -5559, -5985, -6380, -6765, -7105, -7422,
       -7706, -7955, -8163, -8339, -8470, -8557, -8600, -8618, -8585,
       -8524, -8425, -8298, -8129, -7947, -7720, -7475, -7205, -6916,
       -6606, -6266, -5922, -5556, -5165, -4774, -4353, -3922, -3476,
       -3021], dtype=int16))

import matplotlib.pyplot as plt


fs = mix_1_wave.getframerate()
timing = np.linspace(0, len(signal_1)/fs, num=len(signal_1))
plt.figure(figsize=(12,2))
plt.title('Recording 1')
plt.plot(timing,signal_1, c="#3ABFE7")
plt.ylim(-35000, 35000)
plt.show()
mix_2_wave = wave.open(r"C:\Users\aashw\Downloads\ICA mix 2.wav",'r')
# (the original listing re-opened "ICA mix 1.wav" for all three recordings,
#  which is why the sample tuples shown further below contain identical values)
#Extract Raw Audio from Wav File
signal_raw_2 = mix_2_wave.readframes(-1)
signal_2 = np.frombuffer(signal_raw_2, dtype='int16')
mix_3_wave = wave.open(r"C:\Users\aashw\Downloads\ICA mix 3.wav",'r')

#Extract Raw Audio from Wav File
signal_raw_3 = mix_3_wave.readframes(-1)
signal_3 = np.frombuffer(signal_raw_3, dtype='int16')

plt.figure(figsize=(12,2))
plt.title('Recording 2')
plt.plot(timing,signal_2, c="#3ABFE7")
plt.ylim(-35000, 35000)
plt.show()

plt.figure(figsize=(12,2))
plt.title('Recording 3')
plt.plot(timing,signal_3, c="#3ABFE7")
plt.ylim(-35000, 35000)
plt.show()

OP:

from IPython.display import Audio


import librosa
# Absolute path to the audio file
audio_path = r"C:\Users\aashw\Downloads\ICA mix 1.wav"
# Load the audio file
audio_data, sampling_rate = librosa.load(audio_path)
# Display the audio
Audio(data=audio_data, rate=sampling_rate)

OP:

from scipy.io import wavfile


from IPython.display import Audio
# Load audio file
sampling_rate, audio_data = wavfile.read(r"C:\Users\aashw\Downloads\ICA mix 1.wav")
# Display the audio
Audio(audio_data, rate=sampling_rate)

OP:

X = list(zip(signal_1, signal_2, signal_3))


# Let's peak at what X looks like
X[:10]

OP:
[(879, 879, 879),
(1268, 1268, 1268),
(1460, 1460, 1460),
(1756, 1756, 1756),
(1943, 1943, 1943),
(2216, 2216, 2216),
(2407, 2407, 2407),
(2668, 2668, 2668),
(2866, 2866, 2866),
(3106, 3106, 3106)]

from sklearn.decomposition import FastICA


# Initializing FastICA with n_components=3
ica = FastICA(n_components=3)
# Running the FastICA algorithm using fit_transform on dataset X
ica_result = ica.fit_transform(X)

ica_result.shape

OP:
(264515, 3)
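Optionally (a sketch, not part of the original listing), the estimated mixing matrix can be inspected; it maps the three recovered sources back to the three recorded mixtures.

print(ica.mixing_.shape)  # expected (3, 3): one column per independent component
print(ica.mixing_)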

result_signal_1 = ica_result[:,0]
result_signal_2 = ica_result[:,1]
result_signal_3 = ica_result[:,2]

# Independent Component #1
plt.figure(figsize=(12,2))
plt.title('Independent Component #1')
plt.plot(result_signal_1, c="#df8efd")
plt.ylim(-0.010, 0.010)
plt.show()

OP:

# Independent Component #2
plt.figure(figsize=(12,2))
plt.title('Independent Component #2')
plt.plot(result_signal_2, c="#87de72")
plt.ylim(-0.010, 0.010)
plt.show()

OP:
# Independent Component #3
plt.figure(figsize=(12,2))
plt.title('Independent Component #3')
plt.plot(result_signal_3, c="#f65e97")
plt.ylim(-0.010, 0.010)
plt.show()

OP:

from scipy.io import wavfile


# Converting to int, mapping the appropriate range, and increasing the volume a little bit
result_signal_1_int = np.int16(result_signal_1*32767*100)
result_signal_2_int = np.int16(result_signal_2*32767*100)
result_signal_3_int = np.int16(result_signal_3*32767*100)

# Writing wave files


wavfile.write("result_signal_1.wav", fs, result_signal_1_int)
wavfile.write("result_signal_2.wav", fs, result_signal_2_int)
wavfile.write("result_signal_3.wav", fs, result_signal_3_int)

IPython.display.Audio("result_signal_1.wav")
OP:

IPython.display.Audio("result_signal_3.wav")
OP:

IPython.display.Audio("result_signal_2.wav")
10.ENSEMBLE LEARNING ALGORITHM

A)
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
data = datasets.load_wine(as_frame = True)
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 22)
estimator_range = [2,4,6,8,10,12,14,16,18,20]
models = []
scores = []
for n_estimators in estimator_range:
    # Create a bagging classifier
    clf = BaggingClassifier(n_estimators = n_estimators, random_state = 22)

    # Fit the model
    clf.fit(X_train, y_train)

    # Append the model and score to their respective lists
    models.append(clf)
    scores.append(accuracy_score(y_true = y_test, y_pred = clf.predict(X_test)))

# Generate the plot of the scores against a number of the estimators


plt.figure(figsize=(9,6))
plt.plot(estimator_range, scores)

# Adjust labels and font (to make them visible)


plt.xlabel("n_estimators")
plt.ylabel("score")

# show the plot


plt.show()
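A short additional sketch (not in the original listing): BaggingClassifier can also report an out-of-bag accuracy estimate, which avoids re-using the test split when choosing n_estimators; the value 12 below is only an illustrative choice.

clf_oob = BaggingClassifier(n_estimators=12, oob_score=True, random_state=22)
clf_oob.fit(X_train, y_train)
print("Out-of-bag accuracy:", clf_oob.oob_score_)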
10-B

import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

#creating a list of values for years_experience & salary


years_experience = [1.1, 1.3, 1.5, 2.0, 2.2, 2.9, 3.0, 3.2, 3.2, 3.7, 3.9, 4.0, 4.0, 4.1, 4.5,
                    4.9, 5.1, 5.3, 5.9, 6.0, 6.8, 7.1, 7.9, 8.2, 8.7, 9.0, 9.5, 9.6, 10.3, 10.5]
salary = [39343.00, 46205.00, 37731.00, 43525.00, 39891.00, 56642.00, 60150.00, 54445.00,
          64445.00, 57189.00, 63218.00, 55794.00, 56957.00, 57081.00, 61111.00, 67938.00,
          66029.00, 83088.00, 81363.00, 93940.00, 91738.00, 98273.00, 101302.00, 113812.00,
          109431.00, 105582.00, 116969.00, 112635.00, 122391.00, 121872.00]

# Create a dataframe from lists


df = pd.DataFrame({'years_experience': years_experience, 'salary': salary})

# Split the data into training and testing sets


X = df[['years_experience']]
y = df['salary']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a GradientBoostingRegressor model


model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
model.fit(X_train, y_train)

# Make a prediction on the test data


y_pred = model.predict(X_test)

# Print the evaluation metrics
r2 = model.score(X_test, y_test)
print("Mean absolute error is: ", mean_absolute_error(y_test, y_pred))
print("R-squared score is: ", r2)

OP:
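An illustrative usage sketch (the 7.5-years value is made up): predict the salary for a new experience level with the fitted model.

new_exp = pd.DataFrame({'years_experience': [7.5]})
print("Predicted salary:", model.predict(new_exp))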
