ML 1-10
ML 1-10
1.DESCRIPTIVE ANALYSIS
import pandas as pd
df=pd.read_csv('bollywood.csv')
df.shape
OP:
df.info()
OP:
df.Genre.value_counts()
OP:
print(df.groupby('Genre')['MovieName'].count().sort_values(ascending=False))
OP:
df.groupby(['Genre','ReleaseTime'])['MovieName'].count()
OP:
# Work on a copy: a plain `dft = df` only creates a second name for the same
# object, so adding the derived ROI column would also modify `df`.
dft = df.copy()
# ROI = (box-office collection - budget) / budget, computed per movie.
dft['ROI'] = (dft['BoxOfficeCollection'] - dft['Budget']) / dft['Budget']
# Names of the ten movies with the highest ROI.
dft.sort_values('ROI', ascending=False)['MovieName'].head(10)
OP:
# Average ROI for each ReleaseTime category; the same Series feeds both the
# printed table and the plot.
mean_roi_by_release = dft.groupby(['ReleaseTime'])['ROI'].mean()
print(mean_roi_by_release)
mean_roi_by_release.plot()
OP:
df['Budget'].plot.hist()
OP:
dft.loc[dft.Genre == "Comedy"]['ROI'].plot.hist()
dft.loc[dft.Genre == "Drama"]['ROI'].plot.hist()
OP:
boc = df['BoxOfficeCollection']
yl = df['YoutubeLikes']
boc.corr(yl)
OP:
df.groupby('Genre')['YoutubeLikes'].sum().sort_values(ascending=False).head(1)
OP:
df.loc[df.Genre == 'Action']['YoutubeLikes'].plot.box()
OP:
df.loc[df.Genre == 'Comedy']['YoutubeLikes'].plot.box()
OP:
df.loc[df.Genre == 'Drama']['YoutubeLikes'].plot.box()
OP:
df.loc[df.Genre == 'Romance']['YoutubeLikes'].plot.box()
OP:
df.loc[df.Genre == 'Thriller']['YoutubeLikes'].plot.box()
OP:
import seaborn as sns

# Numeric columns whose pairwise correlations are inspected.
btod = df[['Budget', 'BoxOfficeCollection', 'YoutubeViews', 'YoutubeLikes', 'YoutubeDislikes']]
correlation = btod.corr()
# Show the correlation matrix as an annotated heatmap. Passing the matrix to
# pairplot would treat the coefficients themselves as observations and
# scatter-plot them, which does not visualise the correlations.
sns.heatmap(correlation, annot=True)
OP:
2.EXPLORATORY ANALYSIS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
# Load the data set; the function is `read_csv` and the file name must be a
# quoted string.
data = pd.read_csv('Data.csv')
df = pd.DataFrame(data)
# Preview the first rows.
df.head()
OP:
df.shape
df.info()
OP:
df.describe()
type(df)
list(df.columns)
df.iloc[0:9,0:4]
df.describe(include=['O'])
OP:
OP:
# NOTE(review): `cor_df` is not defined in the visible part of this section —
# confirm it is created by an earlier cell.
correlation_matrix = cor_df.corr()
# Summary statistics of the `tobacco` column within each `chd` group.
chd_stats = df.groupby('chd')['tobacco'].describe()
print(chd_stats)
# Scatter plot of sbp against age with a fitted regression line.
sn.regplot(x="age", y="sbp", data=df)
plt.show()
OP:
# Five strictly increasing edges define the four labelled age bands.
# NOTE(review): the original edge list was [0, 15, 15, 35, inf]; pd.cut
# raises on duplicate edges, so the repeated 15 is assumed to be a typo for
# 25 — confirm the intended boundary.
bins = [0, 15, 25, 35, float('inf')]
labels = ['young', 'adult', 'mid', 'old']
# right=False makes each band closed on the left: [0, 15), [15, 25), ...
df['agegroup'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)
print(df)
OP:
3.FEATURE ENGINEERING
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('iris.csv')
df.head()
OP:
df.info()
df.tail()
df.describe()
OP:
df.isnull().sum()
OP:
df.loc[df.isnull().any(axis=1)]
OP:
df = df.dropna()
df.duplicated().sum()
df.loc[df.duplicated()]
df['sepal_length'] = df['sepal_length'].astype(int)
OP:
z_scores
df[(z_scores > 3)]
OP:
df.to_csv("C:\\Users\\STUDENT\\Downloads\\22128018\\iris_data.csv", index=False)
OP:
4.MEASURE OF FEATURE RELEVANCE & REDUNDANCY
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
OP:
# Task 4: Impute 'None' for missing values in Past Medical History Code
# Assign the filled column back instead of calling fillna(inplace=True) on
# the column selection: the in-place form acts on an intermediate object, is
# deprecated in recent pandas, and does not update `df` under copy-on-write.
df['Past Medical History Code'] = df['Past Medical History Code'].fillna('None')
OP:
# Task 5: Select features for model building
# Column names used as model inputs. Each name is a complete string literal
# on one line (the original had 'BP High' split across a line break).
selected_features = [
    'Age', 'Gender', 'Marital Status', 'Key Complaints Code', 'HR Pulse',
    'BP High', 'BP Low', 'RR',
    'Past Medical History Code', 'HB', 'Urea', 'Creatinine', 'Mode of Arrival',
    'Ambulance',
    'Walked Instate at the Time of Arrival', 'Type of Admission', 'Elective', 'Emergency',
    'BMI', 'Cost of Implant',
]
OP:
OP:
OP:
OP:
# Task 14: Predict and evaluate RMSE
X_test_sm = sm.add_constant(X_test[significant_features])
y_pred = model_significant.predict(X_test_sm)
OP:
5.SUPERVISED LEARNING TECHNIQUE - CLASSIFICATION
A)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
dataset = pd.read_csv("loan_data_set.csv")
dataset.head()
OP:
# Separate the target column from the remaining columns.
y = dataset['Loan_Status']
X = dataset.drop(columns=['Loan_Status'])

from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing, stratified on the target, with a
# fixed random_state so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=38,
    stratify=y,
)

from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
OP:
OP:
OP:
OP:
5-B
import pandas as pd
# Import Decision Tree Classifier and plot_tree function
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split  # Import train_test_split function
from sklearn import metrics

col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
# Load dataset
# skiprows=1 skips the first line of the file and header=None/names= apply
# the column labels listed in col_names instead.
pima = pd.read_csv("diabetes.csv", header=None, names=col_names, skiprows=1)
OP:
5-C
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv("titanic.csv")
df.head()
OP:
from sklearn.preprocessing import LabelEncoder

# Drop the Cabin, PassengerId, Name and Ticket columns, then replace every
# remaining missing value with 0.
df.drop(columns=['Cabin', 'PassengerId', 'Name', 'Ticket'], inplace=True)
df = df.fillna(0)

# Label-encode the 'Sex' and 'Embarked' columns in turn with one encoder;
# values are cast to str first so each column is encoded as text.
le = LabelEncoder()
for column in ('Sex', 'Embarked'):
    df[column] = le.fit_transform(df[column].astype(str))
OP:
OP:
from sklearn.metrics import classification_report
rand_score=clf.score(X_test, y_test)
classification_report_rf=classification_report(y_test,Pred)
print("Accuracy score:",rand_score)
OP:
5-D
class Dog:
    """Minimal example class with two class attributes and one method."""

    # Class attributes, shared by every instance.
    attr1 = "mammal"
    attr2 = "dog"

    def fun(self):
        """Print both attributes of this object."""
        print("I'm a", self.attr1)
        print("I'm a", self.attr2)


# Object instantiation
Rodger = Dog()
OP:
6.SUPERVISED LEARNING METHOD - REGRESSION
A)
import pandas as pd
import numpy as np
import matplotlib.pyplot as mtplt
# Run the program only when this file is executed as a script.
# NOTE(review): main() is not defined in the visible part of this section —
# confirm it is defined above this guard.
if __name__ == "__main__":
    main()
OP:
6-B
# here, we will define the method call for showing the plot
# NOTE(review): the alias `mtpplt` is not imported anywhere in the visible
# source (section 6-A imports matplotlib.pyplot as `mtplt`) — confirm it
# matches this section's own import.
mtpplt.show()
OP:
6-C
# Standardize features
# The scaler is fitted on the training split only and the same transformation
# is then applied to the test split.
# NOTE(review): StandardScaler, model, roc_curve, auc, plt and accuracy are
# not defined in the visible part of this section — confirm they come from
# earlier cells.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Column 1 of predict_proba is used as the score for the ROC curve
# (presumably the positive-class probability — confirm against model.classes_).
y_prob = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
# Plot the ROC curve together with a dashed diagonal labelled 'Random'.
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2,label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# The title reports `accuracy` scaled to a percentage.
plt.title('Receiver Operating Characteristic (ROC) Curve\nAccuracy: {:.2f}%'.format(accuracy *
100))
plt.legend(loc="lower right")
plt.show()
OP:
7.UNSUPERVISED LEARNING METHODS
A)
import pandas as pd
df = pd.read_csv("Mall_Customers.csv")
print(df.head())
#define the inputs we will use for our K-means clustering algorithm
X = df[['Age', 'Spending Score (1-100)']].copy()
wcss = []
print(wcss)
OP:
7-B
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
raw_data= pd.read_excel(r"C:\Users\STUDENT\Downloads\OnlineRetail.xlsx")
print(raw_data)
def prepare_retail(dataframe):
    """Return a cleaned copy of a retail transactions frame.

    A row is removed when it contains any missing value, when its invoice
    number contains the letter "C", or when its Quantity or UnitPrice is not
    strictly positive.

    Args:
        dataframe: DataFrame with at least the columns "InvoiceNo",
            "Quantity" and "UnitPrice".

    Returns:
        A new DataFrame holding only the rows that pass every filter. The
        input frame is left unmodified.
    """
    # preparing dataset
    # dropna() without inplace=True so the caller's frame is not mutated.
    cleaned = dataframe.dropna()
    # Cast to str so the .str accessor also works when every invoice number
    # was parsed as a number.
    invoice_numbers = cleaned["InvoiceNo"].astype(str)
    cleaned = cleaned[~invoice_numbers.str.contains("C", na=False)]
    cleaned = cleaned[cleaned["Quantity"] > 0]
    cleaned = cleaned[cleaned["UnitPrice"] > 0]
    return cleaned
df = prepare_retail(raw_data)
germany_apriori_df = create_apriori_datastructure(germany_df,True)
germany_apriori_df.head()
get_item_name(germany_df,10125)
OP:
get_item_name(germany_df, [TARGET_PRODUCT_ID_1,TARGET_PRODUCT_ID_2,
TARGET_PRODUCT_ID_3])
OP:
8.TRAINING ARTIFICIAL NEURAL NETWORK
import numpy as np
import pandas as pd
import tensorflow as tf
tf.__version__
dataset = pd.read_csv(r"C:\Users\STUDENT\Downloads\Churn_Modelling.csv")
X = dataset.iloc[:, 3:-1].values
y = dataset.iloc[:, -1].values
print(X)
print(y)
# Build the network layer by layer with the Keras Sequential API.
ann = tf.keras.models.Sequential()
# Two fully connected layers of 6 units each with ReLU activation.
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))
# Output layer: a single unit with sigmoid activation.
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
# Adam optimizer with binary cross-entropy loss; accuracy is tracked as a metric.
ann.compile(optimizer = 'adam', loss='binary_crossentropy', metrics = ['accuracy'])
OP:
y_pred = ann.predict(X_test)
y_pred = (y_pred > 0.5)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))
OP:
9.INDEPENDENT COMPONENT ANALYSIS
mix_1_wave.getparams()
OP:
_wave_params(nchannels=1, sampwidth=2, framerate=44100,
nframes=264515, comptype='NONE', compname='not compressed')
('length: ', 264515, 'first 100 elements: ', array([  879,  1268,  1460,  1756,  1943,  2216,  2407,  2668,  2866,
        3106,  3308,  3546,  3752,  3981,  4175,  4395,  4588,  4790,
        4966,  5146,  5292,  5436,  5550,  5643,  5717,  5759,  5790,
        5798,  5789,  5756,  5713,  5649,  5576,  5478,  5381,  5267,
        5146,  4999,  4856,  4682,  4502,  4308,  4097,  3875,  3637,
        3380,  3107,  2825,  2514,  2194,  1847,  1472,  1087,   671,
         227,  -219,  -691, -1176, -1666, -2167, -2669, -3179, -3668,
       -4170, -4643, -5116, -5559, -5985, -6380, -6765, -7105, -7422,
       -7706, -7955, -8163, -8339, -8470, -8557, -8600, -8618, -8585,
       -8524, -8425, -8298, -8129, -7947, -7720, -7475, -7205, -6916,
       -6606, -6266, -5922, -5556, -5165, -4774, -4353, -3922, -3476,
       -3021], dtype=int16))
plt.figure(figsize=(12,2))
plt.title('Recording 2')
plt.plot(timing,signal_2, c="#3ABFE7")
plt.ylim(-35000, 35000)
plt.show()
plt.figure(figsize=(12,2))
plt.title('Recording 3')
plt.plot(timing,signal_3, c="#3ABFE7")
plt.ylim(-35000, 35000)
plt.show()
OP:
OP:
OP:
OP:
[(879, 879, 879),
(1268, 1268, 1268),
(1460, 1460, 1460),
(1756, 1756, 1756),
(1943, 1943, 1943),
(2216, 2216, 2216),
(2407, 2407, 2407),
(2668, 2668, 2668),
(2866, 2866, 2866),
(3106, 3106, 3106)]
ica_result.shape
OP:
(264515, 3)
result_signal_1 = ica_result[:,0]
result_signal_2 = ica_result[:,1]
result_signal_3 = ica_result[:,2]
# Independent Component #1
plt.figure(figsize=(12,2))
plt.title('Independent Component #1')
plt.plot(result_signal_1, c="#df8efd")
plt.ylim(-0.010, 0.010)
plt.show()
OP:
# Independent Component #2
plt.figure(figsize=(12,2))
plt.title('Independent Component #2')
plt.plot(result_signal_2, c="#87de72")
plt.ylim(-0.010, 0.010)
plt.show()
OP:
# Independent Component #3
plt.figure(figsize=(12,2))
plt.title('Independent Component #3')
plt.plot(result_signal_3, c="#f65e97")
plt.ylim(-0.010, 0.010)
plt.show()
OP:
IPython.display.Audio("result_signal_1.wav")
OP:
IPython.display.Audio("result_signal_3.wav")
OP:
IPython.display.Audio("result_signal_2.wav")
10.ENSEMBLE LEARNING ALGORITHM
A)
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
data = datasets.load_wine(as_frame = True)
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 22)
estimator_range = [2,4,6,8,10,12,14,16,18,20]
models = []
scores = []
for n_estimators in estimator_range:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
OP: