hci_lab2_1
March 1, 2025

# DA LAB: DECISION TREE
## Name: Vishwajeet H. Dodyalkar
## Roll No: 221IT084
## Subject: Data Analytics (IT350)
Question 1: For the below dataset, construct a decision tree for classifying a new
test record using the information gain measure. Also, classify the below test
records using the constructed decision tree:
a. Outlook = Rainy, Temp = Cool, Humidity = High, Windy = True
b. Outlook = Sunny, Temp = Mild, Humidity = Normal, Windy = False
[18]: import pandas as pd
import numpy as np
import pprint
data=pd.DataFrame([
['Rainy','Hot','High', False,'No'],
['Rainy','Hot','High', True,'No'],
['Overcast ','Hot','High', False,'Yes'],
['Sunny','Mild','High',False,'Yes'],
['Sunny','Cool','Normal', False,'Yes'],
['Sunny','Cool','Normal', True,'No'],
['Overcast ','Cool','Normal', True,'Yes'],
['Rainy','Mild','High',False,'No'],
['Rainy','Cool','Normal', False,'Yes'],
['Sunny','Mild','Normal', False,'Yes'],
['Rainy','Mild','Normal', True,'Yes'],
['Overcast ','Mild','High', True,'Yes'],
['Overcast ','Hot','Normal',False,'Yes'],
['Sunny','Mild','High',True,'No']
], columns =['Outlook','Temp','Humidity ','Windy','Play Golf '])
def entropy(target_col):
    """Return the Shannon entropy (in bits) of a class-label column.

    A pure column (one distinct class) yields 0.0; a 50/50 split yields 1.0.
    """
    values, counts = np.unique(target_col, return_counts=True)
    probs = counts / counts.sum()
    # np.unique never reports a zero count, so log2 is always defined.
    return -np.sum(probs * np.log2(probs))
def info_gain(data, attribute, target='Play Golf'):
    """Return the information gain of splitting `data` on `attribute`.

    Gain = H(target) - sum_v P(attribute == v) * H(target | attribute == v).
    """
    total_entropy = entropy(data[target])
    values, counts = np.unique(data[attribute], return_counts=True)
    total = counts.sum()
    weighted_entropy = sum(
        (counts[i] / total) * entropy(data[data[attribute] == values[i]][target])
        for i in range(len(values))
    )
    return total_entropy - weighted_entropy
def build_tree(data, attributes, target='Play Golf'):
    """Recursively build an ID3 decision tree.

    Returns either a class label (leaf) or a nested dict of the form
    {attribute: {value: subtree, ...}}.
    """
    labels = np.unique(data[target])
    # Leaf: every remaining sample has the same class.
    if len(labels) == 1:
        return labels[0]
    # No attributes left to split on: fall back to the majority class.
    if len(attributes) == 0:
        return data[target].mode()[0]
    # Split on the attribute with the highest information gain.
    gains = {attr: info_gain(data, attr, target) for attr in attributes}
    best_attribute = max(gains, key=gains.get)
    tree = {best_attribute: {}}
    for value in np.unique(data[best_attribute]):
        subset = data[data[best_attribute] == value]
        remaining = [attr for attr in attributes if attr != best_attribute]
        tree[best_attribute][value] = build_tree(subset, remaining, target)
    return tree
def print_tree(tree, indent=''):
    """Pretty-print a decision tree; leaves are rendered as '→ <label>'.

    The indentation step was lost in the PDF extraction; it is restored
    here to match the nesting shown in the notebook's printed output.
    """
    if not isinstance(tree, dict):
        print(indent + f"→ {tree}")
        return
    attribute = next(iter(tree))
    print(indent + f"{attribute}?")
    for value, subtree in tree[attribute].items():
        print(indent + f"  {value}")
        print_tree(subtree, indent + "    ")
# Build and display the decision tree for Question 1.
attributes = ['Outlook', 'Temp', 'Humidity', 'Windy']
tree = build_tree(data, attributes)
print("Constructed Decision Tree:")
print_tree(tree)
def classify(tree, record):
    """Classify `record` (a dict of attribute -> value) with a decision tree.

    Walks the nested-dict tree following the record's attribute values.
    Returns the predicted class label, or 'Unknown' when the record carries
    an attribute value the tree never saw during training.
    """
    if not isinstance(tree, dict):
        # Reached a leaf: it is the class label itself.
        return tree
    attribute = next(iter(tree))
    value = record[attribute]
    if value not in tree[attribute]:
        return 'Unknown'
    return classify(tree[attribute][value], record)
# Question 1 test records (a) and (b).
test_records = [
    {'Outlook': 'Rainy', 'Temp': 'Cool', 'Humidity': 'High', 'Windy': True},
    {'Outlook': 'Sunny', 'Temp': 'Mild', 'Humidity': 'Normal', 'Windy': False},
]
print("\nClassifications for Test Records:")
for i, record in enumerate(test_records):
    result = classify(tree, record)
    print(f"Test Record {i+1}: {record} => Predicted: {result}")
Constructed Decision Tree:
Outlook?
Overcast
→ Yes
Rainy
Humidity?
High
→ No
Normal
→ Yes
Sunny
Windy?
False
→ Yes
True
→ No
Classifications for Test Records:
Test Record 1: {'Outlook': 'Rainy', 'Temp': 'Cool', 'Humidity': 'High', 'Windy':
True} => Predicted: No
Test Record 2: {'Outlook': 'Sunny', 'Temp': 'Mild', 'Humidity': 'Normal',
'Windy': False} => Predicted: Yes
Question 2: For the below dataset, construct a decision tree for classifying a new
test record using the information gain measure. Also, classify the below test
records using the constructed decision tree:
a. Industry = Urban, JobType = Sales, Income = Low, Previous Customer = Yes
b. Industry = Electronics, JobType = Engineering, Income = High, Previous Customer = No
# In [23]: Question 2 — ID3 decision tree on the customer/industry dataset.
import pandas as pd
import numpy as np
import pprint

# Training data: 'Class' is the label (YES/NO).
# NOTE(review): trailing spaces inside literals (e.g. 'Aerospace ') in the
# extracted text were PDF artifacts — the notebook's printed output shows
# the clean values — so they are removed here.
data = pd.DataFrame([
    ['Aerospace', 'Engineering', 'High', 'No', 'NO'],
    ['Aerospace', 'Engineering', 'High', 'Yes', 'NO'],
    ['Auto', 'Engineering', 'High', 'No', 'YES'],
    ['Electronics', 'Marketing', 'High', 'No', 'YES'],
    ['Urban', 'Marketing', 'Low', 'No', 'YES'],
    ['Urban', 'Marketing', 'Low', 'Yes', 'NO'],
    ['Auto', 'Marketing', 'Low', 'Yes', 'YES'],
    ['Aerospace', 'Sales', 'High', 'No', 'NO'],
    ['Aerospace', 'Marketing', 'Low', 'No', 'YES'],
    ['Electronics', 'Sales', 'Low', 'No', 'NO'],
    ['Aerospace', 'Sales', 'Low', 'Yes', 'YES'],
    ['Electronics', 'Sales', 'High', 'Yes', 'NO'],
    ['Auto', 'Engineering', 'Low', 'No', 'YES'],
    ['Electronics', 'Sales', 'High', 'Yes', 'NO'],
], columns=['Industry', 'JobType', 'Income', 'Previous Customer', 'Class'])
def entropy(target_col):
    """Return the Shannon entropy (in bits) of a class-label column.

    A pure column (one distinct class) yields 0.0; a 50/50 split yields 1.0.
    """
    values, counts = np.unique(target_col, return_counts=True)
    probs = counts / counts.sum()
    # np.unique never reports a zero count, so log2 is always defined.
    return -np.sum(probs * np.log2(probs))
def info_gain(data, attribute, target='Class'):
    """Return the information gain of splitting `data` on `attribute`.

    Gain = H(target) - sum_v P(attribute == v) * H(target | attribute == v).
    """
    total_entropy = entropy(data[target])
    values, counts = np.unique(data[attribute], return_counts=True)
    total = counts.sum()
    weighted_entropy = sum(
        (counts[i] / total) * entropy(data[data[attribute] == values[i]][target])
        for i in range(len(values))
    )
    return total_entropy - weighted_entropy
def build_tree(data, attributes, target='Class'):
    """Recursively build an ID3 decision tree.

    Returns either a class label (leaf) or a nested dict of the form
    {attribute: {value: subtree, ...}}.
    """
    labels = np.unique(data[target])
    # Leaf: every remaining sample has the same class.
    if len(labels) == 1:
        return labels[0]
    # No attributes left to split on: fall back to the majority class.
    if len(attributes) == 0:
        return data[target].mode()[0]
    # Split on the attribute with the highest information gain.
    gains = {attr: info_gain(data, attr, target) for attr in attributes}
    best_attribute = max(gains, key=gains.get)
    tree = {best_attribute: {}}
    for value in np.unique(data[best_attribute]):
        subset = data[data[best_attribute] == value]
        remaining = [attr for attr in attributes if attr != best_attribute]
        tree[best_attribute][value] = build_tree(subset, remaining, target)
    return tree
def visualize_tree(tree, indent='', level=0):
    """Pretty-print a decision tree; leaves are rendered as '→ <label>'.

    The indentation step was lost in the PDF extraction; it is restored
    here to match the nesting shown in the notebook's printed output.
    `level` is carried through recursion but not otherwise used.
    """
    if not isinstance(tree, dict):
        print(indent + f'→ {tree}')
        return
    for key, value in tree.items():
        print(indent + key)
        for sub_key, sub_value in value.items():
            print(indent + f'  {sub_key}')
            visualize_tree(sub_value, indent + '    ', level + 1)
# Build and display the decision tree for Question 2.
attributes = ['Industry', 'JobType', 'Income', 'Previous Customer']
tree = build_tree(data, attributes)
print("Constructed Decision Tree:")
pprint.pprint(tree)
visualize_tree(tree)
def classify(tree, record, data, target='Class'):
    """Classify `record` (a dict of attribute -> value) with a decision tree.

    Unlike the Question 1 variant, an unseen attribute value falls back to
    the majority class of `data[target]` instead of returning 'Unknown'.
    """
    if not isinstance(tree, dict):
        # Reached a leaf: it is the class label itself.
        return tree
    attribute = next(iter(tree))
    value = record.get(attribute, 'Unknown')
    if value not in tree[attribute]:
        # Unseen attribute value: predict the overall majority class.
        return data[target].mode()[0]
    return classify(tree[attribute][value], record, data, target)
# Question 2 test records (a) and (b).
test_records = [
    {'Industry': 'Urban', 'JobType': 'Sales', 'Income': 'Low',
     'Previous Customer': 'Yes'},
    {'Industry': 'Electronics', 'JobType': 'Engineering', 'Income': 'High',
     'Previous Customer': 'No'},
]
print("\nClassifications for Test Records:")
for i, record in enumerate(test_records):
    result = classify(tree, record, data)
    print(f"Test Record {i+1}: {record} => Predicted: {result}")
Constructed Decision Tree:
5
{'Industry': {'Aerospace': {'Income': {'High': 'NO', 'Low': 'YES'}},
'Auto': 'YES',
'Electronics': {'JobType': {'Marketing': 'YES', 'Sales': 'NO'}},
'Urban': {'Previous Customer': {'No': 'YES', 'Yes': 'NO'}}}}
Industry
Aerospace
Income
High
→ NO
Low
→ YES
Auto
→ YES
Electronics
JobType
Marketing
→ YES
Sales
→ NO
Urban
Previous Customer
No
→ YES
Yes
→ NO
Classifications for Test Records:
Test Record 1: {'Industry': 'Urban', 'JobType': 'Sales', 'Income': 'Low',
'Previous Customer': 'Yes'} => Predicted: NO
Test Record 2: {'Industry': 'Electronics', 'JobType': 'Engineering', 'Income':
'High', 'Previous Customer': 'No'} => Predicted: NO
6