Python实战：KNN、逻辑回归与k-means算法详解-CSDN博客

本文链接：https://blog.csdn.net/and52696686/article/details/108869351

常见算法--实例解析

一、KNN邻近算法
二、KNN回归算法
三、线性回归-梯度下降
四、逻辑回归
五、k-means算法

一、KNN邻近算法

import numpy as np
import pandas as pd

# Notebook cells: load the iris CSV, encode the species label as an integer,
# drop the Id column and remove duplicate rows. Bare expressions such as
# `data` only produce output when run as the last line of a notebook cell.
data=pd.read_csv("iris.csv")
data

# Read the CSV again, stating explicitly that row 0 is the header (this is the
# default); pass header=None when the file has no header row.
data=pd.read_csv("iris.csv",header=0)
data

# head() shows the first 5 rows by default; here 10 are requested.
data.head(10)
# tail() shows the last 5 rows by default.
#data.tail(10)

# sample() returns 1 random row by default; here 10 are requested.
data.sample(10)

# Convert the "Species" column from string labels to integer codes.
# NOTE(review): assumes the CSV spells the species exactly as the keys below;
# labels missing from the mapping would become NaN -- confirm against the file.
data["Species"] = data["Species"].map({"virginica":0,"setosa":1,"versicolor":2})
data.sample(10)

# Drop the Id column; inplace=True modifies `data` itself.
data.drop("Id",axis=1,inplace=True)
data

# Per-row duplicate flag: False means not a duplicate, True means duplicate.
data.duplicated()

# Collapse the flags into one result: True if any duplicate row exists.
data.duplicated().any()

# Remove duplicate rows in place and show how many rows remain.
data.drop_duplicates(inplace=True)
len(data)

# Count how many samples each species has.
data["Species"].value_counts()

class KNN:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote)."""

    def __init__(self,k):
        """
        Store the number of neighbours.

        k: int
            How many nearest training samples take part in the vote.
        """
        self.k = k

    def fit(self,X,y):
        """
        Memorise the training data (KNN has no real training step).

        X: array-like of shape [n_samples, n_features]
            Training features, e.g. 5.1, 3.5, 1.4, 0.2.
        y: array-like of shape [n_samples]
            Label of each training sample. Any label type that numpy can sort
            (integers, floats, strings) is accepted.
        """
        self.X=np.asarray(X)
        self.y=np.asarray(y)

    def predict(self,V):
        """
        Predict a label for every row of V.

        V: array-like of shape [n_samples, n_features]
            Samples to classify.

        Returns a numpy array with one predicted label per row of V.
        When several labels tie for the most votes, the smallest label wins.
        """
        V = np.asarray(V)
        result = []

        for v in V:
            # Euclidean distance from this sample to every training sample.
            dis = np.sqrt(np.sum((v - self.X)**2,axis=1))
            # Indices of the k closest training samples.
            index = dis.argsort()[:self.k]
            # Majority vote. np.unique works for any sortable label type,
            # whereas np.bincount only accepts non-negative integers.
            # Unique labels come back sorted, so argmax keeps the
            # "smallest label wins" tie-break that bincount had.
            labels, counts = np.unique(self.y[index], return_counts=True)
            result.append(labels[counts.argmax()])

        return np.asarray(result)

# Notebook cells: split each species into train/test parts, run the KNN
# classifier and measure its accuracy.
#data["Species"].value_count()
t0 = data[data["Species"] == 0]
t1 = data[data["Species"] == 1]
t2 = data[data["Species"] == 2]
#len(t0)
#len(t1)
#len(t2)
# Shuffle each class; the fixed random_state makes the shuffle reproducible.
t0 = t0.sample(len(t0),random_state=0)
t1 = t1.sample(len(t1),random_state=0)
t2 = t2.sample(len(t2),random_state=0)


# First 40 rows of each class, every column except the last (the label).
#t0.iloc[:40,:-1]
#t1.iloc[:40,:-1]
#t2.iloc[:40,:-1]

# Training set: first 40 rows per class; features are all columns but the
# last, the label is the last column.
train_X = pd.concat([t0.iloc[:40,:-1],t1.iloc[:40,:-1],t2.iloc[:40,:-1]],axis=0)
train_y = pd.concat([t0.iloc[:40,-1],t1.iloc[:40,-1],t2.iloc[:40,-1]],axis=0)

# Test set: the remaining rows of each class.
test_X = pd.concat([t0.iloc[40:,:-1],t1.iloc[40:,:-1],t2.iloc[40:,:-1]],axis=0)
test_y = pd.concat([t0.iloc[40:,-1],t1.iloc[40:,-1],t2.iloc[40:,-1]],axis=0)

# Build the KNN classifier, fit it and predict the test set.
knn = KNN(k=3)
knn.fit(train_X,train_y)
result = knn.predict(test_X)
#display(result)

# Number of correct predictions.
np.sum(result == test_y)

# Accuracy: correct predictions divided by the test-set size.
np.sum(result == test_y)/len(test_y)

import matplotlib as mpl
import matplotlib.pyplot as plt

# matplotlib's default font cannot render Chinese text, so select a Chinese font.
mpl.rcParams["font.family"] = "SimHei"
# Keep the minus sign "-" rendering correctly with the Chinese font active.
mpl.rcParams["axes.unicode_minus"] = False

# Figure 1: training samples only (x: sepal length, y: petal length).
plt.figure(figsize=(9,9))
plt.scatter(x=t0["SepalLengthCm"][:40], y=t0["PetalLengthCm"][:40], color="r", label="virginica")
plt.scatter(x=t1["SepalLengthCm"][:40], y=t1["PetalLengthCm"][:40], color="g", label="setosa")
plt.scatter(x=t2["SepalLengthCm"][:40], y=t2["PetalLengthCm"][:40], color="b", label="versicolor")

# Split the test samples by whether the KNN prediction (result) matches the
# true label (test_y).
right = test_X[test_y ==result]
wrong = test_X[test_y !=result]
# Figure 2: correctly classified test samples only.
plt.figure(figsize=(9,9))
plt.scatter(x=right["SepalLengthCm"], y=right["PetalLengthCm"], color="c", label="right",marker="x")

# Figure 3: training samples plus right/wrong test predictions.
plt.figure(figsize=(9,9))
plt.scatter(x=t0["SepalLengthCm"][:40], y=t0["PetalLengthCm"][:40], color="r", label="virginica")
plt.scatter(x=t1["SepalLengthCm"][:40], y=t1["PetalLengthCm"][:40], color="g", label="setosa")
plt.scatter(x=t2["SepalLengthCm"][:40], y=t2["PetalLengthCm"][:40], color="b", label="versicolor")
plt.scatter(x=right["SepalLengthCm"], y=right["PetalLengthCm"], color="c", label="right",marker="x")
plt.scatter(x=wrong["SepalLengthCm"], y=wrong["PetalLengthCm"], color="m", label="wrong",marker=">")
plt.xlabel('花萼')
plt.ylabel('花瓣')
plt.title('KNN分类算法显示')
# The legend must be built after the labelled artists exist; calling it on an
# empty figure produces an empty legend.
plt.legend(loc="best")
plt.show()

二、KNN回归算法

import numpy as np
import pandas as pd

# Notebook cells: reload iris and keep only the four numeric measurements.
data = pd.read_csv("iris.csv")
len(data)
# Drop the row id and the species label; neither is used for regression.
data.drop(["Id","Species"],axis=1,inplace=True)
# Remove duplicate rows in place, then show the remaining row count.
data.drop_duplicates(inplace=True)
len(data)

class KNN:
    """
    k-nearest-neighbours regressor with inverse-distance weighting.

    In this notebook it is trained on iris measurements: three of the four
    features are the inputs and the remaining one is the value to predict.
    """
    def __init__(self,k):
        """
        Store the number of neighbours.

        k: int
            How many nearest training samples contribute to a prediction.
        """
        self.k = k

    def fit(self,X,y):
        """
        Memorise the training data.

        X: array-like of shape [n_samples, n_features]
        y: array-like of shape [n_samples], the numeric target of each sample
        """
        self.X = np.asarray(X)
        self.y = np.asarray(y)

    def predict(self,V):
        """
        Predict a value for every row of V as the inverse-distance weighted
        average of the targets of its k nearest training samples.

        V: array-like of shape [n_samples, n_features]

        Returns a numpy array with one prediction per row of V.
        """
        V = np.asarray(V)
        result = []

        for v in V:
            # Euclidean distance from this sample to every training sample.
            dis = np.sqrt(np.sum((v - self.X)**2,axis=1))

            # Indices of the k closest training samples.
            index = dis.argsort()[:self.k]

            # Inverse distances; the small constant avoids division by zero
            # when a query coincides with a training sample.
            inverse = 1/(dis[index]+0.0001)
            # Normalise so the k weights sum to 1.
            weight = inverse/np.sum(inverse)

            # Weighted average of the neighbours' targets.
            result.append(np.sum(self.y[index] * weight))
        return np.asarray(result)

# Notebook cells: shuffle the data, split it, and run the KNN regressor.
# Shuffle all rows; the fixed random_state makes the shuffle reproducible.
X = data.sample(len(data),random_state=0)
len(X)
# First 120 rows train, the rest test; the last column is the target and the
# other columns are the features.
train_X = X.iloc[:120,:-1]
train_y = X.iloc[:120,-1]
test_X = X.iloc[120:,:-1]
test_y = X.iloc[120:,-1]

knn = KNN(k = 3)
knn.fit(train_X,train_y)
result = knn.predict(test_X)
# NOTE(review): `display` is not imported in this file; presumably the
# IPython/Jupyter built-in -- confirm before running as a plain script.
display(result)
display(test_y.values)

import matplotlib as mpl
import matplotlib.pyplot as plt

# matplotlib's default font cannot render Chinese text, so select a Chinese font.
mpl.rcParams["font.family"] = "SimHei"
# Keep the minus sign "-" rendering correctly with the Chinese font active.
mpl.rcParams["axes.unicode_minus"] = False

# Plot predicted values (red, solid) against true values (green, dashed),
# indexed by position in the test set.
plt.figure(figsize=(10,10))
plt.plot(result,"ro-",label="预测值")
plt.plot(test_y.values,"go--",label="真实值")
plt.title("KNN回归算法预测展示")
plt.xlabel("序号")
plt.ylabel("度量值")
plt.legend()
plt.show()

三、线性回归-梯度下降

import numpy as np
import pandas as pd

# Load the housing data set and print its column/dtype summary.
data = pd.read_csv("boston.csv")
data.info()

class LinearRegression:
    """Linear regression fitted with batch gradient descent."""

    def __init__(self,alpha,times):
        """
        alpha: step size (learning rate) of each update
        times: number of gradient-descent iterations
        """
        self.alpha = alpha
        self.times = times

    def fit(self,X,y):
        """
        Learn the weights from the training data.

        After fitting, w_[0] holds the intercept and w_[1:] one weight per
        feature; loss_ holds the loss recorded at the start of each iteration.
        """
        features = np.asarray(X)
        targets = np.asarray(y)
        # One weight per feature plus the intercept, all starting at zero.
        self.w_ = np.zeros(features.shape[1]+1)
        self.loss_ = []
        for _ in range(self.times):
            # Model: y = w0 + w1*x1 + ... + wn*xn
            estimate = features.dot(self.w_[1:])+self.w_[0]
            residual = targets - estimate
            # Loss: half the sum of squared residuals.
            self.loss_.append(np.sum(residual**2)/2)
            # Step against the gradient of the loss.
            self.w_[0] += self.alpha*residual.sum()
            self.w_[1:] += self.alpha*features.T.dot(residual)

    def predict(self,X):
        """Return the model's prediction for every row of X."""
        features = np.asarray(X)
        return self.w_[0] + features.dot(self.w_[1:])

# Notebook cells: fit the gradient-descent linear regression and report the
# test error.
# Step size 0.0005, 20 iterations.
lr = LinearRegression(0.0005,20)
data.head()

# Shuffle all rows; the fixed random_state makes the shuffle reproducible.
t = data.sample(len(data),random_state=0)
len(t)

# First 400 rows train, the rest test; the last column is the target.
# NOTE(review): the features are passed in unscaled although a scaler class
# is defined later in this file -- with raw feature magnitudes the updates
# may diverge; confirm whether standardisation was intended here.
train_X = t.iloc[:400,:-1]
train_y = t.iloc[:400,-1]
test_X = t.iloc[400:,:-1]
test_y = t.iloc[400:,-1]

lr.fit(train_X,train_y)
result = lr.predict(test_X)
# Mean squared error on the test set.
np.mean((result - test_y)**2)

# Learned weights (intercept first).
display(lr.w_)

# Loss recorded at each iteration.
display(lr.loss_)

# Standardisation helper: rescales every column of a data set to zero mean and
# unit standard deviation.
class StandardScala:
    """Column-wise standardiser: (X - mean) / std."""

    def fit(self,X):
        """Compute and store the per-column mean (mean_) and std (std_) of X."""
        X = np.asarray(X)
        self.mean_ = np.mean(X,axis=0)
        self.std_ = np.std(X,axis=0)

    def transform(self,X):
        """
        Standardise X with the statistics stored by fit().

        Constant columns have a standard deviation of 0; they are divided by 1
        instead, so they map to 0 rather than NaN/inf.
        """
        scale = np.where(self.std_ == 0, 1.0, self.std_)
        return (X - self.mean_) / scale

    def fit_transform(self,X):
        """Fit on X and return the standardised X."""
        self.fit(X)
        return self.transform(X)

import matplotlib as mpl
import matplotlib.pyplot as plt

# matplotlib's default font cannot render Chinese text, so select a Chinese font.
mpl.rcParams["font.family"] = "SimHei"
# Keep the minus sign "-" rendering correctly with the Chinese font active.
mpl.rcParams["axes.unicode_minus"] = False

# Predicted values (red, solid) against true values (green, dashed).
plt.figure(figsize=(10,10))
plt.plot(result,"ro-",label="预测值")
plt.plot(test_y.values,"go--",label="真实值")
plt.xlabel("样本序号")
plt.ylabel("房价")
# Title typo fixed: "现行回归" -> "线性回归" (linear regression).
plt.title("线性回归，梯度下降法")
plt.legend()
plt.show()

# Loss recorded at each iteration (1..times).
plt.plot(range(1,lr.times+1),lr.loss_,"ro-")
plt.show()

四、逻辑回归

import numpy as np
import pandas as pd

# Notebook cells: reload iris and reduce it to a two-class problem.
data = pd.read_csv("iris.csv")
data.drop("Id",axis=1,inplace=True)
# Show the distinct species labels (the result is not assigned).
data["Species"].drop_duplicates()
# setosa versicolor virginica
len(data)
# Encode the species as integers, then keep only classes 0 and 1 so the
# problem is binary.
data["Species"] = data["Species"].map({"versicolor":0,"setosa":1,"virginica":2})
data = data[data["Species"]!=2]
len(data)

#data

class LogisticRegression:
    """Binary logistic regression fitted with batch gradient ascent."""

    def __init__(self, alpha, times):
        """
        alpha: step size (learning rate) of each update
        times: number of iterations
        """
        self.alpha = alpha
        self.times = times

    def sigmoid(self, z):
        """
        Logistic function g(z) = 1 / (1 + e**(-z)).

        z ranges over all reals and g(z) lies between 0 and 1.
        """
        # For very negative z, exp(-z) overflows to inf and the quotient
        # becomes 0.0, which is the correct limit; only the warning is
        # suppressed, the returned values are unchanged.
        with np.errstate(over="ignore"):
            return 1.0 / (1 + np.exp(-z))

    def fit(self, X, y):
        """
        Learn the weights from features X and 0/1 labels y.

        After fitting, w_[0] holds the intercept and w_[1:] one weight per
        feature; loss_ holds the loss recorded at the start of each iteration.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        # One weight per feature plus the intercept, all starting at zero.
        self.w_ = np.zeros( X.shape[1] + 1 )
        self.loss_ = []
        # Smallest margin kept between a probability and 0 or 1 in the loss.
        eps = np.finfo(float).eps

        for _ in range(self.times):
            z = np.dot(X, self.w_[1:]) + self.w_[0]
            # Predicted probability of class 1 for every sample.
            p = self.sigmoid(z)

            # Loss: J(w) = -sum( y*log(p) + (1-y)*log(1-p) )
            # Once the sigmoid saturates to exactly 0 or 1, log(0) would make
            # the loss NaN/inf, so the probabilities are clipped for the loss
            # only; the gradient below still uses the unclipped p.
            p_safe = np.clip(p, eps, 1 - eps)
            loss = -np.sum(y * np.log(p_safe) + (1-y) * np.log(1-p_safe))
            self.loss_.append(loss)

            self.w_[0] +=  self.alpha * np.sum(y-p)
            self.w_[1:] +=  self.alpha * np.dot(X.T, y-p)

    def predic_proba(self, X):
        """
        Return class probabilities as an array of shape (n_samples, 2):
        column 0 is the probability of class 0, column 1 of class 1.
        """
        X = np.asarray(X)
        z = np.dot(X, self.w_[1:]) + self.w_[0]
        p = self.sigmoid(z)
        # Turn p into a column so the two probabilities sit side by side.
        p = p.reshape(-1,1)
        return np.concatenate([1 - p, p], axis=1)

    # Correctly spelled alias; the original misspelled name stays for callers.
    predict_proba = predic_proba

    def predict(self, X):
        """Return the more probable class (0 or 1) for every row of X."""
        return np.argmax(self.predic_proba(X), axis=1)

# Notebook cells: split the two classes into train/test parts, fit the
# logistic regression and measure its accuracy.
t1 = data[data["Species"]==0]
t2 = data[data["Species"]==1]
len(t1)

# Shuffle each class; the fixed random_state makes the shuffle reproducible.
t1 = t1.sample(len(t1), random_state=0 )
t2 = t2.sample(len(t2), random_state=0 )


# Training features: first 40 rows per class, all columns but the last.
traint1 = t1.iloc[:40, :-1]
traint2 = t2.iloc[:40, :-1]
traint_X = pd.concat([traint1, traint2], axis=0)

# Training labels: the last column of the same rows.
trainy1 = t1.iloc[:40, -1]
trainy2 = t2.iloc[:40, -1]
traint_y = pd.concat([trainy1, trainy2], axis=0)

# Test set: the remaining rows of each class.
test_X = pd.concat([t1.iloc[40:, :-1], t2.iloc[40:, :-1]], axis=0)
test_y = pd.concat([t1.iloc[40:, -1], t2.iloc[40:, -1]], axis=0)

# Step size 0.01, 20 iterations.
lr = LogisticRegression(0.01, 20)
lr.fit(traint_X, traint_y)

result1 = lr.predic_proba(test_X)   # probabilities of class 0 and class 1
result2 = lr.predict(test_X)     # predicted class, 0 or 1
display(result1)
display(result2)

# Accuracy: correct predictions divided by the test-set size.
np.sum(result2==test_y)/ len(test_y)

import matplotlib as mpl
import matplotlib.pyplot as plt

# matplotlib's default font cannot render Chinese text, so select a Chinese font.
mpl.rcParams["font.family"] = "SimHei"
# Keep the minus sign "-" rendering correctly with the Chinese font active.
mpl.rcParams["axes.unicode_minus"] = False

#plt.figure(figsize=(10,10))
# Predicted classes as large red dots.
plt.plot(result2,"ro",ms = 15,label="预测值")
# True classes as green dots drawn on top.
plt.plot(test_y.values,"go" ,label="真实值")

# The title previously said "linear regression / least squares", copied from
# another section; this figure shows the logistic-regression predictions.
plt.title("逻辑回归")
plt.xlabel("序号")
plt.ylabel("类型")
plt.legend()
# `plt.show` without parentheses only references the function; call it.
plt.show()

# Loss recorded at each iteration (1..times).
plt.plot(range(1, lr.times+1), lr.loss_, "go-")
plt.show()

五、k-means算法

import numpy as np
import pandas as pd

# Load the watermelon data set and keep only its last two columns as the
# features to cluster.
data = pd.read_csv("xigua.csv")
#data
t = data.iloc[:,-2:]
t

class KMeans:
    """k-means clustering with Euclidean distance."""

    def __init__(self, k, times):
        """
        k : int
            Number of centroids (clusters).
        times : int
            Maximum number of clustering iterations.
        """
        self.k = k
        self.times = times

    def _nearest(self, X):
        """Return, as floats, the index of the closest centroid for each row of X."""
        diff = X[:, np.newaxis, :] - self.cluster_[np.newaxis, :, :]
        return np.sqrt(np.sum(diff**2, axis=2)).argmin(axis=1).astype(float)

    def fit(self, X1):
        """
        Find k centroids for the training data.

        X1 : array-like of shape [n_samples, n_features]
            Training data.

        Sets cluster_ (the k centroids) and label_ (the cluster index of each
        training sample, stored as floats).

        Raises ValueError when k is not between 1 and the number of samples.
        """
        # Work in floats: with integer input the centroid array would be
        # integer too and every mean written into it would be truncated.
        X = np.asarray(X1, dtype=float)
        if not 0 < self.k <= len(X):
            raise ValueError("k must be between 1 and the number of samples")

        # A private, fixed-seed generator keeps the result reproducible
        # without reseeding numpy's global generator. Sampling without
        # replacement guarantees k different starting rows.
        rng = np.random.RandomState(1)
        self.cluster_ = X[rng.choice(len(X), self.k, replace=False)]
        self.label_ = np.zeros(len(X))

        previous = None
        for _ in range(self.times):
            labels = self._nearest(X)
            # Unchanged labels mean the centroids would not move either, so
            # the remaining iterations could not alter the result.
            if previous is not None and np.array_equal(labels, previous):
                break
            self.label_ = labels

            for i in range(self.k):
                members = X[labels == i]
                # Keep the old centroid when a cluster lost all its samples;
                # the mean of an empty set would be NaN.
                if len(members):
                    self.cluster_[i] = members.mean(axis=0)
            previous = labels

    def predict(self, X):
        """Return, as floats, the index of the nearest centroid for each row of X."""
        X = np.asarray(X, dtype=float)
        if len(X) == 0:
            return np.zeros(0)
        return self._nearest(X)

# Notebook cells: cluster the data into 3 groups (at most 350 iterations),
# inspect the result, then walk through one k-means step by hand.
kmeans = KMeans(3,350)
kmeans.fit(t)

# Inspect the centroids.
display(kmeans.cluster_)

display(kmeans.label_)
display(t[kmeans.label_==0])  # samples assigned to cluster 0
display(t[kmeans.label_==1])  # samples assigned to cluster 1
display(t[kmeans.label_==2])  # samples assigned to cluster 2

# X = np.asarray(X)
# np.random.seed(123)
# X
# X[:,0]
# Draw 2 random rows from the data (reproducible via random_state).
numarray = t.sample(2, random_state=0)
numarray

# Prediction: build three hand-written samples and see which cluster each
# one is assigned to.
test_X = np.asarray([[0.1, 0.2],[0.5,0.7],[1.8,0.7]])
result = kmeans.predict(test_X)
display(result)

# Step-by-step illustration of one k-means iteration on a tiny array.
X = np.asarray([[1,2,3],[4,5,6],[7,8,9],[17,18,19],[14,15,16]])
display(X)
# Pick 2 random row indices as the initial centroids.
np.random.seed(0)
numarray = np.random.randint(0,len(X),2)
display(numarray)
cluster = X[numarray]
display(cluster)

# label_ records which cluster each sample of X belongs to.
label_ = np.zeros(len(X))
display(label_)

for index,x in enumerate(X):
    dis = np.sqrt(np.sum((x-cluster)**2,axis=1))  # distance from x to each centroid, e.g. [22.516605, 0] (example from the original notes)
    display(dis)
    label_[index] = dis.argmin()  # index of the nearest centroid
    
display(label_)

# Split the samples by assigned cluster and recompute each centroid as the
# mean of its members.
x1 = X[label_==0]
display(x1)
x2 = X[label_ ==1]
display(x2)
cluster1 = np.mean(x1,axis=0)
cluster2 = np.mean(x2,axis=0)
display(cluster1)
display(cluster2)

import matplotlib as mpl
import matplotlib.pyplot as plt

# matplotlib's default font cannot render Chinese text, so select a Chinese font.
mpl.rcParams["font.family"] = "SimHei"
# Keep the minus sign "-" rendering correctly with the Chinese font active.
mpl.rcParams["axes.unicode_minus"] = False

# Scatter the samples of each cluster (first column on x, second on y) and
# mark the centroids with "+".
plt.figure(figsize=(10,10))
plt.scatter(t[kmeans.label_==0].iloc[:,0], t[kmeans.label_==0].iloc[:,1], label = "类别1")
plt.scatter(t[kmeans.label_==1].iloc[:,0], t[kmeans.label_==1].iloc[:,1], label = "类别2")
plt.scatter(t[kmeans.label_==2].iloc[:,0], t[kmeans.label_==2].iloc[:,1], label = "类别3")
# plt.scatter(t.iloc[:,0], t.iloc[:,1], c=kmeans.label_)
plt.scatter(kmeans.cluster_[:,0], kmeans.cluster_[:,1], marker="+", s=100)
plt.title("kmeans 示例图")
plt.xlabel("重量")
plt.ylabel("甜度")
plt.legend()
plt.show()

# Centroids only, one colour value per centroid (the list length matches the
# 3 clusters used above).
plt.scatter(kmeans.cluster_[:,0],kmeans.cluster_[:,1],c=[1,2,3])