# 机器学习pipeline总结
# 阅读原文 | 时间:2023年07月11日 | 阅读:3
# -*- coding: utf-8 -*-
"""scikit-learn introduction

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1quaJafg43SN7S6cNwKFr0_WYn2ELt4Ph

scikit-learn官方网站:https://scikit-learn.org/stable/

模块引入
"""

from sklearn import datasets
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np

"""#分类:
 - SVM(support vector machine):支持向量机
 - svm.SVC()

###iris数据集
 - iris feature: 花萼长度,花萼宽度,花瓣长度,花瓣宽度
 - iris label: 山鸢尾,杂色鸢尾,维吉尼亚鸢尾
"""

# Load the iris dataset bundled with scikit-learn and preview the first five
# feature rows together with their class labels.
iris = datasets.load_iris()
print('iris feature\n', iris.data[:5])
print('iris label\n', iris.target[:5])

"""###创建模型"""

from sklearn import svm

# Feature matrix and label vector used by the rest of the script.
irisX, irisY = iris.data, iris.target

# fit() returns the estimator itself, so construction and training can be chained.
clf = svm.SVC().fit(irisX, irisY)

# Predictions are made on the very samples the model was trained on.
irisPred = clf.predict(irisX)

# Predict a single hand-written sample (the first row shown in the preview).
# The result is not stored; in a notebook it would be displayed as the cell output.
clf.predict([[5.1, 3.5, 1.4, 0.2]])

"""###评估指标
 - accuracy
 - precision
 - recall
 - F1
"""

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# accuracy_score returns the fraction of correct predictions by default
# (normalize=True), so there is no need to request the raw count and divide
# by the number of samples by hand.
print('acc is ', accuracy_score(irisY, irisPred))

# iris has three classes, so an explicit averaging mode is required for the
# per-class metrics; 'macro' is used for all three of them.
print('precision is ', precision_score(irisY, irisPred, average='macro'))
print('recall is ', recall_score(irisY, irisPred, average='macro'))
print('F1 is ', f1_score(irisY, irisPred, average='macro'))

"""#回归
 - 线性回归
 - 模块:linear_model.LinearRegression()

###糖尿病数据集
"""

# Load the diabetes regression dataset bundled with scikit-learn.
diabetes = datasets.load_diabetes()

# Keep only the first feature column, as a 2-D (n_samples, 1) array.
# Indexing with the list [0] keeps the column axis and yields a new array,
# replacing the former per-row Python loop with a single NumPy operation.
diabetesX = diabetes.data[:, [0]]
diabetesY = diabetes.target

# Preview the first five samples and their targets.
print('feature\n',diabetesX[:5])
print('label\n',diabetesY[:5])

"""###创建模型"""

from sklearn import svm, linear_model

# Ordinary least-squares fit of the target on the single selected feature.
# fit() returns the estimator, so it can be chained onto the constructor.
regr = linear_model.LinearRegression().fit(diabetesX, diabetes.target)

# Predictions for the same samples the model was fitted on.
diabetesPred = regr.predict(diabetesX)

# Prediction for the first value of the original data.
# The result is not stored; in a notebook it would be displayed as the cell output.
regr.predict([[0.03807591]])

# Scatter plot of the raw data, overlaid with the fitted regression line.
plt.scatter(diabetesX, diabetes.target)
plt.plot(diabetesX, diabetesPred)

"""###评价指标
 - 均方误差(mse)
"""

from sklearn.metrics import mean_squared_error

# Mean squared error between the true targets and the regression predictions
# (both refer to the samples the model was fitted on).
print(
    'mean squared error is ',
    mean_squared_error(y_true=diabetesY, y_pred=diabetesPred),
)

"""#聚类
 - k-means

###创建数据集
"""

# make_blobs is part of the public sklearn.datasets namespace. The old
# sklearn.datasets.samples_generator module path was deprecated in
# scikit-learn 0.22 and removed in 0.24, so importing from it raises an
# ImportError on current releases.
from sklearn.datasets import make_blobs

# Generate 1000 two-dimensional points around four centers with a
# per-center standard deviation; random_state fixes the draw.
clusterX, clusterY = make_blobs(
    n_samples=1000,
    n_features=2,
    centers=[[-1, -1], [0, 0], [1, 1], [2, 2]],
    cluster_std=[0.4, 0.2, 0.2, 0.2],
    random_state=0,
)

# Plot the raw, unlabelled points.
plt.scatter(clusterX[:, 0], clusterX[:, 1])

"""###建立模型"""

from sklearn.cluster import KMeans

# NOTE(review): the data above is generated around four centers, while the
# model is asked for two clusters; confirm whether that is intentional.
clu = KMeans(n_clusters=2, random_state=9)

# Fit the model and obtain the cluster index assigned to every sample.
clusterPredict = clu.fit_predict(clusterX)

# clusterX.T has one row per coordinate, so unpacking it passes the x and y
# columns to scatter; points are coloured by their assigned cluster.
plt.scatter(*clusterX.T, c=clusterPredict)
plt.show()

"""#模型评估
 - cross validation 交叉验证
 - 以iris数据集为例
"""

from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import svm
import warnings

# NOTE(review): this silences every warning for the rest of the process,
# not only the ones raised by the calls below.
warnings.filterwarnings('ignore')

# Ten-fold cross-validation of a fresh SVC on the iris data, scored by accuracy.
clf = svm.SVC()
scores = cross_val_score(clf, irisX, irisY, cv=10, scoring='accuracy')
print('十折交叉验证分别的accuracy ', scores)

# cross_val_score returns one score per fold as an array; taking its mean
# avoids hard-coding the fold count a second time in the average.
print('平均的accuracy ', scores.mean())

"""- 通过设置随机种子来进行十次十折交叉验证"""

from sklearn.model_selection import StratifiedKFold,KFold

# Mean accuracy of each repetition of the ten-fold cross-validation.
accEachTime = []

# Repeat ten-fold cross-validation ten times; the loop index doubles as the
# shuffle seed, so every repetition uses a different but reproducible split.
for i in range(10):
    clf = svm.SVC()
    scores = cross_val_score(
        clf,
        irisX,
        irisY,
        cv=KFold(n_splits=10, random_state=i, shuffle=True),
        scoring='accuracy',
    )
    print(scores)
    # Average over however many folds were actually run instead of
    # dividing by a literal that must be kept in sync with n_splits.
    accEachTime.append(scores.mean())
print('每一次的accuracy值 ', accEachTime)

# Likewise, average over the repetitions actually collected.
print('十次十折交叉验证的平均accuracy值 ', np.mean(accEachTime))