sklearn-GBDT

GBDT (gradient boosted decision trees) can solve both classification and regression problems.

Regression

Constructor signature of GradientBoostingRegressor (older scikit-learn versions):

def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100,
             subsample=1.0, criterion='friedman_mse', min_samples_split=2,
             min_samples_leaf=1, min_weight_fraction_leaf=0.,
             max_depth=3, min_impurity_decrease=0.,
             min_impurity_split=None, init=None, random_state=None,
             max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None,
             warm_start=False, presort='auto')

Example

import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
# 200 samples for training, the remaining 1000 for testing
X_train, X_test = X[:200], X[200:]
y_train, y_test = y[:200], y[200:]
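
As a side note (a sketch, not from the original post), the estimator's staged_predict method makes it easy to watch test error evolve as trees are added; the hyperparameters below mirror the example further down and are purely illustrative:

import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

# Reuses X_train, y_train, X_test, y_test from the split above.
est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                max_depth=1, random_state=0, loss='ls')
est.fit(X_train, y_train)

# staged_predict yields predictions after 1, 2, ..., n_estimators trees.
test_mse = [mean_squared_error(y_test, y_pred)
            for y_pred in est.staged_predict(X_test)]
print("best stage:", int(np.argmin(test_mse)) + 1,
      "min test MSE:", min(test_mse))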

Loss functions

If the loss function is the absolute error, L = |y - f(x)|, the negative gradient is sign(y - f(x)), i.e. either 1 or -1; in sklearn this corresponds to loss='lad' (least absolute deviation). A quick numeric check of this gradient follows below.

If the loss function is the Huber loss, which is quadratic for small residuals and linear for large ones (the alpha parameter sets the quantile separating the two regimes), it corresponds to loss='huber' in sklearn.

If the loss function is the squared error (least squares), it corresponds to loss='ls' in sklearn.
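
A quick numeric check of the sign(y - f(x)) claim above (the arrays are made-up illustration values):

import numpy as np

y = np.array([3.0, 1.0, 2.0])
f = np.array([2.5, 1.5, 2.0])   # current model predictions
residual = y - f
# Negative gradient of L = |y - f(x)| w.r.t. f(x) is sign(y - f(x));
# note it is 0 where the residual is exactly zero.
print(np.sign(residual))        # [ 1. -1.  0.]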

est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1,
                                random_state=0, loss='huber').fit(X_train, y_train)

pred = est.predict(X_test)
error = mean_squared_error(y_test, pred)  # argument order: y_true, y_pred

print(max(y_test), min(y_test))  # (27.214332670044374, 0.8719243023544349)
print(error)

Test-set MSE for each loss (same data and settings as above):

loss='ls'     5.009154859960321
loss='lad'    5.817510629608294
loss='huber'  4.690823542377095
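
A minimal sketch to reproduce this comparison, assuming the data split from the example above (exact numbers may vary slightly across scikit-learn versions):

for loss in ('ls', 'lad', 'huber'):
    est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                    max_depth=1, random_state=0,
                                    loss=loss).fit(X_train, y_train)
    print(loss, mean_squared_error(y_test, est.predict(X_test)))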

Classification

Constructor signature of GradientBoostingClassifier (older scikit-learn versions); the default loss='deviance' is the logistic log-loss:

def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
             subsample=1.0, criterion='friedman_mse', min_samples_split=2,
             min_samples_leaf=1, min_weight_fraction_leaf=0.,
             max_depth=3, min_impurity_decrease=0.,
             min_impurity_split=None, init=None,
             random_state=None, max_features=None, verbose=0,
             max_leaf_nodes=None, warm_start=False,
             presort='auto')

Example

import numpy as np
from time import time
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import mnist  # helper module from https://github.com/haidawyl/Mnist

if __name__ == "__main__":
    # Load the MNIST dataset to test the GBDT classifier
    mnistSet = mnist.loadLecunMnistSet()
    train_X, train_Y, test_X, test_Y = mnistSet[0], mnistSet[1], mnistSet[2], mnistSet[3]

    m, n = np.shape(train_X)
    idx = np.random.permutation(m)  # shuffled row indices

    # # Dimensionality reduction with PCA
    # num = 30000
    # pca = PCA(n_components=0.9, whiten=True, random_state=0)
    # for i in range(int(np.ceil(1.0 * m / num))):
    #     minEnd = min((i + 1) * num, m)
    #     sub_idx = idx[i * num:minEnd]
    #     train_pca_X = pca.fit_transform(train_X[sub_idx])
    #     print(np.shape(train_pca_X))

    print("\n**********Testing GradientBoostingClassifier**********")
    t = time()
    # # Grid search, tuning one parameter group at a time:
    # param_grid1 = {"n_estimators": range(1000, 2001, 100)}
    # param_grid2 = {'max_depth': range(30, 71, 10), 'min_samples_split': range(4, 9, 2)}
    # param_grid3 = {'min_samples_split': range(4, 9, 2), 'min_samples_leaf': range(3, 12, 2)}
    # param_grid4 = {'subsample': np.arange(0.6, 1.0, 0.05)}
    # model = GridSearchCV(
    #     estimator=GradientBoostingClassifier(max_features=90, max_depth=40, min_samples_split=8,
    #                                          learning_rate=0.1, n_estimators=1800),
    #     param_grid=param_grid4, cv=3)
    # # Fit the training set
    # model.fit(train_X, train_Y)
    # print("Best params: %s, best score: %0.2f" % (model.best_params_, model.best_score_))
    model = GradientBoostingClassifier(max_features=90, max_depth=40, min_samples_split=8,
                                       min_samples_leaf=3, n_estimators=1200,
                                       learning_rate=0.05, subsample=0.95)
    # Fit the training set
    model.fit(train_X, train_Y)
    # Predict on the (shuffled) training set
    train_Y_hat = model.predict(train_X[idx])
    print("Training accuracy: ", accuracy_score(train_Y[idx], train_Y_hat))
    # Predict on the test set
    test_Y_hat = model.predict(test_X)
    print("Test accuracy: ", accuracy_score(test_Y, test_Y_hat))
    print("Total time:", time() - t, "seconds")

References:

https://github.com/haidawyl/Mnist — usage examples for various models
