GBDT 可以解决分类和回归问题
def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100,
subsample=1.0, criterion='friedman_mse', min_samples_split=2,
min_samples_leaf=1, min_weight_fraction_leaf=0.,
max_depth=3, min_impurity_decrease=0.,
min_impurity_split=None, init=None, random_state=None,
max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None,
warm_start=False, presort='auto')
示例
# Example: gradient-boosted regression on the Friedman #1 synthetic dataset.
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor

# 1200 samples with Gaussian noise (std = 1.0); fixed seed for reproducibility.
X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
# First 200 samples for training, remaining 1000 held out for testing.
X_train, X_test = X[:200], X[200:]
y_train, y_test = y[:200], y[200:]

# 100 depth-1 stumps with the Huber loss (robust to outliers).
est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1,
                                random_state=0, loss='huber').fit(X_train, y_train)
pred = est.predict(X_test)
# sklearn's convention is mean_squared_error(y_true, y_pred). MSE happens to be
# symmetric so the value is unchanged, but the (y_true, y_pred) order matters
# for most other metrics — keep it correct here.
error = mean_squared_error(y_test, pred)
print(max(y_test), min(y_test))  # (27.214332670044374, 0.8719243023544349)
print(error)
def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
subsample=1.0, criterion='friedman_mse', min_samples_split=2,
min_samples_leaf=1, min_weight_fraction_leaf=0.,
max_depth=3, min_impurity_decrease=0.,
min_impurity_split=None, init=None,
random_state=None, max_features=None, verbose=0,
max_leaf_nodes=None, warm_start=False,
presort='auto')
示例
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error
from time import time
import numpy as np
import pandas as pd
import mnist
if __name__ == "__main__":
    # Load the MNIST dataset and evaluate the GBDT classification model.
    # NOTE(review): `mnist.loadLecunMnistSet()` is a project-local loader —
    # presumably returns (train_X, train_Y, test_X, test_Y); confirm its contract.
    mnistSet = mnist.loadLecunMnistSet()
    train_X, train_Y, test_X, test_Y = mnistSet[0], mnistSet[1], mnistSet[2], mnistSet[3]

    m, n = np.shape(train_X)
    # Shuffled sample indices. `range` must be materialized into a list so that
    # np.random.shuffle can permute it in place (a bare range fails on Python 3).
    idx = list(range(m))
    np.random.shuffle(idx)

    # Optional PCA dimensionality reduction, kept for reference:
    # num = 30000
    # pca = PCA(n_components=0.9, whiten=True, random_state=0)
    # for i in range(int(np.ceil(1.0 * m / num))):
    #     minEnd = min((i + 1) * num, m)
    #     sub_idx = idx[i * num:minEnd]
    #     train_pca_X = pca.fit_transform(train_X[sub_idx])
    #     print(np.shape(train_pca_X))

    print("\n**********测试GradientBoostingClassifier类**********")
    t = time()
    # Grid-search snippets previously used to tune hyper-parameters, kept for reference:
    # param_grid1 = {"n_estimators": range(1000, 2001, 100)}
    # param_grid2 = {'max_depth': range(30, 71, 10), 'min_samples_split': range(4, 9, 2)}
    # param_grid3 = {'min_samples_split': range(4, 9, 2), 'min_samples_leaf': range(3, 12, 2)}
    # param_grid4 = {'subsample': np.arange(0.6, 1.0, 0.05)}
    # model = GridSearchCV(
    #     estimator=GradientBoostingClassifier(max_features=90, max_depth=40, min_samples_split=8,
    #                                          learning_rate=0.1, n_estimators=1800),
    #     param_grid=param_grid4, cv=3)
    # model.fit(train_X, train_Y)
    # print("最好的参数是:%s, 此时的得分是:%0.2f" % (model.best_params_, model.best_score_))

    # Hyper-parameters taken from the grid searches above.
    model = GradientBoostingClassifier(max_features=90, max_depth=40, min_samples_split=8, min_samples_leaf=3,
                                       n_estimators=1200, learning_rate=0.05, subsample=0.95)
    # Fit on the full training set.
    model.fit(train_X, train_Y)
    # Accuracy on the (shuffled) training set.
    train_Y_hat = model.predict(train_X[idx])
    print("训练集精确度: ", accuracy_score(train_Y[idx], train_Y_hat))
    # Accuracy on the held-out test set.
    test_Y_hat = model.predict(test_X)
    print("测试集精确度: ", accuracy_score(test_Y, test_Y_hat))
    print("总耗时:", time() - t, "秒")
参考资料:
手机扫一扫
移动阅读更方便
你可能感兴趣的文章