决策树CART回归树——算法实现
阅读原文时间:2023年07月09日阅读:3

决策树模型

  1. 选择最好的特征和特征的值进行数据集划分
  2. 根据上面获得的结果创建决策树
  3. 根据测试数据进行剪枝(默认没有数据的树分支被剪掉)
  4. 对输入进行预测

模型树

import numpy as np

def loadDataSet(fileName):
    """Parse a tab-delimited text file of floats into a NumPy matrix.

    Each non-blank line becomes one row; every tab-separated field is
    converted with ``float``. The rest of this module treats the last
    column as the target value.

    Args:
        fileName: Path of the file to read.

    Returns:
        An ``np.matrix`` with one row per non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float``.
    """
    dataMat = []
    with open(fileName) as fr:
        # Iterate the file lazily instead of materializing it with readlines().
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank (e.g. trailing) line would otherwise hit float('').
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same function.
    return np.asmatrix(dataMat)

# dataSet is a matrix, feature is the index of the column to split on, value is the split threshold
def binSplitDataSet(dataSet, feature, value):
    """Split the rows of ``dataSet`` in two by thresholding one column.

    Args:
        dataSet: 2-D data (the module passes ``np.matrix`` objects).
        feature: Index of the column to compare.
        value: Threshold to compare the column against.

    Returns:
        A tuple ``(mat0, mat1)`` of ``np.matrix`` objects: ``mat0`` holds the
        rows whose ``feature`` column is ``> value``, ``mat1`` the rows whose
        column is ``<= value``.
    """
    column = dataSet[:, feature]  # slice once, reuse for both comparisons
    mat0 = dataSet[np.nonzero(column > value)[0], :]
    mat1 = dataSet[np.nonzero(column <= value)[0], :]
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same function.
    return np.asmatrix(mat0), np.asmatrix(mat1)

def regLeaf(dataSet):
    """Return the leaf value for a regression tree: the mean of the last column."""
    targets = dataSet[:, -1]
    return np.mean(targets)

def regErr(dataSet):
    """Return the variance of the last column multiplied by the number of rows.

    With ``np.var``'s default settings this equals the sum of squared
    deviations of the target column from its mean.
    """
    target_variance = np.var(dataSet[:, -1])
    row_count = np.shape(dataSet)[0]
    return target_variance * row_count

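# ops[0]: minimum error reduction; a split that lowers the error by less than this is rejected
# ops[1]: minimum number of samples on each side of a split; smaller splits are skipped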
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Find the (feature, value) binary split with the lowest total error.

    Args:
        dataSet: Matrix whose last column is the target value.
        leafType: Callable producing a leaf value from a data set.
        errType: Callable producing the error of a data set.
        ops: ``(min_gain, min_rows)``; ``min_gain`` is the smallest error
            reduction worth splitting for, ``min_rows`` the smallest number
            of rows allowed on either side of a split.

    Returns:
        ``(feature_index, split_value)`` for the best split, or
        ``(None, leaf_value)`` when a stop condition is met.
    """
    min_gain, min_rows = ops[0], ops[1]

    # Stop condition 1: every target value is identical, nothing to split.
    distinct_targets = set(dataSet[:, -1].T.tolist()[0])
    if len(distinct_targets) == 1:
        return None, leafType(dataSet)

    _, n_cols = np.shape(dataSet)
    base_err = errType(dataSet)
    best_err, best_feat, best_val = np.inf, 0, 0

    # Try every distinct value of every feature column (last column is the target).
    for feat in range(n_cols - 1):
        candidates = set(dataSet[:, feat].T.tolist()[0])
        for candidate in candidates:
            upper, lower = binSplitDataSet(dataSet, feat, candidate)
            # Only consider splits that leave enough rows on both sides.
            if np.shape(upper)[0] >= min_rows and np.shape(lower)[0] >= min_rows:
                split_err = errType(upper) + errType(lower)
                if split_err < best_err:
                    best_err, best_feat, best_val = split_err, feat, candidate

    # Stop condition 2: the best split does not reduce the error enough.
    # (If no admissible split was found, best_err is still inf and this fires.)
    if (base_err - best_err) < min_gain:
        return None, leafType(dataSet)

    return best_feat, best_val

def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Recursively build a binary tree from ``dataSet``.

    Args:
        dataSet: NumPy matrix whose last column is the target value.
        leafType: Callable producing a leaf value from a data set.
        errType: Callable producing the error of a data set.
        ops: Stop-condition parameters forwarded to ``chooseBestSplit``.

    Returns:
        Either a leaf value (when ``chooseBestSplit`` reports a stop
        condition) or a dict with keys ``'spInd'`` (split feature index),
        ``'spVal'`` (split value), ``'left'`` and ``'right'`` (subtrees built
        from the two halves returned by ``binSplitDataSet``, in that order).
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # Identity comparison is the correct test for the None sentinel.
    if feat is None:
        return val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    return {
        'spInd': feat,
        'spVal': val,
        'left': createTree(lSet, leafType, errType, ops),
        'right': createTree(rSet, leafType, errType, ops),
    }

def isTree(obj):
    """Return True when ``obj`` is an internal tree node (a dict), else False.

    Uses ``isinstance`` so dict subclasses are recognised and unrelated
    classes that merely happen to be named ``dict`` are not.
    """
    return isinstance(obj, dict)

def getMean(tree):
    """Collapse ``tree`` to a single value, modifying it in place.

    Each subtree is replaced by its own collapsed value (right branch first,
    then left); the result is the average of the two branch values.
    """
    for side in ('right', 'left'):
        branch = tree[side]
        if isTree(branch):
            tree[side] = getMean(branch)
    return (tree['left'] + tree['right']) / 2.0

def prune(tree, testData):
    """Post-prune ``tree`` against ``testData``, merging leaves when it helps.

    The tree is modified in place: subtrees are replaced by their pruned
    versions.

    Args:
        tree: Tree node dict as built by ``createTree``.
        testData: Matrix of held-out samples, last column is the target.

    Returns:
        The collapsed mean of the tree when ``testData`` is empty, the
        average of the two leaves when merging them lowers the squared error
        on ``testData``, otherwise ``tree`` itself.
    """
    # No test data reaches this node: collapse the whole subtree.
    if np.shape(testData)[0] == 0:
        return getMean(tree)

    # The split is needed both for the recursion and for the merge check,
    # so compute it once.
    lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], rSet)

    # Merging is only possible once both branches are leaves.
    if isTree(tree['left']) or isTree(tree['right']):
        return tree

    # np.sum reduces the whole column to a scalar; the builtin sum would
    # iterate the matrix row by row and produce a 1x1 matrix.
    errorNoMerge = (np.sum(np.power(lSet[:, -1] - tree['left'], 2))
                    + np.sum(np.power(rSet[:, -1] - tree['right'], 2)))
    treeMean = (tree['left'] + tree['right']) / 2.0
    errorMerge = np.sum(np.power(testData[:, -1] - treeMean, 2))
    if errorMerge < errorNoMerge:
        print("merging")
        return treeMean
    return tree

# Model tree code -- untested
def linearSolve(dataSet):
    """Fit an ordinary least-squares linear model to ``dataSet``.

    Args:
        dataSet: Matrix whose last column is the target and whose other
            columns are the features.

    Returns:
        A tuple ``(ws, X, Y)``: ``ws`` is the weight column vector
        (intercept first), ``X`` is the design matrix with a column of ones
        in position 0 followed by the features, ``Y`` is the target column.

    Raises:
        NameError: If ``X.T * X`` has a determinant of exactly 0.0 and so
            cannot be inverted.
    """
    m, n = np.shape(dataSet)
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same function.
    X = np.asmatrix(np.ones((m, n)))
    X[:, 1:n] = dataSet[:, 0:n-1]  # column 0 stays all ones (intercept term)
    Y = dataSet[:, -1]
    xTx = X.T * X
    if np.linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n\
        try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)
    return ws, X, Y

def regTreeEval(model, inDat):
    """Evaluate a regression-tree leaf: the leaf value itself, as a float.

    ``inDat`` is accepted only so the signature matches ``modelTreeEval``;
    it is not used.
    """
    leaf_value = float(model)
    return leaf_value

def modelTreeEval(model, inDat):
    """Evaluate a model-tree leaf: apply the linear model to one sample.

    Args:
        model: Weight column vector with the intercept weight first.
        inDat: One sample as a 1 x n row of feature values.

    Returns:
        The prediction ``[1, inDat] * model`` as a Python float.
    """
    n = np.shape(inDat)[1]
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same function.
    X = np.asmatrix(np.ones((1, n + 1)))
    X[:, 1:n + 1] = inDat  # column 0 stays 1 for the intercept term
    # Pull the single element out explicitly: float() on a 1x1 matrix relies
    # on the deprecated conversion of ndim > 0 arrays to scalars.
    return float((X * model)[0, 0])

def treeForeCast(tree, inData, modelEval=regTreeEval):
    """Predict the target for a single sample by walking ``tree``.

    Args:
        tree: A tree node dict, or a leaf.
        inData: One sample's feature values (a 1 x n matrix, as passed by
            ``createForeCast``, or a flat sequence).
        modelEval: Callable ``(leaf, inData)`` that turns a leaf into a
            prediction.

    Returns:
        Whatever ``modelEval`` returns for the leaf the sample lands in.
    """
    if not isTree(tree):
        return modelEval(tree, inData)
    # Flatten before indexing: on a 1 x n matrix, inData[spInd] would select
    # a *row*, which raises IndexError for spInd > 0 and yields a whole row
    # (ambiguous truth value) for multi-column input.
    featVal = np.asarray(inData).ravel()[tree['spInd']]
    branch = tree['left'] if featVal > tree['spVal'] else tree['right']
    # The recursion handles leaves via the isTree guard above.
    return treeForeCast(branch, inData, modelEval)
def createForeCast(tree, testData, modelEval=regTreeEval):
    """Predict the target for every row of ``testData``.

    Args:
        tree: Tree (or leaf) to evaluate.
        testData: Sequence/matrix of samples, one per row.
        modelEval: Leaf evaluator forwarded to ``treeForeCast``.

    Returns:
        An ``m x 1`` ``np.matrix`` of predictions, one per row of
        ``testData``.
    """
    m = len(testData)
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same function.
    yHat = np.asmatrix(np.zeros((m, 1)))
    for i, sample in enumerate(testData):
        yHat[i, 0] = treeForeCast(tree, np.asmatrix(sample), modelEval)
    return yHat

if __name__ == '__main__':
    # Tree-building test on the first sample data set.
    dataMat = loadDataSet('ex00.txt')
    myTree = createTree(dataMat)
    print(myTree)

    # Tree-building test on the second sample data set.
    dataMat2 = loadDataSet('ex0.txt')
    myTree2 = createTree(dataMat2)
    print(myTree2)

    # Pruning test: build on ex2.txt, prune against ex2test.txt.
    dataMat31 = loadDataSet('ex2.txt')
    dataMat32 = loadDataSet('ex2test.txt')
    myTree31 = createTree(dataMat31)
    # prune() rewrites subtrees in place, so the unpruned tree must be
    # printed before pruning; afterwards myTree31 is already modified.
    print(myTree31)
    retTree = prune(myTree31, dataMat32)
    print(retTree)