Machine learning | tree regression

Time: 2019-12-02

I've been buried in studying lately (QAQ), so I'll skip the chatter and get straight to the code~

Operation result

(Figures: scatter plots of the datasets (ex00.txt / ex0.txt) produced by drawFigure1 below.)

Code

```python
from numpy import *
#Uses binary splitting: the dataset is cut in two at a time.
#If a sample's value for the chosen feature is greater than the split value,
#the sample goes into the left subtree of the tree; otherwise it
#goes into the right subtree
def loadDataSet(fileName):
    dataMat=[]
    fr=open(fileName)
    for line in fr.readlines():
        curLine=line.strip().split('\t')
        #Map every value in the row to a float
        fltLine=list(map(float,curLine))
        dataMat.append(fltLine)
    #Save all data in the file in the same matrix
    return dataMat


#Parameters: dataset, feature to be segmented, a value of the feature
def binSplitDataSet(dataSet,feature,value):
    #Split the data set into two subsets and return
    mat0 = dataSet[nonzero(dataSet[:,feature] > value)[0],:]
    mat1 = dataSet[nonzero(dataSet[:,feature] <= value)[0],:]

    return mat0,mat1
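
#For example (a toy check, easy to verify by hand): splitting mat(eye(4))
#on feature 0 at value 0.5 gives mat0 = [[1. 0. 0. 0.]] (the only row whose
#first entry is greater than 0.5) and mat1 = the other three rows; the same
#demo appears, commented out, in main() below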

#Establishing the function of leaf node
#regLeaf is called when chooseBestSplit decides not to split the dataset any further
#It produces the model for a leaf node; in a regression tree, that model is simply the mean of the target variable
def regLeaf(dataSet):
    return mean(dataSet[:,-1])

#Error-estimation function: here the total squared error (total variance) is computed
def regErr(dataSet):
    #variance of the target values * number of samples in the dataset = total squared error
    return var(dataSet[:,-1]) * shape(dataSet)[0]
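
#Worked example (hypothetical numbers): for target values [1, 2, 3] the
#mean is 2, var is 2/3, and there are 3 samples, so regErr returns 2.0,
#i.e. the sum of squared deviations sum((y - mean(y))**2)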

#Given an error-calculation method, this function finds the best binary split of the dataset
#(it traverses every feature and every possible value to find the split threshold that minimizes the error)
#The function also decides when to stop splitting; once splitting stops, a leaf node is generated
#errType is the total squared error (total variance)
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    #User-specified parameters that control when the function stops
    #tolS is the minimum error reduction; tolN is the minimum number of samples in a split
    tolS = ops[0]; tolN = ops[1]
    #Exit if all values are equal
    if len(set(dataSet[:,-1].T.tolist()[0])) == 1:
        #No "good" binary split can be found; return None and call leafType to generate a leaf node
        return None, leafType(dataSet)
    m,n = shape(dataSet)
    S = errType(dataSet)
    bestS = inf; bestIndex = 0; bestValue = 0
    for featIndex in range(n-1):
        for splitVal in set((dataSet[:,featIndex].T.A.tolist())[0]): 
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS: 
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    #Exit if the error reduction is small
    if (S - bestS) < tolS: 
        return None, leafType(dataSet) 
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    #Exit if the segmented dataset is very small
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): 
        return None, leafType(dataSet)
    #A "good" split has been found: return the feature index and the split value
    #(the split that achieves the lowest total error after splitting)
    return bestIndex,bestValue
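
#Typical use (a sketch, with the data files used in main() below): given
#myMat = mat(loadDataSet('ex00.txt')), chooseBestSplit(myMat) returns the
#feature index and threshold of the split that most reduces the total
#variance, or (None, leaf value) if no split beats the tolS/tolN limits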


#Functions for building trees
#dataSet is the dataset
#leafType is the function that creates leaf nodes; errType is the error-calculation function
#ops is a tuple containing the other parameters needed for tree building
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    #chooseBestSplit is the splitting function
    #If the stopping condition is met, chooseBestSplit returns None and a model value
    #For a regression tree that model is a constant; for a model tree it is
    #a linear equation (the regression tree assumes leaf nodes hold constants)
    #If the stopping condition is not met, chooseBestSplit creates a new Python
    #dictionary, splits the dataset into two parts, and createTree is called
    #recursively on each part
    feat,val=chooseBestSplit(dataSet,leafType,errType,ops)
    #Return the leaf value when the stopping condition is met
    if feat is None:
        return val
    retTree={}
    retTree['spInd']=feat
    retTree['spVal']=val
    #The data set is divided into two parts according to the feature to be divided and a certain value of the feature
    lSet,rSet=binSplitDataSet(dataSet,feat,val)
    #Create left and right subtrees
    retTree['left']=createTree(lSet,leafType,errType,ops)
    retTree['right']=createTree(rSet,leafType,errType,ops)
    return retTree
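
#The returned tree is a nested dict; after a single split it looks like
#(values illustrative, not real output):
#  {'spInd': 0, 'spVal': 0.48, 'left': 1.018, 'right': -0.045}
#where each leaf holds the mean target value of its subset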


def drawFigure1():
    # import matplotlib.pyplot as plt 
    # myDat=loadDataSet('ex00.txt') 
    # myMat=mat(myDat) 
    # createTree(myMat) 
    # plt.plot(myMat[:,0],myMat[:,1],'ro') 
    # plt.show()
    import matplotlib.pyplot as plt 
    myDat=loadDataSet('ex0.txt') 
    myMat=mat(myDat) 
    createTree(myMat) 
    plt.plot(myMat[:,1],myMat[:,2],'ro') 
    plt.show()



def main():
    drawFigure1()
    # myDat=loadDataSet('ex00.txt')
    # myMat=mat(myDat)
    # myTree=createTree(myMat)
    # print(myTree)


    #Create an identity matrix (a matrix whose diagonal elements are all 1)
    #testMat=mat(eye(4))
    #print(testMat)
    #Split on the feature in column 0
    #using the value 0.5
    #mat0,mat1=binSplitDataSet(testMat,0,0.5)
    # print(mat0)
    # print(mat1)

if __name__=='__main__':
    main()
```
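
Once a tree has been built, making a prediction for a new sample is just a walk down the nested dicts. Here is a minimal sketch added for illustration (the helper names isTree and treePredict are my own, not part of the code above): at each internal node, compare the sample's value at spInd with spVal and descend until a leaf, which is a plain float, is reached.

```python
def isTree(obj):
    #Internal nodes are dicts with 'spInd'/'spVal'; leaves are plain floats
    return isinstance(obj, dict)

def treePredict(tree, sample):
    #Walk down the tree until a leaf value is reached
    while isTree(tree):
        if sample[tree['spInd']] > tree['spVal']:
            tree = tree['left']   #values greater than spVal go left (see binSplitDataSet)
        else:
            tree = tree['right']
    return tree

#Usage sketch (the tree values below are illustrative, not real output):
tree = {'spInd': 0, 'spVal': 0.48, 'left': 1.018, 'right': -0.045}
print(treePredict(tree, [0.70]))   #0.70 > 0.48, so left branch -> 1.018
print(treePredict(tree, [0.10]))   #0.10 <= 0.48, so right branch -> -0.045
```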

