# Machine learning | tree regression

Time: 2019-12-02

Because of a heavy recent study load, I won't ramble — straight to the code~

## Operation result

## Code

```python
from numpy import *
#Using binary segmentation - cutting the data set in two at a time
#If a characteristic value of the data is equal to the value required for segmentation,
#Then these data will enter the left subtree of the tree, otherwise
#Enter right subtree
#General function: load a tab-delimited text file into a list of float rows
def loadDataSet(fileName):
    """Load a tab-delimited data file.

    fileName: path to a text file where each line holds tab-separated numbers.
    Returns a list of rows, each row a list of floats.
    """
    dataMat = []
    #with-statement guarantees the file handle is closed even on error
    with open(fileName) as fr:
        for line in fr.readlines():
            curLine = line.strip().split('\t')
            #Map each field of the row to a floating-point number
            fltLine = list(map(float, curLine))
            dataMat.append(fltLine)
    #All rows of the file collected in one list
    return dataMat

#Parameters: dataset, feature to be segmented, a value of the feature
def binSplitDataSet(dataSet, feature, value):
    """Binary split of dataSet on column `feature` at threshold `value`.

    Returns (mat0, mat1): the rows where the feature is > value, and the
    rows where it is <= value.
    """
    #nonzero() returns a tuple of index arrays; take [0] to get the row
    #indices -- without it, fancy indexing produces a 3-D result instead
    #of selecting rows
    mat0 = dataSet[nonzero(dataSet[:, feature] > value)[0], :]
    mat1 = dataSet[nonzero(dataSet[:, feature] <= value)[0], :]
    return mat0, mat1

#Leaf-node generator for a regression tree.
#Called by chooseBestSplit once it decides to stop splitting; the "model"
#stored at a regression-tree leaf is simply the mean of the target column.
def regLeaf(dataSet):
    """Return the mean of the last (target) column of dataSet."""
    target_column = dataSet[:, -1]
    return target_column.mean()

#Error measure for a regression tree -- the total squared error
def regErr(dataSet):
    """Return the total variance of the target (last) column of dataSet:
    per-sample variance multiplied by the number of samples."""
    #shape(dataSet)[0] is the row count; multiplying by the whole shape
    #tuple (as the original did) raises a TypeError
    return var(dataSet[:, -1]) * shape(dataSet)[0]

#Given an error-calculation method, find the best binary split of the dataset:
#traverse every feature and every observed value to find the threshold that
#minimizes the post-split error. Also decides when to stop splitting -- in
#that case it returns (None, leaf model) so the caller generates a leaf node.
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Find the best (feature index, split value) for dataSet.

    leafType builds a leaf model, errType measures error (total variance),
    ops = (tolS, tolN): minimum error reduction to accept a split, and
    minimum number of samples allowed on each side of a split.
    Returns (None, leaf model) when a stopping condition is hit.
    """
    #Unpack the user-specified stopping parameters (the original assigned
    #the whole tuple to both names)
    tolS = ops[0]; tolN = ops[1]
    #Exit if all target values are equal -- .T.tolist()[0] flattens the
    #column matrix to a plain list so set() can deduplicate it
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)
    m, n = shape(dataSet)
    S = errType(dataSet)
    bestS = inf; bestIndex = 0; bestValue = 0
    for featIndex in range(n - 1):
        for splitVal in set(dataSet[:, featIndex].T.tolist()[0]):
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            #Skip splits that leave too few samples on either side;
            #shape(...)[0] is the row count (a tuple can't be compared to tolN)
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    #Exit if the error reduction is too small
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    #Exit if either resulting subset is too small
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
        return None, leafType(dataSet)
    #A "good" split was found: return its feature index and split value
    return bestIndex, bestValue

#Function for building trees.
#dataSet: the dataset; leafType: function that builds a leaf node;
#errType: error-calculation function; ops: tuple of tree-building parameters
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Recursively build a CART tree (as a nested dict) over dataSet.

    chooseBestSplit returns (None, model) when a stopping condition is met;
    that model becomes a leaf -- a constant for a regression tree, a linear
    equation for a model tree. Otherwise the dataset is split in two and
    createTree recurses on each half.
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    #Stopping condition met: `val` is the leaf model (use `is None`,
    #identity comparison, rather than `== None`)
    if feat is None:
        return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    #Split on the chosen feature/value and recurse into both halves
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree

def drawFigure1():
    """Build a regression tree from the global dataset and scatter-plot
    columns 1 and 2 of it.

    NOTE(review): depends on a module-level `myDat` that is not defined
    anywhere in this chunk -- presumably loaded earlier with
    loadDataSet(); confirm before running.
    """
    # import matplotlib.pyplot as plt
    # myMat=mat(myDat)
    # createTree(myMat)
    # plt.plot(myMat[:,0],myMat[:,1],'ro')
    # plt.show()
    import matplotlib.pyplot as plt
    myMat=mat(myDat)
    #The built tree is discarded here; only the raw points are plotted
    createTree(myMat)
    plt.plot(myMat[:,1],myMat[:,2],'ro')
    plt.show()

def main():
    """Entry point: plot the dataset (earlier experiments kept below,
    commented out)."""
    drawFigure1()
    # myMat=mat(myDat)
    # myTree=createTree(myMat)
    # print(myTree)

    #Establish a matrix with all-1 diagonal elements (an identity matrix)
    #testMat=mat(eye(4))
    #print(testMat)
    #The feature to be split on is in the first column
    #Divide at 0.5
    #mat0,mat1=binSplitDataSet(testMat,0,0.5)
    # print(mat0)
    # print(mat1)

if __name__=='__main__':
    main()
```

![result](/img/bVbqGCZ)

## JavaScript (ES5/ES6) regular expression study notes — read what you can!

1. Overview: A regular expression is a way of expressing a text pattern (the structure of a string). There are two ways to create one. One is to use a literal, delimited by slashes: `var regex = /xyz/`. The other is to use the RegExp constructor: `var regex = new RegExp('xyz');`. The main difference between them is that the […]