# Machine learning | tree regression

Time: 2019-12-02

Recent coursework has kept me busy, so I'll skip the preamble and go straight to the code.

## Operation result

## Code

```python
from numpy import *
#Using binary segmentation - cutting the data set in two at a time
#If a characteristic value of the data is equal to the value required for segmentation,
#Then these data will enter the left subtree of the tree, otherwise
#Enter right subtree
def loadDataSet(fileName):
    """Load a tab-delimited data file into a list of float rows.

    Parameters
    ----------
    fileName : str
        Path to a text file whose lines are tab-separated numbers.

    Returns
    -------
    list[list[float]]
        One list of floats per line of the file.
    """
    # NOTE(review): the scraped source lost the function header and the
    # read loop (it had a module-level `return` and an undefined `line`);
    # reconstructed here in the conventional form.
    dataMat = []
    # `with` guarantees the file handle is closed even on error.
    with open(fileName) as fr:
        for line in fr:
            curLine = line.strip().split('\t')
            # Map each field of the row to a floating-point number.
            fltLine = list(map(float, curLine))
            dataMat.append(fltLine)
    # All rows of the file collected in one matrix-like list.
    return dataMat

#Parameters: dataset, feature to be split on, a value of that feature
def binSplitDataSet(dataSet, feature, value):
    """Split `dataSet` in two on one feature: rows with
    dataSet[:, feature] > value go to mat0, the rest to mat1."""
    # nonzero(...) returns a TUPLE of index arrays; take [0] (the row
    # indices) so matrix indexing keeps the expected (rows, cols) shape.
    # Indexing with the whole tuple (the original code) produced
    # wrongly-shaped results.
    mat0 = dataSet[nonzero(dataSet[:, feature] > value)[0], :]
    mat1 = dataSet[nonzero(dataSet[:, feature] <= value)[0], :]
    return mat0, mat1

#Leaf-generation function.
#Called by chooseBestSplit once it decides a partition should not be
#split any further.  For a regression tree the leaf "model" is simply
#the mean of the target (last) column.
def regLeaf(dataSet):
    targetColumn = dataSet[:, -1]
    return mean(targetColumn)

#Error function -- the total squared error of the target column.
def regErr(dataSet):
    """Return variance of the last (target) column times the sample
    count, i.e. the total squared error of the subset."""
    # shape(dataSet) is a (rows, cols) tuple; the original multiplied by
    # the whole tuple, which raises a TypeError.  Use the row count.
    return var(dataSet[:, -1]) * shape(dataSet)[0]

#Given an error-measuring function, find the best binary split of the
#dataset: iterate over every feature and every distinct value of that
#feature, keeping the (feature, value) pair that minimises the summed
#error of the two resulting subsets.
#The function also decides when to STOP splitting; in that case it
#returns (None, leaf-model) instead of a split.
#errType defaults to the total squared error (total variance).
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Return (bestFeatureIndex, bestSplitValue), or (None, leafModel)
    when a stopping condition is met."""
    # User-specified stopping parameters:
    #   tolS -- minimum error reduction a split must achieve
    #   tolN -- minimum number of samples allowed in each subset
    # (the original assigned the whole tuple to both names)
    tolS = ops[0]; tolN = ops[1]
    # Exit if all target values are equal -- nothing left to model.
    # .T.tolist()[0] flattens the (m,1) matrix column into a plain list;
    # without [0] the set() call fails on unhashable nested lists.
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
        # No "good" split possible: return None plus a leaf model.
        return None, leafType(dataSet)
    m, n = shape(dataSet)
    S = errType(dataSet)
    bestS = inf; bestIndex = 0; bestValue = 0
    for featIndex in range(n - 1):
        for splitVal in set(dataSet[:, featIndex].T.tolist()[0]):
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            # Skip splits leaving too few samples on either side
            # (shape(...) is a tuple; compare its row count, not the tuple).
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    # Exit if the best split barely reduces the error.
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    # Exit if the chosen split leaves a very small subset.
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
        return None, leafType(dataSet)
    # A "good" split was found: the lowest-error one.
    # Return its feature index and split value.
    return bestIndex, bestValue

#Tree-building function.
#  dataSet  -- the data set (NumPy matrix)
#  leafType -- function that builds a leaf node's model
#  errType  -- error-measuring function
#  ops      -- tuple of extra parameters controlling tree construction
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Recursively build a tree as nested dicts.

    chooseBestSplit returns (None, model) when a stopping condition is
    met; that model becomes a leaf (a constant for a regression tree;
    a linear equation for a model tree).  Otherwise the dataset is cut
    in two and createTree recurses on both halves.
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # Stopping condition met: `val` is the leaf model.
    # (`is None`, not `== None`, per PEP 8.)
    if feat is None:
        return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    # Partition on the chosen feature/value...
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    # ...and build the left and right subtrees recursively.
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree

def drawFigure1(fileName='ex0.txt'):
    """Build a tree on a data file and scatter-plot its columns 1 and 2.

    NOTE(review): the original referenced an undefined global ``myDat``
    (a NameError at runtime); the data is now loaded explicitly.  The
    default file name follows the book's sample data -- confirm it
    matches the data file actually present.
    """
    import matplotlib.pyplot as plt
    myMat = mat(loadDataSet(fileName))
    createTree(myMat)
    # Columns 1 and 2 hold the feature and target being plotted here.
    plt.plot(myMat[:, 1], myMat[:, 2], 'ro')
    plt.show()

def main():
    """Entry point: draw the demo figure.

    (The original carried a large commented-out scratchpad -- building a
    4x4 identity matrix and exercising binSplitDataSet -- removed as
    dead code.)
    """
    drawFigure1()

# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

``````

![Operation result](/img/bVbqGCZ)

## PHP uses redis to solve the problem of oversold

Preface: In a flash-sale ("seckill") event — for example, stock of only 100 units with 200 people rushing to buy at the same time — concurrency arises. If orders for all 100 units complete and the inventory reaches 0, sales might still continue […]