'''
ID3 decision tree algorithm
'''
import math
def createDataSet():
    '''
    Toy loan-application dataset (the classic example): columns are
    年龄/age, 有工作/has job, 有房子/has house, 信贷情况/credit rating,
    and a class label 同意/approve or 拒绝/reject.
    '''
    dataSet = [['青年', '否', '否', '一般', '拒绝'],
               ['青年', '否', '否', '好', '拒绝'],
               ['青年', '是', '否', '好', '同意'],
               ['青年', '是', '是', '一般', '同意'],
               ['青年', '否', '否', '一般', '拒绝'],
               ['中年', '否', '否', '一般', '拒绝'],
               ['中年', '否', '否', '好', '拒绝'],
               ['中年', '是', '是', '好', '同意'],
               ['中年', '否', '是', '非常好', '同意'],
               ['中年', '否', '是', '非常好', '同意'],
               ['老年', '否', '是', '非常好', '同意'],
               ['老年', '否', '是', '好', '同意'],
               ['老年', '是', '否', '好', '同意'],
               ['老年', '是', '否', '非常好', '同意'],
               ['老年', '否', '否', '一般', '拒绝']]
    featureName = ['年龄', '有工作', '有房子', '信贷情况']
    return dataSet, featureName
def get_Entropy(dataSet):
    '''
    Compute the information entropy of a dataset:
    H(D) = -sum_k p_k * log2(p_k), where p_k is the frequency of class k.
    '''
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]   # class label is the last column
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    Entropy = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / len(dataSet)
        Entropy -= prob * math.log(prob, 2)
    return Entropy
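# Sanity check (hand-computed for the dataset above, which has 9 同意 and
# 6 拒绝 labels out of 15 rows):
#   H(D) = -(9/15)*log2(9/15) - (6/15)*log2(6/15) ≈ 0.971
# so get_Entropy(createDataSet()[0]) should be roughly 0.971.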
def splitDataSet(dataSet, axis, value):
    '''
    Split the dataset on a given feature column and feature value.
    INPUT  --> dataset, feature column index, feature value
    OUTPUT --> all instances matching the value, with that feature
               column removed
    '''
    output = []
    for featVec in dataSet:
        if featVec[axis] == value:
            smallFeatVec = featVec[:axis]          # columns before axis
            smallFeatVec.extend(featVec[axis+1:])  # columns after axis
            output.append(smallFeatVec)
    return output
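# For example, splitDataSet(dataSet, 0, '青年') keeps the five 青年 (young)
# rows and drops column 0, so ['青年', '否', '否', '一般', '拒绝'] becomes
# ['否', '否', '一般', '拒绝'].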
def get_CondEntropy(dataSet, axis, uniqueVals):
    '''
    Compute the conditional entropy of a split of the dataset:
    H(D|A) = sum_v |D_v|/|D| * H(D_v)
    INPUT  --> dataset, target feature column index, value set of that column
    OUTPUT --> conditional entropy of the split
    '''
    CondEntropy = 0.0
    for value in uniqueVals:
        subDataSet = splitDataSet(dataSet, axis, value)
        prob = len(subDataSet) / float(len(dataSet))
        CondEntropy += prob * get_Entropy(subDataSet)
    return CondEntropy
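# Example, splitting on column 2 (有房子/has house): all six 是 rows are
# labelled 同意 and the nine 否 rows split 3 同意 / 6 拒绝, so
#   H(D|有房子) = (6/15)*0 + (9/15)*0.918 ≈ 0.551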
def InfoGain(dataSet, PEntropy, axis):
    '''
    Compute the information gain of a split: g(D, A) = H(D) - H(D|A)
    INPUT  --> dataset, parent node entropy, target feature column index
    OUTPUT --> information gain of the split
    '''
    featVals = [line[axis] for line in dataSet]
    uniqueVals = set(featVals)
    newEntropy = get_CondEntropy(dataSet, axis, uniqueVals)
    infoGain = PEntropy - newEntropy
    return infoGain
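# Hand-computed gains for the four features of this dataset:
#   g(D, 年龄)     = 0.971 - 0.888 ≈ 0.083
#   g(D, 有工作)   = 0.971 - 0.647 ≈ 0.324
#   g(D, 有房子)   = 0.971 - 0.551 ≈ 0.420
#   g(D, 信贷情况) = 0.971 - 0.608 ≈ 0.363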
def chooseBestFeat(dataSet):
    '''
    Choose the best feature to split on: the one with the largest
    information gain.
    INPUT  --> dataset
    OUTPUT --> column index of the best feature
    '''
    numFeatures = len(dataSet[0]) - 1   # last column is the class label
    baseEntropy = get_Entropy(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        infoGain = InfoGain(dataSet, baseEntropy, i)
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
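# Given the gains above, chooseBestFeat(dataSet) returns 2: 有房子
# (has house) is the root split for this dataset.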
def majorityCnt(classList):
    '''
    Majority vote: return the most frequent class label.
    '''
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=lambda i: i[1], reverse=True)
    return sortedClassCount[0][0]
def createTree(dataSet, featureName):
    '''
    Recursively build the ID3 decision tree as nested dicts.
    INPUT --> dataset, feature name labels
    '''
    classList = [line[-1] for line in dataSet]
    # stop if every instance has the same class
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # stop if no features are left; fall back to majority vote
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeat(dataSet)
    bestFeatLabel = featureName[bestFeat]
    myTree = {bestFeatLabel: {}}
    del featureName[bestFeat]   # note: this mutates the list passed in
    featValues = [line[bestFeat] for line in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = featureName[:]   # copy so sibling branches don't interfere
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
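# On this dataset the recursion bottoms out quickly: the 是 branch of
# 有房子 is pure (all 同意) and the 否 branch is fully separated by
# 有工作, so createTree should return
#   {'有房子': {'是': '同意', '否': {'有工作': {'是': '同意', '否': '拒绝'}}}}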
if __name__ == '__main__':
    dataSet, featureName = createDataSet()
    # pass a copy: createTree deletes entries from the label list it is given
    myTree = createTree(dataSet, featureName[:])
    print(myTree)
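# A minimal classification sketch (not part of the original listing):
# walk the nested-dict tree until a leaf (a plain label string) is reached.
def classify(tree, featureName, sample):
    featLabel = list(tree.keys())[0]   # feature tested at this node
    subTree = tree[featLabel][sample[featureName.index(featLabel)]]
    if isinstance(subTree, dict):      # inner node: recurse
        return classify(subTree, featureName, sample)
    return subTree                     # leaf: class label

if __name__ == '__main__':
    # e.g. a young applicant with a job, no house, and fair credit
    print(classify(myTree, featureName, ['青年', '是', '否', '一般']))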