from math import log import operator def calcShannonEnt(dataSet): numEntries = len(dataSet) labelCounts= {} for featVec in dataSet: currentLabel = featVec[-1] if currentLabel not in labelCounts.keys(): labelCounts[currentLabel] = 0 labelCounts[currentLabel] += 1 print labelCounts shannonEnt = 0.0 for key in labelCounts: prob = float(labelCounts[key]) / numEntries shannonEnt -= prob * log(prob,2) return shannonEnt def createDataSet(): dataSet = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']] labels = ['no surfacing', 'flippers'] return dataSet, labels def splitDataSet(dataSet, axis, value): retDataSet = [] for featVec in dataSet: if featVec[axis] == value: reducedFeatVec = featVec[:axis] reducedFeatVec.extend(featVec[axis+1:]) retDataSet.append(reducedFeatVec) # if featVec[axis] == value: # retDataSet.append(featVec) return retDataSet def chooseBestFeatureToSplit(dataSet): numFeatures = len(dataSet[0]) - 1 print numFeatures baseEntropy = calcShannonEnt(dataSet) print "baseEntropy = %s" % baseEntropy bestInfoGain = 0.0; bestFeature = -1 for i in range(numFeatures): featList = [example[i] for example in dataSet] #链表推导式 []:取出所有dataSet中的数据的第i个元素组成一个列表 print "featList: %s" % featList uniqueVals = set(featList) #去重,不重的放在uniqueVals newEntropy = 0.0 for value in uniqueVals: #print "i = %d, value = %d" % (i, value) subDataSet = splitDataSet(dataSet, i, value) prob = len(subDataSet) / float(len(dataSet)) newEntropy += prob * calcShannonEnt(subDataSet) print newEntropy infoGain = baseEntropy - newEntropy print "infoGain = %s" % infoGain if infoGain > bestInfoGain: bestInfoGain = infoGain bestFeature = i return bestFeature def majorityCnt(classList): classCount = {} for vote in classList: if vote not in classCount.keys(): classCount[vote] = 0 classCount[vote] += 1 sortedClassCount = sorted(classCount.iteritems(), key = operator.itemgetter(1), reverse = True) return sortedClassCount[0][0] def createTree(dataSet, labels): classList = [example[-1] for example in dataSet] #dataSet里每个列表的的最右边元素组成一个新的列表clasList if classList.count(classList[0]) == len(classList): #如果同一类,return类标签 return classList[0] if len(dataSet[0]) == 1: #不是classList[0],就是说第一行只有一个特征了, return majorityCnt(classList) #return站多数的标签 bestFeat = chooseBestFeatureToSplit(dataSet) bestFeatLabel = labels[bestFeat] myTree = {bestFeatLabel:{}} del(labels[bestFeat]) featValues = [example[bestFeat] for example in dataSet] uniqueVals = set(featValues) for value in uniqueVals: subLabels = labels[:] myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels) return myTree def classify(inputTree, featLabels, testVec): firstStr = inputTree.keys()[0] secondDict = inputTree[firstStr] featIndex = featLabels.index(firstStr) for key in secondDict.keys(): if testVec[featIndex] == key: if type(secondDict[key]).__name__ == 'dict': classLabel = classify(secondDict[key], featLabels, testVec) else: classLabel = secondDict[key] return classLabel