决策树C4.5预测号码快递送餐与广告骚扰,准确率达到98%以上
-
前言
笔者最近在做机器学习,涉及到数据挖掘的几个算法,主要业务是从高频号码中分类出指定类别。
从一个开发者转向一个数据分析者,同时兼顾写代码,这感觉不好受啊。
不多说,目前的算法因为都有标签,倾向于采用监督学习的方向进行分类。实验了KNN,效果不理想;今日使用了C4.5进行分类,采用指定的时间维度,准确率达到98%以上,狠狠激动了一把。
-
算法,此算法未剪枝,后期进行二次修复。关于数据集,可联系本人邮箱[email protected]
'''
Created on 2017年12月15日
@author: Oprcalf
'''
from math import log
import operator
import common.arithmetic.tree.treePlotter as tp
def calcShannonEnt(dataSet):
    """Compute the base-2 Shannon entropy of the class labels in a data set.

    Args:
        dataSet: Sequence of rows; the last element of every row is taken
            as that row's class label.

    Returns:
        The entropy as a float. The more evenly the labels are mixed, the
        larger the value; an empty data set yields 0.0.
    """
    total = len(dataSet)
    # Tally how many rows carry each class label.
    tally = {}
    for row in dataSet:
        tag = row[-1]
        tally[tag] = tally.get(tag, 0) + 1
    entropy = 0.0
    for count in tally.values():
        share = float(count) / total
        entropy -= share * log(share, 2)
    return entropy
def splitDataSet(dataSet, axis, value):
    """Select the rows whose feature at `axis` equals `value`, minus that column.

    Args:
        dataSet: List of rows, each row being a list.
        axis: Index of the feature column to test and then drop.
        value: Feature value a row must have at `axis` to be kept.

    Returns:
        A new list of new row lists; each kept row has the element at
        `axis` removed. The input rows are left untouched.
    """
    subset = []
    matching = (row for row in dataSet if row[axis] == value)
    for row in matching:
        # Copy everything before the column, then append everything after it.
        trimmed = row[:axis]
        trimmed.extend(row[axis + 1:])
        subset.append(trimmed)
    return subset
def chooseBestFeatureToSplit(dataSet):
    """Pick the feature column with the highest information gain ratio.

    Args:
        dataSet: Non-empty list of rows; every element but the last is a
            feature value and the last element is the class label.

    Returns:
        The index of the feature with the largest gain ratio, or -1 when no
        feature reaches a gain ratio above zero.
    """
    rowCount = float(len(dataSet))
    featureCount = len(dataSet[0]) - 1
    parentEntropy = calcShannonEnt(dataSet)
    topRatio, topIndex = 0.0, -1
    for col in range(featureCount):
        conditionalEntropy = 0.0
        intrinsicValue = 0.0
        for val in set([row[col] for row in dataSet]):
            part = splitDataSet(dataSet, col, val)
            weight = len(part) / rowCount
            conditionalEntropy += weight * calcShannonEnt(part)
            intrinsicValue += -weight * log(weight, 2)
        if intrinsicValue == 0:
            # The column has a single distinct value: the ratio would divide
            # by zero, and such a column cannot separate anything anyway.
            continue
        ratio = (parentEntropy - conditionalEntropy) / intrinsicValue
        if ratio > topRatio:
            topRatio, topIndex = ratio, col
    return topIndex
def majorityCnt(classList):
    """Return the most frequent class label in `classList`.

    Used when every feature has been consumed but the remaining rows still
    carry different labels: the leaf is then labelled by majority vote.

    Args:
        classList: Non-empty iterable of hashable class labels.

    Returns:
        The label with the highest count. On a tie, the label that appears
        first in `classList` wins.

    Raises:
        ValueError: If `classList` is empty.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    if not classCount:
        raise ValueError("majorityCnt() requires at least one class label")
    # max() returns the first maximal item and dicts iterate in insertion
    # order, so ties resolve to the label encountered first.
    return max(classCount.items(), key=operator.itemgetter(1))[0]
def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dictionaries.

    Args:
        dataSet: Non-empty list of rows; every element but the last is a
            feature value and the last element is the class label.
        labels: Feature names, one per feature column, in column order.
            The sequence is not modified.

    Returns:
        Either a class label (leaf) or a dict of the form
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.
    """
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        # All rows share one class: nothing left to split.
        return classList[0]
    if len(dataSet[0]) == 1:
        # Only the label column remains: fall back to a majority vote.
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    if bestFeat < 0:
        # No feature has a positive gain ratio. Using -1 as a column index
        # would address the class-label column and split on the answer
        # itself, so stop here and label the leaf by majority vote instead.
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    # Build the reduced label list without touching the caller's list.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    branches = {}
    for value in set(example[bestFeat] for example in dataSet):
        branches[value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return {bestFeatLabel: branches}
def classify(inputTree, featLabels, testVec):
    """Classify one sample by walking the decision tree.

    Args:
        inputTree: Tree of the form
            ``{feature_name: {feature_value: subtree_or_label, ...}}``.
        featLabels: Feature names in the column order of `testVec`.
        testVec: The sample; only the positions named through `featLabels`
            are read, so trailing elements such as a true label are ignored.

    Returns:
        The predicted class label, or an empty string when the sample's
        value for a tested feature has no branch in the tree.

    Raises:
        ValueError: If a feature name used by the tree is not in
            `featLabels`.
    """
    firstStr = list(inputTree)[0]
    secondDict = inputTree[firstStr]
    featValue = testVec[featLabels.index(firstStr)]
    for key, child in secondDict.items():
        if featValue == key:
            # isinstance also recognises dict subclasses (OrderedDict,
            # defaultdict, ...) as inner nodes instead of returning them
            # as if they were labels.
            if isinstance(child, dict):
                return classify(child, featLabels, testVec)
            return child
    # No branch matches this feature value.
    return ""
def classifyAll(inputTree, featLabels, testDataSet):
    """Classify every row of a labelled test set and report the accuracy.

    Args:
        inputTree: Decision tree accepted by `classify`.
        featLabels: Feature names in the column order of the test rows.
        testDataSet: List of rows whose last element is the true class
            label.

    Returns:
        A tuple ``(predictions, accuracy_text)``: the predicted label for
        each row in order, and the string "准确率: " followed by the
        fraction of rows predicted correctly. An empty test set reports
        0.0 instead of dividing by zero.
    """
    classLabelAll = [classify(inputTree, featLabels, testVec)
                     for testVec in testDataSet]
    correct = sum(1 for predicted, testVec in zip(classLabelAll, testDataSet)
                  if predicted == testVec[-1])
    n = len(testDataSet)
    accuracy = correct / float(n) if n else 0.0
    accuracyRate = "准确率: " + str(accuracy)
    return classLabelAll, accuracyRate
def storeTree(inputTree, filename):
    """Serialize a decision tree to a file with pickle.

    Args:
        inputTree: The tree (any picklable object) to save.
        filename: Path of the file to write; an existing file is
            overwritten.

    Returns:
        None.
    """
    import pickle
    # The context manager closes the file even if pickling fails midway.
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)
def grabTree(filename):
    """Load a decision tree previously saved with pickle.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    import pickle
    # NOTE: unpickling can execute arbitrary code; only load files that
    # come from a trusted source.
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
def createDataSet(dataset_file):
    """Load the training set from a comma-separated text file.

    Every non-blank line holds the numeric feature values followed by the
    class label as its last field.

    Args:
        dataset_file: Path of the text file to read.

    Returns:
        A tuple ``(dataSet, labels)``. ``dataSet`` is a list of rows, each
        made of the feature values converted to float plus the class label
        kept as a string. ``labels`` is the fixed list of six feature
        names 'call1' through 'call6'.

    Raises:
        ValueError: If a feature field cannot be converted to float.
    """
    dataSet = []
    # Plain text mode already applies universal newlines; the legacy 'U'
    # flag is rejected by Python 3.11 and later.
    with open(dataset_file, 'r') as source:
        for line in source:
            line = line.strip()
            if not line:
                # A blank line would otherwise become a feature-less row.
                continue
            fields = line.split(',')
            dataSet.append(
                [float(field) for field in fields[:-1]] + [fields[-1]])
    labels = ['call1', 'call2', 'call3', 'call4', 'call5', 'call6']
    return dataSet, labels
def createTestSet(dataset_file):
    """Load a labelled test set from a comma-separated text file.

    Every non-blank line holds the numeric feature values followed by the
    class label as its last field.

    Args:
        dataset_file: Path of the text file to read.

    Returns:
        A list of rows, each made of the feature values converted to float
        plus the class label kept as a string.

    Raises:
        ValueError: If a feature field cannot be converted to float.
    """
    dataSet = []
    # Plain text mode already applies universal newlines; the legacy 'U'
    # flag is rejected by Python 3.11 and later.
    with open(dataset_file, 'r') as source:
        for line in source:
            line = line.strip()
            if not line:
                # A blank line would otherwise become a feature-less row.
                continue
            fields = line.split(',')
            dataSet.append(
                [float(field) for field in fields[:-1]] + [fields[-1]])
    return dataSet
def main():
    """Train a tree on the training file, evaluate it on the test file, print both.

    The tree can optionally be persisted with storeTree / reloaded with
    grabTree, or drawn with tp.createPlot; none of that is done here.
    """
    trainRows, featureNames = createDataSet("F:\\train.txt")
    # createTree may modify the label list it is given, so hand it a copy
    # and keep the full list for classification below.
    tree = createTree(trainRows, list(featureNames))
    print('决策树:\n', tree)
    testRows = createTestSet("F:\\test.txt")
    predictions, accuracyText = classifyAll(tree, featureNames, testRows)
    print('分析的结果集:\n', predictions)
    print('分析的准确率:\n', accuracyText)
# Script entry point: run the train/evaluate demo only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    main()