Harbin University of Science and Technology
School of Software and Microelectronics
Lab Report
(2020-2021 Academic Year, Second Semester)
Course: Data Analysis Methods
Class: Software 18-1
Student ID: 1814010130
Name: Zhang Lihui
Harbin University of Science and Technology, School of Software and Microelectronics
| Experiment | Experiment 4: Classification Analysis | Major | Software Engineering | | |
|---|---|---|---|---|---|
| Name | Zhang Lihui | Student ID | 1814010130 | Class | Software 18-1 |
1. Objectives:
Through the experiment, experience the classification process and develop a solid understanding of the ideas behind classification;
Become familiar with the principle, essence, and procedure of decision-tree classification, and master typical decision-tree algorithms and implementation techniques.
Become familiar with the principle, essence, and procedure of Bayesian classification, and master the Bayesian classification algorithm and implementation techniques.
2. Contents:
Using the play or Car Evaluation dataset, design and build a decision tree with the ID3 algorithm, and test the effectiveness of the tree.
Using the Car Evaluation dataset, analyze the data with a Bayesian classifier.
3. Equipment and Software Environment:
Windows 10
Python 3.8
PyCharm 2020.3.5 (Professional Edition)
4. Procedure and Results:
Using the play or Car Evaluation dataset, design and build a decision tree with the ID3 algorithm, and test the effectiveness of the tree.
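For reference, ID3 chooses the split attribute by maximizing information gain. In the notation of the code below, calcShannonEnt, calcConditionalEntropy, and calcInformationGain compute

$$H(Y) = -\sum_k p_k \log_2 p_k, \qquad H(Y \mid X_i) = \sum_v \frac{|D_v|}{|D|}\, H(Y \mid X_i = v), \qquad g(D, X_i) = H(Y) - H(Y \mid X_i),$$

where $p_k$ is the fraction of instances in class $k$ and $D_v$ is the subset of instances with $X_i = v$.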
ID3Tree.py
# -*- coding: utf-8 -*-
import operator
from math import log
import pandas as pd
import numpy as np

def majorityCnt(classList):
    classCount = {}
    for vote in classList:  # count how often each element occurs in classList
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    # sort by dictionary value in descending order
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]  # return the most frequent element in classList
## create the dataset
def createDataSet():
    """
    Create the dataset.
    """
    data = pd.read_excel(r"C:\Users\zlh\Desktop\数据分析方法实验\实验三、四\play.xlsx")
    # data = pd.read_excel(r"C:\Users\zlh\Desktop\数据分析方法实验\实验三、四\Car Evaluation.xlsx")
    train_data = np.array(data)  # np.ndarray
    dataSet = train_data.tolist()  # list
    # print(dataSet)
    featureName = ['outlook', 'temperature', 'humidity', 'windy']
    # featureName = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
    # return the dataset and the name of each feature dimension
    return dataSet, featureName
## split the dataset
def splitDataSet(dataSet, axis, value):
    """
    Split the dataset on a given feature.
    :param axis: dimension of the feature to split on
    :param value: value of that feature
    :return: all instances with that value (the feature column itself is removed)
    """
    retDataSet = []
    # iterate over every row in dataSet
    for featVec in dataSet:
        if featVec[axis] == value:
            reduceFeatVec = featVec[:axis]  # drop this feature column
            reduceFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reduceFeatVec)
    return retDataSet
## compute the information entropy
# this always measures the uncertainty of the class label
def calcShannonEnt(dataSet):
    """
    Compute the Shannon entropy of the class variable Y in the training set.
    :param dataSet:
    :return:
    """
    numEntries = len(dataSet)  # number of instances
    labelCounts = {}
    for featVec in dataSet:  # count the frequency of each label
        currentLabel = featVec[-1]  # the label is in the last column
        # if the label is not yet in labelCounts, add it with count 0
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)  # log base 2
    return shannonEnt
## compute the conditional entropy
def calcConditionalEntropy(dataSet, i, featList, uniqueVals):
    """
    Compute the conditional entropy of Y given x_i.
    :param dataSet: the dataset
    :param i: feature dimension i
    :param featList: list of values of feature i
    :param uniqueVals: set of distinct values of feature i
    :return: the conditional entropy
    """
    ce = 0.0
    for value in uniqueVals:
        subDataSet = splitDataSet(dataSet, i, value)
        prob = len(subDataSet) / float(len(dataSet))  # maximum-likelihood probability estimate
        ce += prob * calcShannonEnt(subDataSet)  # conditional entropy: ∑ p·H(Y|X=xi)
    return ce
## compute the information gain
def calcInformationGain(dataSet, baseEntropy, i):
    """
    Compute the information gain.
    :param dataSet: the dataset
    :param baseEntropy: entropy of Y over the whole dataset
    :param i: feature dimension i
    :return: information gain g(dataSet | X_i) of feature i
    """
    featList = [example[i] for example in dataSet]  # values of the i-th feature
    uniqueVals = set(featList)  # convert to a set so each value appears only once
    newEntropy = calcConditionalEntropy(dataSet, i, featList, uniqueVals)  # conditional entropy
    infoGain = baseEntropy - newEntropy  # information gain = entropy - conditional entropy
    return infoGain
## algorithm skeleton
def chooseBestFeatureToSplitByID3(dataSet):
    """
    Choose the best feature to split the dataset on.
    :param dataSet:
    :return:
    """
    numFeatures = len(dataSet[0]) - 1  # the last column is the class label
    baseEntropy = calcShannonEnt(dataSet)  # entropy of the whole dataset
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):  # iterate over all feature dimensions
        infoGain = calcInformationGain(dataSet, baseEntropy, i)  # information gain of feature i
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature  # dimension of the best feature
def createTree(dataSet, featureName, chooseBestFeatureToSplitFunc=chooseBestFeatureToSplitByID3):
    """
    Build the decision tree.
    :param dataSet: the dataset
    :param featureName: the name of each feature dimension
    :return: the decision tree
    """
    classList = [example[-1] for example in dataSet]  # list of class labels
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # stop splitting once all labels are identical
    if len(dataSet[0]) == 1:  # no features left: return the majority class
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplitFunc(dataSet)  # index of the best feature
    bestFeatLabel = featureName[bestFeat]  # name of the best feature
    myTree = {bestFeatLabel: {}}  # nested dict keyed by the feature name
    del featureName[bestFeat]
    # find the feature values to branch on
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = featureName[:]  # copy the remaining feature names
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
# test building the decision tree
dataSet, featureName = createDataSet()
myTree = createTree(dataSet, featureName)
print(myTree)
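As a quick sanity check of the entropy helpers, the following minimal sketch (made-up toy rows, not data from play.xlsx; run it with the functions above in scope, e.g. appended to ID3Tree.py) verifies two hand-computable cases:

from math import isclose

# Toy rows: two features plus a class label in the last column (illustrative only).
toy = [['sunny', 'hot', 'no'],
       ['sunny', 'mild', 'no'],
       ['rainy', 'hot', 'yes'],
       ['rainy', 'mild', 'yes']]
base = calcShannonEnt(toy)  # labels split 2/2, so the entropy is 1 bit
assert isclose(base, 1.0)
# feature 0 separates the classes perfectly: its gain equals the full entropy
assert isclose(calcInformationGain(toy, base, 0), 1.0)
# feature 1 is independent of the label: its gain is 0
assert isclose(calcInformationGain(toy, base, 1), 0.0)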
treePlotter.py
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt

# text-box and arrow styles
decisionNode = dict(boxstyle="round4", color='#3366FF')  # style of decision nodes
leafNode = dict(boxstyle="circle", color='#FF6633')  # style of leaf nodes
arrow_args = dict(arrowstyle="<-", color='g')  # arrow style

# draw an annotation with an arrow
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="center", ha="center", bbox=nodeType, arrowprops=arrow_args)

# count the leaf nodes
def getNumLeafs(myTree):
    numLeafs = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1
    return numLeafs

# compute the depth of the tree
def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth

# fill in text between a parent and a child node
def plotMidText(cntrPt, parentPt, txtString):
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30)

def plotTree(myTree, parentPt, nodeTxt):
    numLeafs = getNumLeafs(myTree)
    depth = getTreeDepth(myTree)
    firstStr = list(myTree.keys())[0]
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)  # label the edge between parent and child
    plotNode(firstStr, cntrPt, parentPt, decisionNode)  # draw the decision node
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD

def createPlot(inTree):
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()
main.py
# -*- coding: utf-8 -*-
from pylab import *
import treePlotter
from ID3Tree import *
# mpl.rcParams['font.sans-serif'] = ['SimHei']  # set the default font
# mpl.rcParams['axes.unicode_minus'] = False  # keep '-' from rendering as a box when saving figures

# test building the decision tree
myDat, labels = createDataSet()
myTree = createTree(myDat, labels)
# plot the decision tree
treePlotter.createPlot(myTree)
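The tree handed to treePlotter.createPlot is the nested dict produced by createTree: the top-level key is the chosen split feature, each sub-key is one of its values, and the recursion bottoms out at class labels. A tree over the play features would therefore have this general shape (hypothetical values for illustration, not actual program output):

# Hypothetical shape only -- not the output of a real run.
{'outlook': {'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}},
             'overcast': 'yes',
             'rainy': {'windy': {'TRUE': 'no', 'FALSE': 'yes'}}}}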
Results:
play.xlsx: (decision tree built and plotted for the play dataset)
Car Evaluation.xlsx: (decision tree built and plotted for the Car Evaluation dataset)
Using the Car Evaluation dataset, analyze the data with a Bayesian classifier.
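The classifier below follows the standard naive Bayes rule: with class priors $P(y)$ and conditionally independent features, the predicted class maximizes the posterior, and working in log space (as fit and NaiveBayes do) turns the product into a sum:

$$\hat{y} = \arg\max_y P(y)\prod_j P(x_j \mid y) = \arg\max_y \Big[\log P(y) + \sum_j \log P(x_j \mid y)\Big].$$

Both distributions are estimated with add-one (Laplace) smoothing so that values absent from the training data never receive probability zero; cost_NaiveBayes instead returns the class $m$ that minimizes the expected cost $\sum_n P[n]\,cost[m][n]$.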
Naive_bayes.py
import numpy as np

class Naive_bayes:
    '''
    Estimate the prior probabilities and the class-conditional probabilities;
    the parameters are the labels and the features.
    '''
    num = 0
    feature_cat = 0
    label_cat = 0

    def __init__(self):
        pass

    def NaiveBayes(self, Py, Px_y, x):
        featureNum = self.feature_cat
        classNum = self.label_cat
        # array holding the estimated (log-)probability of each label
        P = [0] * classNum
        # estimate the probability of each class separately
        for i in range(classNum):
            # sum accumulates the log-probabilities: training log-transformed the
            # probabilities, so the product over all features becomes a sum, and
            # the class with the largest total wins
            sum = 0
            for j in range(featureNum):
                if x[j] in Px_y[i][j]:
                    sum += Px_y[i][j][x[j]]
            P[i] = sum + Py[i]
        return P.index(max(P))
    def cost_NaiveBayes(self, Py, Px_y, x, cost):
        featureNum = self.feature_cat
        classNum = self.label_cat
        # arrays holding the estimated probability and the expected cost per label
        P = [0] * classNum
        P_ = [0] * classNum
        # estimate the probability of each class separately
        for i in range(classNum):
            # as above: log-transformed probabilities turn the product into a sum
            sum = 0
            for j in range(featureNum):
                if x[j] in Px_y[i][j]:
                    sum += Px_y[i][j][x[j]]
            P[i] = sum + Py[i]
        # choose the class with the smallest expected cost
        for m in range(classNum):
            totall = 0
            for n in range(classNum):
                totall += P[n] * cost[m][n]
            P_[m] = totall
        return P_.index(min(P_))
    # def Naive_test(self, Py, Px_y, test_data, test_label):
    #     # error counter
    #     errorCnt = 0
    #     # iterate over every sample in the test set
    #     for i in range(len(test_data)):
    #         # get the prediction
    #         predict = self.NaiveBayes(Py, Px_y, test_data[i])
    #         # compare with the true label
    #         print("predict", predict)
    #         if predict != test_label[i]:
    #             # count one more error
    #             errorCnt += 1
    #     # return the accuracy
    #     return 1 - (errorCnt / len(test_data))
    def Naive_test(self, Py, Px_y, test_data, test_label):
        # per-class counters
        n1 = 0
        n2 = 0
        n3 = 0
        n0 = 0
        # iterate over every sample in the test set
        for i in range(len(test_data)):
            # get the prediction
            predict = self.NaiveBayes(Py, Px_y, test_data[i])
            # tally the true label of each sample
            if test_label[i] == 0:
                n0 += 1
            elif test_label[i] == 1:
                n1 += 1
            elif test_label[i] == 2:
                n2 += 1
            elif test_label[i] == 3:
                n3 += 1
        # return the proportion of each class in the test set
        return n0 / len(test_data), n1 / len(test_data), n2 / len(test_data), n3 / len(test_data)
    def cost_Naive_test(self, Py, Px_y, test_data, test_label, cost):
        # error counter
        errorCnt = 0
        # iterate over every sample in the test set
        for i in range(len(test_data)):
            # get the cost-sensitive prediction
            predict = self.cost_NaiveBayes(Py, Px_y, test_data[i], cost)
            # compare with the true label
            if predict != test_label[i]:
                # count one more error
                errorCnt += 1
        # return the accuracy
        return 1 - (errorCnt / len(test_data))
    def fit(self, train_data, train_label):
        featureNum = train_data.shape[1]
        self.feature_cat = featureNum
        label = set(train_label)
        self.label_cat = len(label)
        classNum = len(label)
        Py = np.zeros((classNum, 1))
        # estimate the prior distribution
        label_dic = {}
        for i in label:
            # add one to each class count (Laplace smoothing), so a class missing
            # from the training data never gets probability zero; the denominator
            # is enlarged accordingly so it cannot be zero either
            label_dic[i] = (np.sum(train_label == i)) + 1
            Py[int(i)] = label_dic[i] / (len(train_label) + classNum)
        # take logs to guard against numeric underflow
        Py = np.log(Py)
        # nested dicts holding the conditional probabilities for every case
        Px_y = {}
        for i in range(classNum):
            Px_y[i] = {}
            for j in range(featureNum):
                Px_y[i][j] = {}
        for m in range(len(train_label)):
            label = train_label[m]
            x = train_data[m]
            for n in range(featureNum):
                # no conditional probabilities yet: first accumulate raw counts,
                # then derive the probabilities from them in the next step
                if x[n] not in Px_y[label][n]:
                    Px_y[label][n][x[n]] = 1
                else:
                    Px_y[label][n][x[n]] += 1
        for label in range(classNum):
            for z in range(featureNum):
                l = len(Px_y[label][z].keys())
                for key, item in Px_y[label][z].items():
                    # Laplace-smoothed log conditional probability
                    Px_y[label][z][key] = np.log((item + 1) / (label_dic[label] + l))
        # return the prior and conditional distributions
        return Py, Px_y
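A minimal usage sketch of the class on made-up, already integer-encoded arrays (not the Car Evaluation data; it assumes labels are consecutive integers starting at 0, which main.py guarantees via LabelEncoder):

import numpy as np
import Naive_bayes

# Toy data: 6 samples, 2 categorical features, 2 classes (illustrative only).
train_X = np.array([[0, 1], [0, 0], [1, 1], [1, 0], [0, 1], [1, 1]])
train_y = np.array([0, 0, 1, 1, 0, 1])

nb = Naive_bayes.Naive_bayes()
Py, Px_y = nb.fit(train_X, train_y)               # learn log-priors and log-conditionals
print(nb.NaiveBayes(Py, Px_y, [0, 1]))            # most probable class for one sample
print(nb.Naive_test(Py, Px_y, train_X, train_y))  # per-class proportions of the labels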
main.py
import Naive_bayes
from sklearn.model_selection import RepeatedKFold
from sklearn import preprocessing
import pandas as pd
import numpy as np

if __name__ == "__main__":
    df = pd.read_excel(r"C:\Users\zlh\Desktop\数据分析方法实验\实验三、四\test.xlsx")
    raw_set = df.values
    # one encoder per column
    label_encoder = []
    encoded_set = np.empty(raw_set.shape)
    for i, _ in enumerate(raw_set[0]):
        # fit an encoder to the data in each column
        encoder = preprocessing.LabelEncoder()
        encoded_set[:, i] = encoder.fit_transform(raw_set[:, i])
        label_encoder.append(encoder)
    dataset_X = encoded_set[:, :-1].astype(int)
    dataset_y = encoded_set[:, -1].astype(int)
    # split the dataset into train and test folds
    naive_bys = Naive_bayes.Naive_bayes()
    # evaluate on the test folds with the learned prior and conditional distributions
    kf = RepeatedKFold(n_splits=10)
    n0 = 0
    n1 = 0
    n2 = 0
    n3 = 0
    Accuracy0 = 0
    Accuracy1 = 0
    Accuracy2 = 0
    Accuracy3 = 0
    for train_index, test_index in kf.split(dataset_X):
        train_X, train_y = dataset_X[train_index], dataset_y[train_index]
        test_X, test_y = dataset_X[test_index], dataset_y[test_index]
        Py, Px_y = naive_bys.fit(train_X, train_y)
        n0, n1, n2, n3 = naive_bys.Naive_test(Py, Px_y, test_X, test_y)
        Accuracy0 += n0
        Accuracy1 += n1
        Accuracy2 += n2
        Accuracy3 += n3
        # print(naive_bys.Naive_test(Py, Px_y, test_X, test_y))
    print("class \t N\t\t\tN[%]")
    print('acc: \t', np.sum(dataset_y == 0), ' \t%f' % Accuracy0, "%")
    print('good: \t', np.sum(dataset_y == 1), ' \t%f' % Accuracy1, "%")
    print('unacc: \t', np.sum(dataset_y == 2), ' \t%f' % Accuracy2, "%")
    print('v-good:\t', np.sum(dataset_y == 3), ' \t%f' % Accuracy3, "%")
    print("The Word handout contains an error: the dataset has 1576 rows in total, but 1210+384+69+65 =", 1210 + 384 + 69 + 65)
    print("so its N[%] column is not necessarily correct either; my N[%] values sum to", Accuracy0 + Accuracy1 + Accuracy2 + Accuracy3)
    print("which is approximately 100% (the deviation comes from rounding in int/float conversions)")
Results: (in Car Evaluation.xlsx, the values 5more under doors and more under persons trigger int/str type errors, so I replaced 5more with 6 and more with 8)
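Instead of editing the spreadsheet by hand, the same cleanup could be done in code right after the read_excel call in main.py (a sketch, assuming the sheet keeps the original column names doors and persons):

# Hypothetical cleanup step before label encoding: make each column uniformly
# string-typed, then map the problematic values.
df['doors'] = df['doors'].astype(str).replace({'5more': '6'})
df['persons'] = df['persons'].astype(str).replace({'more': '8'})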
5. Summary:
Through this experiment I experienced the classification process and deepened my understanding of the ideas behind classification. I became familiar with the principle, essence, and procedure of decision-tree classification and mastered typical decision-tree algorithms and implementation techniques, and I became familiar with the principle, essence, and procedure of Bayesian classification and mastered the Bayesian classification algorithm and implementation techniques.
Grade:            Instructor:            Date: