哈尔滨理工大学
软件与微电子学院
实 验 报 告
(2020-2021第二学期)
课程名称: | 数据分析方法 |
班 级: | 软件18- 1 班 |
学 号: | 1814010130 |
姓 名: | 张立辉 |
哈尔滨理工大学软件与微电子学院
实验名称: | 实验一 数据预处理 | 专 业 | 软件工程 | |||
---|---|---|---|---|---|---|
姓 名 | 张立辉 | 学 号 | 1814010130 | 班 级 | 软件18-1 |
一、实验目的:
掌握数据集成、变换、归约等预处理技术方法;
掌握复杂数据对象的相似性度量方法
二、实验内容:
1.基础数据预处理技术
集成Wine1和Wine2两个数据集,形成新数据集Wine
将Wine中的Alcohol 和Proline两个属性标准化处理,映射到[0,1]区间
根据Alcohol(%)和 Weight(g)两个属性计算每份样本中的酒精质量
2.复杂数据对象的相似性度量
创建数据集,数据集中包含10个数据对象,每个数据对象由一个二值型属性和两个五维向量组成,如下所示
属性A的权重为0.3,属性B的权重为0.2,属性C的权重为0.5
属性A的不同属性值间由简单匹配系数度量相似性
属性B的不同属性值间由欧式距离度量距离
属性C的不同属性值间由余弦距离度量相似性
计算不同数据对象间相似性,以矩阵形式输出计算结果
(相似性与距离转换函数:s=1/(1+d))
三、实验设备及软件环境:
Windows10
Python3.8
PyCharm 2020.3.5 (Professional Edition)
四、实验过程及结果:
Shiyan1.1.py
import xlrd
import xlwt
def read():
    """Load both wine workbooks and extract the columns used downstream.

    Opens ``wine1.xlsx`` and ``wine2.xlsx`` from the hard-coded folder and
    takes the first sheet of each workbook.

    Returns:
        A 5-tuple ``(table1, table2, Alcohol, Weight, Proline)``: the two
        sheet objects, columns 1 and 5 of the first sheet, and column 8 of
        the second sheet. Each column list has its first cell removed (the
        first row is treated as a header row).
    """
    def first_sheet(path):
        # Look the sheet up by name, using the first name in the workbook.
        book = xlrd.open_workbook(path)
        return book.sheet_by_name(book.sheet_names()[0])

    def column_without_first_cell(sheet, index):
        values = sheet.col_values(index)
        values.pop(0)  # drop the header cell
        return values

    table1 = first_sheet(r'C:\Users\zlh\Desktop\数据分析方法实验\实验一、二\实验一数据\wine1.xlsx')
    table2 = first_sheet(r'C:\Users\zlh\Desktop\数据分析方法实验\实验一、二\实验一数据\wine2.xlsx')

    alcohol = column_without_first_cell(table1, 1)
    weight = column_without_first_cell(table1, 5)
    proline = column_without_first_cell(table2, 8)
    return table1, table2, alcohol, weight, proline
def write(table1, table2, Alcohol, Weight, Proline, p, a):
    """Write the merged Wine sheet plus three derived columns to ``Wine.xls``.

    Output layout (sheet ``Sheet1``):
      * every column of ``table1``;
      * every column of ``table2`` except column 0 (presumably a key shared
        with ``table1`` -- verify against the data);
      * ``p``, ``a`` and the alcohol-mass column, placed directly after the
        merged columns.

    Args:
        table1: Sheet object exposing ``nrows``, ``ncols`` and
            ``cell(row, col).value``.
        table2: Second sheet object with the same interface.
        Alcohol: Alcohol percentage per sample, without the header cell.
        Weight: Sample weight in grams per sample, without the header cell.
        Proline: Unused here; kept so existing callers keep working.
        p: Column to write first (header label followed by values).
        a: Column to write second (header label followed by values).

    Raises:
        IndexError: If ``p``, ``a`` or the alcohol-mass column is shorter
            than ``table2.nrows``.
    """
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('Sheet1')

    for col in range(table1.ncols):
        for row in range(table1.nrows):
            worksheet.write(row, col, label=table1.cell(row, col).value)

    # table2's columns are appended right after table1's, skipping column 0.
    shift = table1.ncols - 1
    for col in range(1, table2.ncols):
        for row in range(table2.nrows):
            worksheet.write(row, col + shift, label=table2.cell(row, col).value)

    # Alcohol mass (g) = sample weight (g) * alcohol content (%) / 100.
    # The header label "Quality" is kept unchanged for compatibility.
    Quality = ["Quality"]
    Quality.extend(
        percent * grams / 100 for percent, grams in zip(Alcohol, Weight)
    )

    # Derive the first free column instead of hard-coding 14, so the derived
    # columns never collide with merged data when the sheets are wider.
    first_free = table1.ncols + max(table2.ncols - 1, 0)
    for row in range(table2.nrows):
        worksheet.write(row, first_free, label=p[row])
        worksheet.write(row, first_free + 1, label=a[row])
        worksheet.write(row, first_free + 2, label=Quality[row])

    workbook.save('Wine.xls')
def _min_max_scale(values):
    """Return ``values`` linearly rescaled so min -> 0 and max -> 1."""
    if not values:
        return []
    low = min(values)
    span = max(values) - low
    if span == 0:
        # A constant column has no spread; map every value to 0.0 instead of
        # dividing by zero.
        return [0.0 for _ in values]
    return [(value - low) / span for value in values]


def Standardization(Alcohol, Proline):
    """Min-max normalise the Alcohol and Proline columns into [0, 1].

    Args:
        Alcohol: Numeric alcohol values (no header cell).
        Proline: Numeric proline values (no header cell).

    Returns:
        ``(Proline_Standardization, Alcohol_Standardization)`` -- note that
        the Proline list comes first. Each list starts with its header label
        (``'Proline_Standardization'`` / ``'Alcohol_Standardization'``)
        followed by the scaled values. An empty input yields a list holding
        only the label; a constant column is scaled to all ``0.0``.
    """
    Alcohol_Standardization = ['Alcohol_Standardization']
    Alcohol_Standardization.extend(_min_max_scale(Alcohol))

    Proline_Standardization = ['Proline_Standardization']
    Proline_Standardization.extend(_min_max_scale(Proline))

    return Proline_Standardization, Alcohol_Standardization
if __name__ == "__main__":
    # read() yields (table1, table2, Alcohol, Weight, Proline);
    # Standardization() yields (p, a), which are write()'s last two arguments.
    loaded = read()
    scaled = Standardization(loaded[2], loaded[4])
    write(*loaded, *scaled)
Shiyan1.2.py
import math
from random import randrange, random
# Random test data set: 10 objects, each described by three 6-element rows.
#   A -- binary values (0 or 1, each chosen with probability 0.5)
#   B -- integers drawn from 0..9
#   C -- integers drawn from 0..9
# The RNG is consumed in the order A, B, C for every element.
A, B, C = [], [], []
for _row in range(10):
    bits, b_values, c_values = [], [], []
    for _col in range(6):
        bits.append(0 if random() < 0.5 else 1)
        b_values.append(randrange(0, 10))
        c_values.append(randrange(0, 10))
    A.append(bits)
    B.append(b_values)
    C.append(c_values)
def Smc(f00, f01, f10, f11):
    """Compute the simple matching coefficient for every pair of objects.

    ``SMC = (f11 + f00) / (f01 + f10 + f11 + f00)`` is evaluated cell by
    cell. The result has the same shape as ``f00``; the size is taken from
    the input rather than assumed to be 10 x 10.

    Args:
        f00: Matrix of counts where both objects have value 0.
        f01: Matrix of counts where the first object is 0 and the second 1.
        f10: Matrix of counts where the first object is 1 and the second 0.
        f11: Matrix of counts where both objects have value 1.

    Returns:
        A new matrix (list of lists) of SMC values.

    Raises:
        ZeroDivisionError: If all four counts of a cell are zero.
    """
    result = []
    for r, f00_row in enumerate(f00):
        row = []
        for c, both_zero in enumerate(f00_row):
            matches = f11[r][c] + both_zero
            total = f01[r][c] + f10[r][c] + f11[r][c] + both_zero
            row.append(matches / total)
        result.append(row)
    return result
def distB(vectors=None):
    """Return pairwise similarities derived from Euclidean distance.

    For every pair of rows the Euclidean distance ``d`` is computed and
    converted to a similarity with ``s = 1 / (1 + d)``.

    Args:
        vectors: Optional list of equal-length numeric rows. Defaults to the
            module-level data set ``B``, which is what existing callers use.
            Rows of unequal length are compared over the shorter length.

    Returns:
        A square matrix (list of lists) where entry ``[n][m]`` is the
        similarity between row ``n`` and row ``m``; the diagonal is 1.0.
    """
    if vectors is None:
        vectors = B

    similarities = []
    for left in vectors:
        row = []
        for right in vectors:
            squared = sum((x - y) * (x - y) for x, y in zip(left, right))
            row.append(1 / (1 + math.sqrt(squared)))
        similarities.append(row)
    return similarities
def smcC(vectors=None):
    """Return the pairwise cosine similarity matrix of the given rows.

    Entry ``[n][m]`` is ``dot(v_n, v_m) / (|v_n| * |v_m|)``.

    Args:
        vectors: Optional list of equal-length numeric rows. Defaults to the
            module-level data set ``C``, which is what existing callers use.

    Returns:
        A square matrix (list of lists) of cosine similarities. If either
        row of a pair is all zeros the cosine is undefined; that entry is
        reported as 0.0 instead of raising ``ZeroDivisionError``.
    """
    if vectors is None:
        vectors = C

    # Each row's norm is needed for every pair, so compute it once up front.
    norms = [math.sqrt(sum(v * v for v in vec)) for vec in vectors]

    result = []
    for left, left_norm in zip(vectors, norms):
        row = []
        for right, right_norm in zip(vectors, norms):
            denominator = left_norm * right_norm
            if denominator == 0:
                row.append(0.0)
                continue
            dot = sum(x * y for x, y in zip(left, right))
            row.append(dot / denominator)
        result.append(row)
    return result
def Sim(a, b, c, wa, wb, wc):
    """Combine three similarity matrices into one weighted sum.

    Entry ``[n][i]`` of the result is
    ``wa * a[n][i] + wb * b[n][i] + wc * c[n][i]``. The result has the shape
    of ``a``; the size is taken from the input rather than assumed to be
    10 x 10.

    Args:
        a: First similarity matrix.
        b: Second similarity matrix.
        c: Third similarity matrix.
        wa: Weight applied to ``a``.
        wb: Weight applied to ``b``.
        wc: Weight applied to ``c``.

    Returns:
        A new matrix (list of lists) of weighted similarities.
    """
    return [
        [
            wa * a_value + wb * b[n][i] + wc * c[n][i]
            for i, a_value in enumerate(a_row)
        ]
        for n, a_row in enumerate(a)
    ]
def f(data=None):
    """Count the four binary agreement/disagreement cases for every pair.

    For rows ``n`` and ``j`` each position is classified as:
      * f00 -- both values are 0
      * f01 -- row ``n`` is 0 and row ``j`` is 1
      * f10 -- row ``n`` is 1 and row ``j`` is 0
      * f11 -- both values are 1

    The conditions use ``and`` rather than ``&``: ``&`` binds tighter than
    ``==``, so an expression such as ``(x == y) & x == 0`` does not test
    "equal and zero".

    Args:
        data: Optional list of equal-length rows of 0/1 values. Defaults to
            the module-level data set ``A``, which is what existing callers
            use.

    Returns:
        ``(f00, f01, f10, f11)``: four square count matrices, one row and
        column per input row.
    """
    if data is None:
        data = A

    count = len(data)
    f00 = [[0] * count for _ in range(count)]
    f01 = [[0] * count for _ in range(count)]
    f10 = [[0] * count for _ in range(count)]
    f11 = [[0] * count for _ in range(count)]

    for n, left in enumerate(data):
        for j, right in enumerate(data):
            for x, y in zip(left, right):
                if x == 0 and y == 0:
                    f00[n][j] += 1
                elif x == 0 and y == 1:
                    f01[n][j] += 1
                elif x == 1 and y == 0:
                    f10[n][j] += 1
                elif x == 1 and y == 1:
                    f11[n][j] += 1
    return f00, f01, f10, f11
def out(e):
    """Print the top-left 10 x 10 part of matrix ``e`` to standard output.

    Each value is formatted as a left-aligned fixed-point number padded to
    10 characters and followed by one space; every row ends with a newline.
    """
    for row_index in range(10):
        cells = ['%-10f' % (e[row_index][col_index]) for col_index in range(10)]
        # Joining with " " and ending with " \n" reproduces "cell + space"
        # for every cell followed by the line break.
        print(" ".join(cells), end=" \n")
if __name__ == "__main__":
    def _show(title, matrix):
        # Print the heading, dump the matrix, and hand it back for reuse.
        print(title)
        out(matrix)
        return matrix

    # Each matrix is computed first, then printed under its heading.
    sA = _show("简单匹配系数度量相似性", Smc(*f()))
    sB = _show("欧式距离度量距离", distB())
    sC = _show("余弦距离度量相似性", smcC())
    # Weights: attribute A 0.3, attribute B 0.2, attribute C 0.5.
    sim = _show("相似性", Sim(sA, sB, sC, 0.3, 0.2, 0.5))
运行结果:
实验1.1
实验1.2
五、总结:
通过本次实验,掌握了数据集成、变换、归约等预处理技术方法,以及复杂数据对象的相似性度量方法。
实验成绩: 指导教师: 年 月 日