還在跑程序褪那,我這渣筆記本CPU估計(jì)要跑很久,不如來貼個(gè)代碼吧式塌。
這個(gè)是之前一篇粗糙集相對屬性約簡python實(shí)現(xiàn)的一個(gè)小小改進(jìn)
當(dāng)中距離函數(shù)使用的是歐幾里得距離公式,入坑沒多久友浸,代碼寫的不是很好峰尝,請見諒。
# _*_coding:utf-8 _*_
#@Time :2018/12/22 上午10:45
#@Author :we2swing
#@FileName: test.py
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
def basic_set(df):
basic = {}
for i in df.drop_duplicates().values.tolist():
basic[str(i)] = []
for j, k in enumerate(df.values.tolist()):
if k == i:
basic[str(i)].append(j)
return basic
def Euclidean(x_data):
n = x_data.shape[1]#代表列數(shù)
m = x_data.shape[0]#代表行數(shù)
num = [0] * n
num2 = np.zeros((m,m))
#轉(zhuǎn)為list
arr = np.array(x_data)
counti,countj,countk,countz = 0,0,0,0
#遍歷每一行的元素
for j in arr:
countk = 0
for k in arr:
if countj != countk:
countz = 0
# 遍歷每一行的每一個(gè)元素
for z in k:
temp = np.square(z - j[countz])
if countz == 0:
num[countz] = temp
else:
#把每一行中每個(gè)元素的差值平方相加
num[countz] = num[countz-1] + temp
countz = countz + 1
else:
num[countz-1] = 0
num2[countj][countk] = round(math.sqrt(num[countz-1]),1)
countk = countk + 1
countj = countj + 1
return num2
def key_basic(num,k):
# k = 4 # 設(shè)δ=k
# k = [4.0,4.1,4.2,4.3,4.4,4.5,4.6,4.7,4.8,4.9,5.0]
num1 = {}
counti, countj = 0, 0
for i in num:
num1[counti] = []
countj = 0
for j in i:
if j <= k:
num1[counti].append(countj)
countj = countj + 1
counti = counti + 1
return num1
def compare(data):
# k = [4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0]
global t1
t1 = 4.0
data = data.dropna(axis=0, how='any')
x_data = data.drop(['judge'], axis=1)
y_data = data.loc[:, 'judge']
n = x_data.shape[1] # 代表列數(shù)
m = x_data.shape[0] # 代表行數(shù)
# 決策屬性基本集
y_basic_set = sorted([v for k, v in basic_set(y_data).items()])
num = Euclidean(x_data)
# print(num)
x_basic_set = [v for k, v in key_basic(num,t1).items()]
# γC(D)
pos = []
for i in x_basic_set:
for j in y_basic_set:
if set(i).issubset(j):
pos.append(i)
# pos.sort()#排序
r_x_y = len(pos) / len(data) # 也許可以寫一個(gè)card函數(shù)
print('依賴度r_x_(y):', r_x_y)
# 計(jì)算每一個(gè)γ(C-a)(D)
# 獲取條件屬性個(gè)數(shù)為總下標(biāo)存入集合
# 收集屬性重要性
imp_attr = []
columns_num = list(range(len(x_data.columns)))
for i in columns_num:
c = columns_num.copy()
c.remove(i)
u = data.iloc[:, c]
num = Euclidean(u)
u = sorted([v for k, v in key_basic(num,t1).items()])
# γC(D)
pos_va = []
for k in u:
for j in y_basic_set:
if set(k).issubset(j):
pos_va.append(k)
r_x_y_2 = len(pos_va) / len(data)
r_diff = round((r_x_y - r_x_y_2), 4)
imp_attr.append(r_diff)
dict_imp = {}
for o, p in enumerate(imp_attr):
dict_imp[data.columns[o]] = p
result = dict_imp
# print(imp_attr)
return result
def sepdata(data):
# 獲取數(shù)據(jù)長度
len = data.iloc[:, 0].size
# 將數(shù)據(jù)劃分
if len % 100 != 0:
if len > 100:
num = len // 100 + 1
else:
num = 1
else:
if len > 100:
num = int(len / 100)
else:
num = 1
arr = [[]] * num
count = 0
for i in arr:
# 如果數(shù)少于100或者最后一部分?jǐn)?shù)少于100收恢,則放入一個(gè)由數(shù)長決定的數(shù)組
if num == 1:
arr[count] = data.iloc[0:len] # 取100開始武学,取
elif count == num - 1:
arr[count] = data.iloc[100 * count:len]
else:
arr[count] = data.iloc[100 * count:(count + 1) * 100]
count = count + 1
sorted_dict_imp = [[]] * num
total = [0] * 27
title = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16',
'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'C27']
# total = [0] * 16
# title = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16']
print(sorted_dict_imp)
count = 0
for i in arr:
print('-------------------------------------第%d個(gè)數(shù)據(jù)集數(shù)據(jù)-----------------------------------------' % (count + 1))
sorted_dict_imp[count] = compare(i)
count = count + 1
count1 = 0
# 將dict的key為C1-Cn的value存入total中保存,并且相加
for i in sorted_dict_imp:
count = 0
if count1 == 0:
for j in title:
total[count] = i.get(j)
count = count + 1
else:
for z in title:
total[count] = i.get(z) + total[count]
count = count + 1
count1 = count1 + 1
# 輸出最終C1-Cn的結(jié)果
count = 0
for i in title:
print(i, ':', round(total[count], 4))
count = count + 1
figure(total)
def figure(opt):
count = 0
plt.rcParams['font.sans-serif'] = ['SimHei'] # 用來正常顯示中文標(biāo)簽
title = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16',
'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'C27']
plt.scatter(title, opt, c='b', s=7)
plt.plot(title, opt, linewidth=3, alpha=0.7)
plt.show()
def main():
data = pd.read_csv(filepath_or_buffer='/Users/we2swing/Desktop/111.csv')
sepdata(data)
if __name__ == '__main__':
main()
里面很多代碼延續(xù)的是之前上面?zhèn)魉烷T里文章的代碼。
覺得有用的話點(diǎn)個(gè)贊
測試數(shù)據(jù):
鏈接:https://pan.baidu.com/s/1dtle6A6od8bOCwDPW1lZ2w
提取碼:o2qf