大創(chuàng)項目,基于粗糙集理論的高校學生實踐能力培養(yǎng)研究颜武。進行了一段時間,有了點小成果最易,再此總結(jié)一下腕窥。
首先了解一下什么是粗糙集理論粒没,它是處理不確定信息的一種方法〈乇可以從不完備的信息中得出現(xiàn)有的規(guī)律革娄,并從中提取出一些規(guī)則。
百度百科:https://baike.baidu.com/item/%E7%B2%97%E7%B3%99%E9%9B%86%E7%90%86%E8%AE%BA/4047224
https://baike.baidu.com/item/%E7%B2%97%E7%B3%99%E9%9B%86/8248139?fr=aladdin
其他更詳細的參考論文冕碟,我們選擇的是廈門大學黃麗萍寫的論文《基于粗糙集的屬性約簡與規(guī)則提取》。
其中我們的屬性約簡算法就是參考她文中的改進的CEBARKCC算法匆浙。
首先描述一下我們的項目流程:
1.設計一個可以體現(xiàn)大學生實踐能力的問卷
2.對問卷進行處理安寺,即將數(shù)據(jù)都轉(zhuǎn)換為方便處理的離散型數(shù)據(jù)
3.對數(shù)據(jù)進行屬性約簡,去掉無用的屬性
4.對約簡后的數(shù)據(jù)進行規(guī)則提取
5.分析提取出的規(guī)則
本次主要描述我們的屬性約簡算法
因為需要處理的數(shù)據(jù)并不是非常巨量首尼,所以采用了基于信息熵的屬性約簡算法挑庶。
信息熵簡介:https://baike.baidu.com/item/%E4%BF%A1%E6%81%AF%E7%86%B5/7302318?fr=aladdin
我們將要處理的信息假如是這樣:
病人 頭痛 胸口痛 體溫 流感
a1 是 是 正常 否
a2 是 是 高 是
a3 是 是 很高 是
a4 否 是 正常 否
a5 否 否 高 否
a6 否 是 很高 是
a7 否 否 高 是
a8 否 是 很高 否
信息熵的主要公式:
其中
U為論域,即我們數(shù)據(jù)的 a1 到 a8
Xi 和 Yj 是屬性集合软能,是針對某一屬性來說的迎捺,舉個例子,“頭痛”這個屬性可以分為
X1集合{a1,a2,a3}
X2集合{a4,a5,a6,a7,a8}
所以P(X1)= 3/8
頭痛的信息熵就為:H(P)= 3/8 * log(3/8) + 5/8 * log(5/8)
代表頭痛的不確定度查排。
由信息熵延伸一下凳枝,得到條件信息熵:
我們的算法思路很簡單,從全集開始跋核,依次去掉屬性岖瑰。如果這個屬性的缺失導致決策屬性(流感)的信息熵變大了,說明這個屬性對與是否流感的判斷有益砂代,是核屬性蹋订!
得到核屬性集合只是第一步,下面運用黃麗萍的改進CEBARKCC算法得到約簡:
算法的整體思路:
計算原本的數(shù)據(jù)中刻伊,條件屬性集合對于決策屬性的條件信息熵作為算法的終止值露戒,
每次對所有的屬性進行重要度的計算,如果這個屬性對于當前條件集合來說重要度為0捶箱,則可以刪除智什。然后挑選最重要的屬性加入當前條件集合,進行下一輪循環(huán)讼呢。直到當前條件集合的條件信息熵與原本數(shù)據(jù)的條件信息熵相等撩鹿。說明當前集合可以代替原本的集合。算法結(jié)束悦屏。
代碼:
#include <windows.h>
#include <iostream>
#include <string>
#include <stdio.h>
#include <tchar.h>
#include <sstream>
#include<fstream>
#include <vector>
using namespace std;
class Tuple
{
public:
Tuple();
~Tuple();
string list[50];
int PropertyNum;//屬性個數(shù)
int LineNum;//元組個數(shù)
void outline(int a);//輸出一行,a指輸出元素個數(shù)
private:
};
void Tuple::outline(int a) {//輸出一行
cout << list[0];
for (int i = 1; i <= a; i++) {
cout << "\t" << list[i];
}
cout << endl;
}
Tuple::Tuple()
{
}
Tuple::~Tuple()
{
}
bool init(Tuple a[200])//初始化,定義讀取的文件名
{
LPCWSTR file = _T("輸出.txt");
DeleteFile(file);//刪除現(xiàn)有的舊文件
FILE *fp1 = NULL;
FILE *fp2 = NULL;
fp1 = fopen("實驗.txt", "r");
fp2 = fopen("輸出.txt", "w");
if (fp1 == NULL || fp2 == NULL) {//判斷文件是否存在
cout << "文件打開失斀诼佟键思!" << endl;
return false;
}
string st = "";
string out;
int i = 0;
int j = 0;
int k = 0;
char s = NULL;
s = fgetc(fp1);//讀取第一個字符到s
while (!feof(fp1)) {
char out1[2] = { s,0 };
out = out1;//字符轉(zhuǎn)換為字符串
cout << out;//打印字符
//printf("%d ",s);
if (s == 10) {
i++;
j = 0;
st = "";
}
else if (s == 9) {
j++;
st = "";
if (j >= k)
k = j;
}
else
st = st + out;
a[i].list[j] = st;//字符串輸入元組中
fputc(s, fp2);//寫入輸出文件
s = fgetc(fp1);//取下一個字符
//fprintf(fp2,"%c",s);
}
fclose(fp1);
fclose(fp2);
for (j = 0; j <= i; j++) {
a[j].LineNum = i - 1;//元組個數(shù)
a[j].PropertyNum = k;//屬性個數(shù)
}
cout << "初始化成功!" << endl;
return true;
}
string c2s(char a) {//單個字符轉(zhuǎn)字符串
char b[2] = {a,0};
string s = b;
return s;
}
string int2s(int a) {//整數(shù)轉(zhuǎn)字符串
string s;
stringstream sstr;
sstr << a;
sstr >> s;
return s;
}
int s2int(string s) {//字符串轉(zhuǎn)整數(shù)
int a;
stringstream sstr;
sstr << s;
sstr >> a;
return a;
}
float Entropymath(string a[],int b) {//信息熵的計算
float s = 0;
float k = 0;
b = (float)b;
for (int i = 1; i <= b; i++) {
if (a[i] == "&&")
continue;
k = 1;
for (int j = i+1; j <= b; j++) {
if (a[i] == a[j]) {
k++;
a[j] = "&&";
}
}
//cout << k << endl;
s = s - k*log2(k / b) / b;
}
return s;
}
float Entropy(Tuple a[], string str2 = "all") {//計算決策屬性的信息熵和關(guān)于其他屬性的條件熵
string b[200];
string c[200];
float x = 0;
if (str2 == "all") {//非條件信息熵
for (int i = 1; i <= a[0].LineNum; i++) {
b[i] = a[i].list[a[i].PropertyNum];
}
x = Entropymath(b, a[0].LineNum);
return x;
}
string s = "";//條件信息熵
for (int i = 0; i <= str2.length(); i++) {
if (str2[i] == ','|| str2[i] == 0) {//函數(shù)傳遞進來的參數(shù)是屬性之間用“甫贯,”隔開
int k = 1; //分析條件集合拼接屬性值吼鳞,便于比較
for (; k <= a[0].PropertyNum; k++) {
if (s == a[0].list[k])
break;
}
for (int j = 1; j <= a[0].LineNum; j++) {
c[j] = c[j] + a[j].list[k];
}
s = "";
continue;
}
s = s + c2s(str2[i]);
}
for (int i = 1; i <= a[0].LineNum; i++) {//比較條件集合的值,計算條件信息熵
if (c[i] == "&&")
continue;
int k = 1;
b[1] = a[i].list[a[0].PropertyNum];
for (int j = i + 1; j <= a[0].LineNum; j++) {
if (c[i] == c[j]) {
k++;
b[k] = a[j].list[a[0].PropertyNum];
c[j] = "&&";
}
}
x = x + float(k)*Entropymath(b, k)/a[0].LineNum;
}
return x;
}
string CORE(Tuple a[]) {//求核叫搁,如果去掉屬性集中的某一個屬性赔桌,結(jié)果對于D關(guān)于C的條件信息熵變大,則是核屬性
string core = "";
string c = "";
for (int i = 1; i < a[0].PropertyNum; i++) {
c = c + a[0].list[i] + ",";
}
float hdc = Entropy(a,c);//D關(guān)于C的條件信息熵
for (int i = 1; i < a[0].PropertyNum; i++) {
for (int j = 1; j < a[0].PropertyNum; j++) {
if (i == j)
continue;
c = c + a[0].list[j] + ",";
}
float hdca = Entropy(a, c);//D關(guān)于C-a的條件信息熵
if (hdc < hdca) {
core = core + a[0].list[i] + ",";
}
}
return core;
}
float SGF(Tuple a[],string s,string p) {//屬性s關(guān)于p對d的屬性依賴度
float hdp = Entropy(a, p);
p = s + "," + p;
float sgf = Entropy(a, p);
sgf = hdp - sgf;
return sgf;
}
vector<string> split(const string& str, const string& delim) {
vector<string> res;
if ("" == str) return res;
//先將要切割的字符串從string類型轉(zhuǎn)換為char*類型
char * strs = new char[str.length() + 1];
strcpy(strs, str.c_str());
char * d = new char[delim.length() + 1];
strcpy(d, delim.c_str());
char *p = strtok(strs, d);
while (p) {
string s = p; //分割得到的字符串轉(zhuǎn)換為string類型
res.push_back(s); //存入結(jié)果數(shù)組
p = strtok(NULL, d);
}
return res;
}
string addto(string s, string p) {//向集合s中添加元素p
std::vector<string> res = split(s, ",");
for (int i = 0; i < res.size(); ++i) {
if (res[i] == p)
p = "";
}
if (p == "")
return s;
s = p + "," + s;
return s;
}
string remove(string s,string p) {//從集合s中刪去元素p
string y = "";
std::vector<string> res = split(s, ",");
for (int i = 0; i < res.size(); ++i) {
if (res[i] == p)
continue;
y = y + res[i] + ",";
}
return y;
}
string buji(string u,string s) {//全集為u渴逻,s為其子集疾党,計算u-s
string b = "";
std::vector<string> resu = split(u, ",");
std::vector<string> ress = split(s, ",");
for (int i = 0; i < ress.size(); ++i)
for (int j = 0; j < resu.size(); ++j) {
if (resu[j] == ress[i])
resu[j] = "&&";
}
for (int j = 0; j < resu.size(); ++j) {
if (resu[j] == "&&")
continue;
b = b + resu[j] + ",";
}
return b;
}
string reduce(Tuple a[]) {
string c = "";
for (int i = 1; i < a[0].PropertyNum; i++) {
c = c + a[0].list[i] + ",";
}
string core = CORE(a);
string p = core;
string b = buji(c,core);
std::ofstream openfile("輸出.txt", std::ios::app);
while (Entropy(a,c)!= Entropy(a,p))
{
openfile <<"\n" << "H(D|C) = " << Entropy(a, c) << "\nH(D|P) = " << Entropy(a, p) << endl;
cout << "\n";
cout << "H(D|C) = " << Entropy(a, c) << "\nH(D|P) = " << Entropy(a, p) << endl;
std::vector<string> resb = split(b, ",");
int max = 0;
float maxsgf = 0;
for (int i = 0; i < resb.size(); ++i) {
float sgf = SGF(a, resb[i], p);
openfile<< "SGF(" << resb[i] << ",P,D) = H(D|" << p << ") - H(D|" << addto(p, resb[i]) << ") = " << sgf << endl;
cout << "SGF(" << resb[i] << ",P,D) = H(D|" << p << ") - H(D|" << addto(p, resb[i]) << ") = " << sgf << endl;
if (sgf == 0)
b = remove(b, resb[i]);
if (sgf>maxsgf) {
maxsgf = sgf;
max = i;
}
}
p = addto(p, resb[max]);
b = remove(b, resb[max]);
cout << "\n\n";
openfile << "\n\n";
}
openfile << "約簡結(jié)果為:" << p << endl;
openfile << "CORE :" << CORE(a) << endl;
openfile.close();
return p;
}
void main()
{
Tuple a[200];
init(a);
//a[6].outline(a[6].PropertyNum);
//cout << buji("a1,a2,a5,a4,a6", "a1,a4") << endl;
//cout << remove("a1,a2,a5,a4,a3,", "a5") << endl;
//cout << addto("a1,a2,a6,a4,a3,", "a5") << endl;
//cout << SGF(a, "a4", "") << endl;
cout <<"約簡結(jié)果為:"<< reduce(a) << endl;
cout << "CORE :"<<CORE(a) << endl;
//cout << "\n" << a[4].list[5] << endl;
//cout << Entropy(a,"")<<endl;
getchar();
}
運行結(jié)果實例: