Recently the China Computer Federation (CCF) and Siemens hosted a competition on the DataFountain platform, "Prediction of the Quality Conformance Rate of Typical Workpieces in Discrete Manufacturing", so I gave it a try.
1. Data Cleaning
The maximum and minimum values of every feature in the dataset differ enormously, and very large values are far from rare. I made a first pass at the outliers with normal-distribution-based detection and replaced the flagged values with the feature mean. Even after replacement, however, quite a few outliers remained.
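A minimal sketch of this step, assuming the common 3-sigma variant of the test; the file name and label-in-last-column layout here are placeholders:

import pandas as pd

df = pd.read_csv('train.csv')  # placeholder file name
for col in df.columns[:-1]:    # assume the last column is the class label
    mu, sigma = df[col].mean(), df[col].std()
    outliers = (df[col] - mu).abs() > 3 * sigma  # 3-sigma rule
    df.loc[outliers, col] = mu                   # replace outliers with the feature mean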
2. Building the Models
In fact, one could go further: construct hand-crafted features by combining columns with addition, subtraction, multiplication, and division, and fuse several models to push the accuracy higher. Here I only built a baseline. Without any parameter tuning, I ran SVM, MLP, CNN, LightGBM, and XGBoost in turn; the last one reached roughly 50% accuracy, while the others all landed between 41% and 45%.
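As an aside, a minimal sketch of such pairwise arithmetic features (the helper name is hypothetical, and this is not part of the baseline below):

import numpy as np
import pandas as pd
from itertools import combinations

def add_pairwise_features(df, cols):
    # For each pair of numeric columns, add sum, difference, product,
    # and a division-safe ratio as candidate features.
    out = df.copy()
    for a, b in combinations(cols, 2):
        out[a + '_plus_' + b] = df[a] + df[b]
        out[a + '_minus_' + b] = df[a] - df[b]
        out[a + '_times_' + b] = df[a] * df[b]
        out[a + '_div_' + b] = df[a] / df[b].replace(0, np.nan)
    return out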
2.1 SVM (this one was run in MATLAB)
Data=csvread('Train_AfterQinXi.csv');
BiLi=0.1;  % test-set ratio. Note: the last column must be the class label, with rows sorted by it
[m,n]=size(Data); % the last column is the class label; sort the rows (e.g. in Excel) beforehand
Testnum=zeros(1,max(Data(:,n))+1);
Speicesnum=Testnum;
kkk=1;
sum0=0;
BJS=Data(1,n);
for j=1:m
    if Data(j,n)==BJS
      sum0=sum0+1;
    else
      Speicesnum(kkk)=sum0;
      Testnum(kkk)=floor(BiLi*sum0);kkk=kkk+1;
      sum0=1;BJS=Data(j,n);
    end
end
Testnum(1,end)=floor(BiLi*sum0);
Speicesnum(1,end)=sum0;
for j=1:length(Testnum)
    if Testnum(j)==0
      Testnum(j)=1;
    end
end
%Speicesnum now holds the sample count of each class
Train_Feature=[];
Train_Label=[];
Test_Feature=[];
Test_Label=[];
for j=1:max(Data(:,n))+1
    if j==1
      Kaishi=1;
    else
      Kaishi=sum(Speicesnum(1,1:j-1))+1;
    end
    JieSu1=sum(Speicesnum(1,1:j))-Testnum(j);
    JieSu2=sum(Speicesnum(1,1:j));
    Train_Feature=[Train_Feature;Data(Kaishi:JieSu1,1:n-1)];
    Train_Label=[Train_Label;Data(Kaishi:JieSu1,n)];
    Test_Feature=[Test_Feature;Data(JieSu1+1:JieSu2,1:n-1)];
    Test_Label=[Test_Label;Data(JieSu1+1:JieSu2,n)];
end
%preprocessing: normalize the training and test features to the [0,1] interval
[mtrain,ntrain] = size(Train_Feature);
[mtest,ntest] = size(Test_Feature);
dataset = [Train_Feature;Test_Feature];
[dataset_scale,ps] = mapminmax(dataset',0,1);
dataset_scale = dataset_scale';
Train_Feature = dataset_scale(1:mtrain,:);
Test_Feature = dataset_scale( (mtrain+1):(mtrain+mtest),: );
%SVM training and prediction
model = fitcecoc(Train_Feature,Train_Label);
[predict_label] =predict(model,Test_Feature);
accuracy=0;
for j=1:length(Test_Label)
    if Test_Label(j)==predict_label(j)
      accuracy=accuracy+1;
    end
end
accuracy=accuracy/length(Test_Label)
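The bookkeeping above is simply a manual per-class (stratified) train/test split; for reference, a Python equivalent under the same assumptions (fully numeric CSV, label in the last column, 10% of each class held out):

import numpy as np
from sklearn.model_selection import train_test_split

data = np.loadtxt('Train_AfterQinXi.csv', delimiter=',')
X, y = data[:, :-1], data[:, -1]
# stratify=y keeps the per-class proportions in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=0)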
2.2 LightGBM
import lightgbm as lgb
import numpy as np
from pandas import read_csv
dataset = read_csv('ZeroOne_Train.csv')
XXX = read_csv('ZeroOne_Test.csv')
values = dataset.values
XY= values
Y = XY[:,10]
n_train_hours1 =5398
x_train=XY[:n_train_hours1,0:10]
trainY =Y[:n_train_hours1]
x_test =XY[n_train_hours1:, 0:10]
testY =Y[n_train_hours1:]
X_train=np.array(x_train,dtype=float)
X_test=np.array(x_test,dtype=float)
y_train=np.array(trainY,dtype=int)
y_test=np.array(testY,dtype=int)
XXX=np.array(XXX,dtype=float)
params = {
'boosting_type': 'gbdt',
'objective': 'multiclassova',
'num_class': 4,
'metric': 'multi_error',
'num_leaves': 63,
'learning_rate': 0.01,
'feature_fraction': 0.9,
'bagging_fraction': 0.9,
'bagging_seed':0,
'bagging_freq': 1,
'verbose': -1,
'lambda_l1': 0,  # reg_alpha/reg_lambda are aliases of lambda_l1/lambda_l2, so only the native names are set
'lambda_l2': 1,
'num_threads': 8,
}
train_data=lgb.Dataset(X_train,label=y_train)
validation_data=lgb.Dataset(X_test,label=y_test)
clf=lgb.train(params,train_data,valid_sets=[validation_data],num_boost_round = 1300,verbose_eval = 100)
y_pred=clf.predict(XXX, num_iteration=1300)
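Note that with the multiclassova objective, predict returns one probability per class, so hard labels still require an argmax; y_pred_label is a name introduced here:

y_pred_label = np.argmax(y_pred, axis=1)  # pick the most probable class per row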
2.3 XGBoost
import xgboost as xgb
import numpy as np
from pandas import read_csv
dataset = read_csv('ZeroOne_Train.csv')
XXX = read_csv('ZeroOne_Test.csv')
values = dataset.values
XY= values
Y = XY[:,10]
n_train_hours1 =5398
x_train=XY[:n_train_hours1,0:10]
trainY =Y[:n_train_hours1]
x_test =XY[n_train_hours1:, 0:10]
testY =Y[n_train_hours1:]
X_train=np.array(x_train,dtype=float)
X_test=np.array(x_test,dtype=float)
y_train=np.array(trainY,dtype=int)
y_test=np.array(testY,dtype=int)
XXX=np.array(XXX,dtype=float)
params = {
? ? 'booster': 'gbtree',
? ? 'objective': 'multi:softmax',
? ? 'num_class': 4,
? ? 'gamma': 0.1,
? ? 'max_depth': 6,
? ? 'lambda': 2,
? ? 'subsample': 0.7,
? ? 'colsample_bytree': 0.7,
? ? 'min_child_weight': 3,
? ? 'silent': 1,
? ? 'eta': 0.1,
? ? 'seed': 1000,
? ? 'nthread': 4,
}
plst = list(params.items())
dtrain = xgb.DMatrix(X_train, y_train)
num_rounds = 500
model = xgb.train(plst, dtrain, num_rounds)
# predict on the test set
dtest = xgb.DMatrix(XXX)
ans = model.predict(dtest)
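Since multi:softmax outputs hard class labels directly, the held-out split defined above can be scored in place; a quick sanity check on the variables from this section:

dval = xgb.DMatrix(X_test)
val_pred = model.predict(dval)
print('validation accuracy:', (val_pred == y_test).mean())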
2.4 MLP
from __future__ import print_function
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from pandas import read_csv
batch_size = 100
num_classes = 4
epochs = 200
dataset = read_csv('ZeroOne_Train.csv')
XXX = read_csv('ZeroOne_Test.csv')
values = dataset.values
XY= values
Y = XY[:,10]
n_train_hours1 =5398
x_train=XY[:n_train_hours1,0:10]
trainY =Y[:n_train_hours1]
x_test =XY[n_train_hours1:, 0:10]
testY =Y[n_train_hours1:]
y_train = keras.utils.to_categorical(trainY, num_classes)
y_test = keras.utils.to_categorical(testY, num_classes)
model = Sequential()
model.add(Dense(128,input_dim=10,kernel_initializer='normal',activation='relu'))
model.add(Dense(128,kernel_initializer='normal',activation='relu'))
model.add(Dense(128,kernel_initializer='normal',activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(num_classes, activation='softmax'))
model.summary()
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])
history=model.fit(x_train, y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  verbose=2,
                  validation_data=(x_test, y_test))
prediction=model.predict_classes(XXX)
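Before predicting on the submission set, the held-out accuracy can be read off with evaluate; a short check using the variables above:

val_loss, val_acc = model.evaluate(x_test, y_test, verbose=0)
print('validation accuracy:', val_acc)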
2.5 CNN
from __future__ import print_function
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from pandas import read_csv
batch_size = 32
num_classes = 4
epochs = 200
# input image dimensions
img_rows, img_cols = 4, 4
input_shape = (img_rows, img_cols, 1)
# the data: cleaned, shuffled, and split between training and test sets
dataset = read_csv('ZeroOne_Train_CNN.csv')
values = dataset.values
XY= values
Featurenumber=img_rows*img_cols
Y = XY[:,Featurenumber]
n_train_hours1 =5398
x_train=XY[:n_train_hours1,0:Featurenumber]
trainY =Y[:n_train_hours1]
x_test =XY[n_train_hours1:, 0:Featurenumber]
testY =Y[n_train_hours1:]
x_train = x_train.reshape(-1,4,4,1)
x_test = x_test.reshape(-1,4,4,1)
y_train = keras.utils.to_categorical(trainY, num_classes)
y_test = keras.utils.to_categorical(testY, num_classes)
model = Sequential()
model.add(Conv2D(16, kernel_size=(3, 3),
                 activation='relu',
                 padding='same',
                 input_shape=input_shape))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(32, kernel_size=(3, 3),
                 activation='relu',
                 padding='same'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.summary()
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])
history=model.fit(x_train, y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  verbose=2,
                  validation_data=(x_test, y_test))
a=history.history['acc']      # training accuracy per epoch
b=history.history['val_acc']  # validation accuracy per epoch
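These two lists make it easy to spot over- or under-fitting; a minimal plotting sketch (the 'acc'/'val_acc' keys belong to this older Keras version; newer releases use 'accuracy'/'val_accuracy'):

import matplotlib.pyplot as plt

plt.plot(a, label='train accuracy')
plt.plot(b, label='validation accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()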