1. 層次分析理論
1.1 代碼部分
function [] = AHP()
%% AHP weight calculation (MATLAB)
% Computes the priority (weight) vector of a pairwise-comparison matrix
% and runs the consistency check (CI / CR against Saaty's RI table).
%% Data
clc
clear all
%% Model
A=[1 2 6; 1/2 1 4; 1/6 1/4 1];% pairwise comparison (judgment) matrix
%% Consistency check and weight vector
n=size(A,1);                 % order of the judgment matrix
[v,d] = eig(A);
% BUG FIX: eig() makes no ordering guarantee, so d(1,1)/v(:,1) may not be
% the principal pair. Locate the largest (Perron) eigenvalue explicitly.
[r,idx]=max(real(diag(d)));
CI=(r-n)/(n-1);              % consistency index
RI=[0 0 0.58 0.90 1.12 1.24 1.32 1.41 1.45 1.49 1.52 1.54 1.56 1.58 1.59];
CR=CI/RI(n);                 % consistency ratio (valid for n <= 15)
if CR<0.10
    CR_Result='通過';
else
    CR_Result='不通過';
end
%% Weight vector: principal eigenvector normalized to sum to 1
w=real(v(:,idx))/sum(real(v(:,idx)));
w=w';
%% Report
disp('該判斷矩陣權(quán)向量計算報告:');
disp(['一致性指標(biāo):' num2str(CI)]);
disp(['一致性比例:' num2str(CR)]);
disp(['一致性檢驗結(jié)果:' CR_Result]);
disp(['特征值:' num2str(r)]);
disp(['權(quán)向量:' num2str(w)]);
end
1.2 論文中寫法(2016B)
1.3 理論鏈接
2. 主成分分析
2.1代碼部分(py,未使用sklearn)
# Load the sklearn digits data set and create one "名字<i>" label per
# pixel column for the PCA report below.
digits = load_digits()
X_digits = pd.DataFrame(digits.data)
name = ["名字" + str(col) for col in range(X_digits.shape[1])]
def PCA_math(X_digits, name, topNfeat, plot=True):
    """Principal component analysis written without sklearn.

    Parameters
    ----------
    X_digits : array-like, shape (n_samples, n_features)
        Raw data; it is z-score standardized internally (population std,
        matching sklearn's StandardScaler).
    name : sequence of str, length n_features
        Labels used in the printed importance report and the bar chart.
    topNfeat : int
        Number of principal components to keep.
    plot : bool, default True
        If True (the original behavior) show a bar chart of the
        explained-variance ratios; requires matplotlib. Pass False for
        headless use.

    Returns
    -------
    lowDDataMat : ndarray, shape (n_samples, topNfeat)
        Data projected onto the top components.
    reconMat : ndarray, shape (n_samples, n_features)
        Reconstruction of the standardized data from those components.
    """
    import numpy as np
    import pandas as pd
    # z-score standardization; constant columns are left unscaled, the
    # same convention StandardScaler uses for zero-variance features.
    X = np.asarray(X_digits, dtype=float)
    scale = X.std(axis=0)
    scale[scale == 0] = 1.0
    X = (X - X.mean(axis=0)) / scale
    meanVals = np.mean(X, axis=0)        # column means (≈0 after standardization)
    meanRemoved = X - meanVals
    covMat = np.cov(meanRemoved, rowvar=0)
    eigVals, eigVects = np.linalg.eig(covMat)
    eigValInd = np.argsort(-eigVals)     # indices, largest eigenvalue first
    sum_eig = np.sum(eigVals)
    print("各成分重要性占比:")
    for i in eigValInd:
        print(name[i] + ":\t" + str(eigVals[i] / sum_eig))
    if plot:
        # Plotting kept optional so the numeric part runs headless; the
        # original also mutated global pandas display options here, which
        # a computation function should not do.
        import matplotlib.pyplot as plt
        plt.style.use("bmh")
        plt.rc('font', family='SimHei', size=5)
        data = pd.DataFrame(eigVals) / sum_eig
        fig = plt.figure()
        ax = fig.add_subplot(111)
        data.index = name
        data[0].plot(kind='bar')
        plt.show()
    # BUG FIX: the original sliced [:(topNfeat + 1)], keeping one extra
    # component; keep exactly topNfeat of the descending-sorted indices.
    eigValInd = eigValInd[:topNfeat]
    redEigVects = eigVects[:, eigValInd]         # top eigenvectors as columns
    lowDDataMat = meanRemoved.dot(redEigVects)   # project into PC space
    reconMat = lowDDataMat.dot(redEigVects.T) + meanVals  # back-project
    return lowDDataMat, reconMat
# Run PCA on the digits data, keeping the top 10 principal components.
PCA_math(X_digits,np.array(name),10)
2.2 論文中近似主成分分析
3. 因子分析
3.1 代碼部分(py)
# Factor analysis with the `factor_analyzer` package: adequacy tests (KMO,
# Bartlett), factor extraction with varimax rotation, then per-row factor
# scores and a variance-weighted composite score.
from factor_analyzer import factor_analyzer,Rotator
from factor_analyzer import FactorAnalyzer
# Toy data set (3 observations x 6 variables)
X_digits = pd.DataFrame([[1,1,2,5,6,7],[2,2,2,6,7,8],[3,3,4,7,8,9]])
# KMO measure of sampling adequacy
print(factor_analyzer.calculate_kmo(X_digits)[1])
# Bartlett sphericity test p-value: < 0.05 means factor analysis is applicable
P_s = factor_analyzer.calculate_bartlett_sphericity(X_digits)[1]
if(P_s < 0.05):
    print("這些變量可以進(jìn)行因子分析")
else:
    print("不可以進(jìn)行因子分析")
# Initialize the factor-analysis parameters:
# extraction settings:
n_factors = 2 # number of factors to extract
method = 'principal' # extraction method: principal components
rotation = 'varimax' # rotation method: varimax
fa = factor_analyzer.FactorAnalyzer(n_factors=n_factors,method=method,rotation=rotation)
fa.fit(X_digits)
# NOTE(review): the two lines below rebuild and refit an identical model,
# so the pair above appears redundant.
fa = FactorAnalyzer(n_factors=2,method='principal',rotation='varimax')
fa.fit(X_digits)
# Communalities
print(fa.get_communalities())
# Contribution (eigenvalues, variance ratio, cumulative variance ratio)
var = fa.get_factor_variance()
# Rotated factor loading matrix
rotator = Rotator()
load_matrix = rotator.fit_transform(fa.loadings_)
# Component score coefficient matrix = correlation matrix x loadings
corr = X_digits.corr().values
corr = np.matrix(corr)
load_matrix = np.matrix(load_matrix)
pr_matrix_score = np.dot(corr,load_matrix)
pr_matrix_score_t = pr_matrix_score.T
means=X_digits.mean()
stds=X_digits.std()
columns_len = len(X_digits.columns)
# adict maps column index -> [mean, std] for standardizing each variable
adict={}
for i in range(columns_len):
    a = i
    adict[i]=[means[a],stds[a]]
def get_Fac_score(x,no,adict):
    # Factor score of one row: sum over variables of the standardized
    # value times the score coefficient of factor `no`.
    arr = pr_matrix_score_t[no].tolist()[0]
    sum=0  # NOTE(review): shadows the builtin `sum`
    for i in range(columns_len):
        a = i
        sum=sum+(x[a]-adict[i][0])/(adict[i][1])*arr[i]
    return sum
for i in range(n_factors):
    X_digits['FAC'+str(i+1)]= X_digits.apply(get_Fac_score,args=(i,adict),axis=1)
# Composite score: variance-ratio-weighted mean of the two factor scores
X_digits['FAC'] = X_digits.apply(lambda x:(x['FAC1']*var[1][0]+x['FAC2']*var[1][1])/var[2][1], axis=1)
print(X_digits)
3.2 論文寫作參考鏈接(非論文)
3.3 論文
3.3.1 (2017B)
可參照灰色關(guān)聯(lián)度分析,個人感覺,重點(diǎn)在得到相關(guān)性結(jié)果之后,結(jié)論的分析,各種因素如何影響最后的結(jié)果
4.聚類
4.0 重要提示:為什么要有n個簇!!!
4.0.1 論文(2017 B)
4.0.2 采用機(jī)器學(xué)習(xí)中聚類的指標(biāo)
4.0.2.1 優(yōu)化kmeans - ISODATA
K: 期望得到的聚類數(shù);
k: 初始設(shè)定的聚類數(shù);
θN: 每一個類別中最少的樣本數(shù),若少于此數(shù)則去掉該類別;
θs: 一個類別中,樣本特征中最大標(biāo)準(zhǔn)差。若大于這個值,則可能分裂;
θc: 兩個類別中心間的最小距離,若小于此數(shù),則把兩個類別進(jìn)行合并;
L: 在一次合并操作中,可以合并的類別的最多對數(shù);
I: 迭代運(yùn)算的次數(shù)。
機(jī)器學(xué)習(xí)實(shí)戰(zhàn)代碼~~~~
4.1 繪制層次聚類圖
4.1.1 (代碼py)
# Hierarchical (Ward) clustering of a random toy data set, drawn as a
# dendrogram with a horizontal cut line, plus the flat labels at that cut.
import pandas as pd
import numpy as np
from scipy.cluster import hierarchy  # linkage + dendrogram utilities
from scipy import cluster
import matplotlib.pyplot as plt
plt.style.use("bmh")
plt.rc('font', family='SimHei', size=4)
pd.set_option('display.max_columns',1000)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth',1000)
# Toy data: 10 observations x 4 features.
# BUG FIX: the `pd.np` alias was removed in pandas 2.0 — use numpy directly.
df = pd.DataFrame(np.random.normal(size=(10,4)))
name = ["object"+str(i) for i in range(len(df))]
# Draw the dendrogram
df.index = name
Z = hierarchy.linkage(df, method ='ward',metric='euclidean')
hierarchy.dendrogram(Z,labels = df.index)
height = 2
plt.axhline(y=height, color='r', linestyle='-')  # cut line
label = cluster.hierarchy.cut_tree(Z,height=height)  # flat cluster labels at the cut
label = label.reshape(label.size,)
plt.show()
4.2 K-means
4.2.1 (代碼py)
# K-means on the iris data: scatter the raw data, fit 3 clusters, then
# scatter again colored by the fitted cluster label.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data[:, :4]  # take all 4 dimensions of the feature space
print(X.shape)
plt.figure(12)
# Plot the raw data distribution
plt.subplot(211)
plt.scatter(X[:, 0], X[:, 1], c="red", marker='o', label='see')
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.legend(loc=2)
estimator = KMeans(n_clusters=3)  # build the clusterer
estimator.fit(X)  # run the clustering
label_pred = estimator.labels_  # fitted cluster labels
# Plot the k-means result, one color/marker per cluster
x0 = X[label_pred == 0]
x1 = X[label_pred == 1]
x2 = X[label_pred == 2]
plt.subplot(212)
plt.scatter(x0[:, 0], x0[:, 1], c="red", marker='o', label='label0')
plt.scatter(x1[:, 0], x1[:, 1], c="green", marker='*', label='label1')
plt.scatter(x2[:, 0], x2[:, 1], c="blue", marker='+', label='label2')
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.legend(loc=2)
plt.show()
'''
class_n = 3
estimator = KMeans(n_clusters=class_n) # 構(gòu)造聚類器
estimator.fit(data[["任務(wù)gps 緯度","任務(wù)gps經(jīng)度"]]) # 聚類
label_pred = estimator.labels_
color = ["red","yellow","green","blue","gray"]
data["距離"] = None
data["media_1"] = None
data["media_2"] = None
plt.figure(figsize=[8, 8])
for i in range(class_n):
x0 = data[["任務(wù)gps 緯度", "任務(wù)gps經(jīng)度","任務(wù)標(biāo)價","任務(wù)執(zhí)行情況"]][label_pred == i].values
print(i,data[["任務(wù)gps 緯度", "任務(wù)gps經(jīng)度","任務(wù)標(biāo)價","任務(wù)執(zhí)行情況"]][label_pred == i].describe())
#print("計算出中心點(diǎn)坐標(biāo):",end="")
median = np.mean(x0,axis=0)
data["距離"][label_pred == i] = np.linalg.norm(x0 - median,axis=1) * 110*1000
data["media_1"][label_pred == i] = median[0]
data["media_2"][label_pred == i] = median[1]
plt.scatter(x0[:,0], x0[:, 1], c=color[i], label='label'+str(i))
plt.scatter(median[0],median[1],color = "black" , marker='+',s=100)
plt.xlim([22, 24])
plt.ylim([112.2, 114.75])
'''
4.2.1.1 kmeans進(jìn)行分層聚類
'''
進(jìn)行第一次聚類
class_n = 20
estimator = KMeans(n_clusters=class_n) # 構(gòu)造聚類器
estimator.fit(data[["任務(wù)gps 緯度","任務(wù)gps經(jīng)度"]]) # 聚類
label_pred = estimator.labels_
data["cluster"] = -1
for i in range(class_n):
x0 = data[["任務(wù)gps 緯度", "任務(wù)gps經(jīng)度"]][label_pred == i].values
data["cluster"][label_pred == i] = i
data.to_csv("cluster_1.csv",index=False)
'''
-----------------------------------------------------------------------------------------------------------------------
'''
進(jìn)行第二次聚類
counter = dict(collections.Counter(data['cluster']))
key = list(counter.items())
for i in range(len(counter)):
cluster_name = key[i][0]
if (key[i][1] > 4):
data_1 = data[data["cluster"] == cluster_name]
iter = True
class_n = 2
num_iter = 0
while (iter):
num_iter += 1
iter = False
estimator = KMeans(n_clusters=class_n) # 構(gòu)造聚類器
estimator.fit(data_1[["任務(wù)GPS緯度","任務(wù)GPS經(jīng)度"]]) # 聚類
label_pred = estimator.labels_
if (list(dict(collections.Counter(label_pred)).items())[-1][1] <= 2):
iter = True
class_n -= 1
class_n = max(class_n, 2)
else:
break
if (num_iter == 10):
label_pred = np.random.randint(0,2,len(data_1))
break
data_1["cluster"] = data["cluster"].max() + label_pred + 1
data[data["cluster"] == cluster_name] = data_1
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data["cluster"] = le.fit_transform(data["cluster"])
data.to_csv("result_cluster6.csv",index=False)
'''
------------------------------------------------------------------------------------------------------
'''
無法分類后隨機(jī)數(shù)隨機(jī)分類
import collections
counter = dict(collections.Counter(data['cluster']))
key = list(counter.items())
for i in range(len(counter)):
cluster_name = key[i][0]
if (key[i][1] > 4):
data_1 = data[data["cluster"] == cluster_name]
label_pred = np.random.randint(0, 3, len(data_1))
data_1["cluster"] = data["cluster"].max() + label_pred + 1
data[data["cluster"] == cluster_name] = data_1
data.to_csv("result_cluster8.csv",index=False)
print(data["cluster"].value_counts())
'''
4.2.2 其他聚類鏈接
k-means聚類,AGNES層次聚類,DBSCAN
流程圖可參見西瓜書的聚類部分
4.3 論文
4.3.1 (2016B)
4.3.2(2017B)
5. 判別分析
5.1 線形判別分析
5.1.1 (py代碼)
# Linear discriminant analysis on the iris data with a plot of the
# LDA-projected samples.
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
iris = datasets.load_iris()['data']
target = datasets.load_iris()['target']
# Stratified 75/25 split keeps the class proportions in both subsets.
X_train, X_test,y_train,y_test = train_test_split(iris,target,test_size=0.25,random_state=0,stratify=target)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
LDA = LinearDiscriminantAnalysis()
LDA.fit(X_train,y_train)
# CLEANUP: the original pasted console residue here — a bare
# `LinearDiscriminantAnalysis(...)` repr (constructed a throwaway object)
# and a bare `dir(LDA)`; both evaluated and discarded their results.
# The useful evaluations are printed instead so the script has output.
print(LDA.score(X_train,y_train))   # train accuracy
print(LDA.predict(X_test))          # test predictions
print(LDA.score(X_test,y_test))    # test accuracy
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Project the full data through the fitted discriminant directions.
X = np.vstack((X_train,X_test))
Y = np.vstack((y_train.reshape(y_train.size,1),\
y_test.reshape(y_test.size,1)))
converted_X = np.dot(X,np.transpose(LDA.coef_)) + \
LDA.intercept_
fig = plt.figure()
ax = fig.add_subplot(111)
colors = 'rgb'
markers = 'o*s'
for target,color,marker in zip([0,1,2],colors,markers):
    pos = (Y == target).ravel()
    X = converted_X[pos,:]
    ax.scatter(X[:,0],X[:,1],\
        color=color,marker=marker,\
        label = "Label%d"%target)
ax.legend(loc="best")
fig.suptitle("Iris After LDA")
plt.show()
5.2 其他判別分析可調(diào)用sklearn的包
5.2.1 高斯判別分析
6.時序模型
6.1 參數(shù)確認(rèn)
參數(shù)確認(rèn)規(guī)則:(已知序列為一階差分)
① 當(dāng)偏自相關(guān)函數(shù)呈現(xiàn)p階拖尾度迂,自相關(guān)函數(shù)呈現(xiàn)q階拖尾時,我們可以選用模型ARIMA(p猜揪,1惭墓,q)
② 當(dāng)偏自相關(guān)函數(shù)呈現(xiàn)拖尾,自相關(guān)函數(shù)呈現(xiàn)q階截尾時而姐,我們可以選用模型MA(q)
③ 當(dāng)偏自相關(guān)函數(shù)呈現(xiàn)p階截尾腊凶,自相關(guān)函數(shù)呈現(xiàn)拖尾時,我們可以選用模型AR(p)
6.2季節(jié)ARIMA模型
6.2.1(py代碼)
# coding=UTF-8
# Seasonal ARIMA (SARIMA) modelling of daily traffic for city "C":
# load -> resample daily -> de-mean -> difference -> ADF / white-noise
# checks -> seasonal decomposition -> fit SARIMAX -> 30-day forecast.
import pandas as pd
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
input_file = "data/data_1.xlsx"
data_frame = pd.read_excel(input_file,index_col=0)
data_frame["時間"] = pd.to_datetime(data_frame["時間"])
data_frame.index = data_frame.時間
dataSet = data_frame['流量'][data_frame['地市'] == "C"].dropna()
dataSet = dataSet.resample("D").mean()
# Build a 30-day index for the forecast horizon.
forecast_data = []
for i in range(1,31):
    # NOTE(review): adding a bare int to a Timestamp is deprecated in
    # modern pandas — pd.Timedelta(days=i) is the safe form; confirm.
    forecast_data.append(dataSet.index[-1] + i)
forecast_data = pd.DataFrame(forecast_data)
forecast_data.columns = ["時間"]
forecast_data.index = forecast_data.時間
# Plot the raw series
dataSet.plot()
plt.show()
# De-meaning:
# subtract the overall mean from the series
mean = dataSet.mean()
dataSet_kill_mean = dataSet - mean
# de-meaning does not change the internal relations of the data
# First-order difference to obtain a stationary series
df_kill_mean_1 = dataSet_kill_mean.diff(1).dropna()
df_kill_mean_1.plot()
plt.show()
# (3) The ADF test decides whether the series is stationary
import statsmodels.tsa.stattools as ts
adf_summary = ts.adfuller(np.array(df_kill_mean_1).reshape(-1)) # run the ADF test and print its output
print(adf_summary)
# How to read the statistics above to judge stationarity?
# ADF_Test_result , P
# Compare the ADF statistic with the 1%/5%/10% critical values; a statistic
# below all three rejects the unit-root hypothesis very strongly.
# We treat the first-differenced data as satisfying the stationarity requirement.
# (Stationarity after one difference is not great and might fail the white-noise
# test; that check is skipped here — if its p-value exceeded 0.05 we would
# difference a second time.)
# White-noise (Ljung-Box) check
import statsmodels.api as sm
r, q, p = sm.tsa.acf(df_kill_mean_1, qstat=True)
data = np.c_[range(1, 41), r[1:], q, p]
table = pd.DataFrame(data, columns=['lag', 'AC', 'Q', 'Prob(>Q)'])
# If most Prob(>Q) p-values are below 0.05 the residuals are NOT white noise;
# p-values all above 0.05 would mean no autocorrelation.
print(table.set_index('lag'))
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# Autocorrelation plot
plot_acf(df_kill_mean_1,lags=24).show() # `lags` sets the largest lag on the x-axis
# Partial autocorrelation plot
plot_pacf(df_kill_mean_1,lags=24).show()
plt.show()
# Neither function tails off nor cuts off cleanly,
# so the series probably carries a seasonal component.
# Decompose the series into separate parts:
# Trend, seasonal and residual components
from statsmodels.tsa.seasonal import seasonal_decompose
# NOTE(review): `freq=` was renamed `period=` in newer statsmodels; confirm version.
decomposition = seasonal_decompose(df_kill_mean_1, freq=30)
trend = decomposition.trend # trend component
seasonal = decomposition.seasonal # seasonal component
residual = decomposition.resid # residual component
decomposition.plot()
plt.show()
# Build the seasonal model ARIMA(k,D,m)S×(p,d,q)
# ARIMA(p,d,q)(k,D,m)S == ARMA(kS+p,mS+q)
# Grid-search over parameter combinations (kept as a string so it is not re-run)
'''
import itertools
p = q = range(0,3) # p、q一般取值不超過2
d = range(1,2)
pdq = list(itertools.product(p, d, q))
seasonal_pdq = [(x[0], x[1], x[2], 7) for x in list(itertools.product(p, d, q))]
for param in pdq:
for param_seasonal in seasonal_pdq:
try:
mod = sm.tsa.statespace.SARIMAX(dataSet,
order=param,
seasonal_order=param_seasonal,
enforce_stationarity=False,
enforce_invertibility=False)
results = mod.fit()
print('ARIMA{}x{} - AIC:{}'.format(param, param_seasonal, results.aic))
except:
continue
'''
# Best found: ARIMA(2, 1, 2)x(2, 1, 2, 7) - AIC:-762.9389508705663
import statsmodels.api as sm
mod = sm.tsa.statespace.SARIMAX(dataSet,order=(2,1,2),seasonal_order=(2, 1, 2, 7),\
enforce_stationarity=True,enforce_invertibility=True)
result = mod.fit(maxiter=200,disp=30)
predict_sunspots = result.forecast(30)
forecast = np.array(predict_sunspots[:]).reshape(-1)
forecast_data["流量"] = forecast
forecast_data = forecast_data['流量']
# Forecast in red on top of the original series in blue.
forecast_data.plot(c="red",label='forecast')
dataSet.plot(c="blue",label='Original')
plt.show()
若是其他模型則代碼中訓(xùn)練部分可替換為:
def arma_predict(dataset,number):
    """Fit ARMA(1,1) on `dataset.rentNumber` and return the in-sample
    prediction of the last `number`+1 points together with its RMSE.

    NOTE(review): `statsmodels.tsa.arima_model.ARMA` was removed in
    statsmodels 0.13 — modern code should use `tsa.arima.model.ARIMA`.
    `plot_results` must be defined elsewhere in these notes.
    """
    data = list(dataset.rentNumber)
    from statsmodels.tsa.arima_model import ARMA
    """
    import statsmodels.tsa.stattools as st
    order = st.arma_order_select_ic(data,max_ar=2,max_ma=2,ic=['aic', 'bic', 'hqic'])
    """
    model = ARMA(data, order=(1,1))
    result_arma = model.fit(disp=-1, method='css')
    # in-sample prediction over the final `number`+1 observations
    predict = result_arma.predict(len(data)-number,len(data))
    RMSE = np.sqrt(((predict-data[len(data)-number-1:])**2).sum()/(number+1))
    plot_results(predict,data[len(data)-number-1:])
    return predict,RMSE
得出圖像:
說實(shí)在話我從來沒見過這么糟糕并且丑陋的預(yù)測圖像,但是想到這個模型好寫論文……我忍了……(隨便來個lstm都比這個看著好)
6.2.2 (參考鏈接剧劝,非論文)
7. 元胞自動機(jī)
7.1 論文(2016B)
7.2 代碼(代更新)
8. 模糊綜合判定模型
8.1 論文
8.2 由于比較簡單橄登,不附上代碼
9. BP神經(jīng)網(wǎng)絡(luò)+stacking
9.1 代碼
# BP (fully-connected) neural network regression of task price with
# stacking-style out-of-fold predictions over a 5-fold split.
import numpy as np
import pandas as pd
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import MinMaxScaler
from keras.layers.core import Dense, Activation
from keras.optimizers import Adam
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from keras.callbacks import *
from keras.layers import LSTM, Dense, Activation
'''BP神經(jīng)網(wǎng)絡(luò)+stacking'''
# `data` is the feature DataFrame prepared earlier in the notes.
Y = data[["任務(wù)標(biāo)價"]].values
del data["任務(wù)標(biāo)價"]
scalarX = MinMaxScaler()
data = scalarX.fit_transform(data)
N = 5
oof = np.zeros((data.shape[0],1))  # out-of-fold predictions
#oof_sub = np.zeros((test_df.shape[0],1))
# NOTE(review): StratifiedKFold is fed constant dummy labels here, which
# makes it an ordinary KFold — presumably intentional; confirm.
kf = StratifiedKFold(n_splits=N,random_state=42,shuffle=True)
for j,(train_in,test_in) in enumerate(kf.split(data,np.zeros(len(data)))):
    X_tr, X_val, y_tr, y_val = data[train_in], data[test_in], Y[train_in], Y[test_in]
    modelfile = 'modelweight.model'
    model = Sequential() # sequential model
    # NOTE(review): `init=` and `nb_epoch=` below are legacy Keras 1.x
    # arguments (`kernel_initializer=` / `epochs=` in modern Keras).
    model.add(Dense(12,input_dim=11,init='uniform')) # input layer; Dense is a fully-connected layer
    model.add(Activation('relu')) # activation
    model.add(Dense(1,input_dim=12)) # output layer
    model.compile(loss='mae', optimizer=Adam()) # compile the model
    check_point = ModelCheckpoint(modelfile, monitor="val_mae", verbose=1, save_weights_only=True,
                                  save_best_only=True, mode="min")
    early_stop = EarlyStopping(monitor="val_mae", mode="min", patience=10)
    model.fit(X_tr, y_tr, nb_epoch = 7000, batch_size = int(800),
              validation_data=(X_val, y_val),\
              callbacks=[check_point, early_stop],shuffle=True) # train the model
    model.save_weights(modelfile) # save the model weights
    oof[test_in] = model.predict(X_val, batch_size=800)
    #oof_sub += model.predict(test_X, batch_size=1024)
#Y_pre = model.predict(data)
#xx_cv = f1_score(y1,np.argmax(oof,axis=1),average='macro')
#print(xx_cv)
# Per-sample absolute error of the out-of-fold predictions.
result_error = np.abs(np.asarray(oof) - np.asarray(Y))
error = np.sum(result_error)/ len(Y)
data = pd.read_csv("data.csv")[["任務(wù)號碼"]]
data["predict"] = oof
data["predict"] = data["predict"].apply(lambda x: int(x*2 + 0.5)/2)  # round down to the nearest 0.5
data["Y"] = Y
data["mae"] = result_error
data.to_csv("BP_2.csv")
9.2.1 論文(BP部分2016D6 深圳杯)
9.2.2 論文(STACKING部分)
10 繪圖
10.1 簡單繪圖加標(biāo)注
# Compare predicted vs. actual prices for the first 60 samples of BP_3.csv.
import matplotlib.pyplot as plt
data = pd.read_csv("BP_3.csv")
sample = data.iloc[:60]
plt.rc('font', family='Microsoft YaHei', size=10)
plt.plot(sample.predict,label = "調(diào)整")
plt.plot(sample.Y,label = "Y")
plt.plot(sample.final,label = "final",c="r")
# legend above the axes, stretched across the full width
plt.legend(bbox_to_anchor=(0., 1.04, 1., .104), loc=0, ncol=3, mode="expand", borderaxespad= -2.)
plt.ylim([60,80])
plt.xlabel("index")
plt.ylabel("定價")
plt.title("BP未完成定價調(diào)整圖")
plt.show()
10.2 直方圖和核密度分析圖
# Overlaid histogram + KDE of one feature split by task completion
# (1 = completed, 0 = not). `data_clean`, the loop variable `i`, and
# seaborn as `sns` must be defined by the surrounding notebook.
plt.rc('font', family='Microsoft YaHei', size=20)
plt.figure(figsize=[8,8])
ax1 = sns.distplot(data_clean["circle_proportion"+str(i)][data_clean["任務(wù)執(zhí)行情況"] == 1], color='blue',label="完成")
ax1 = sns.distplot(data_clean["circle_proportion"+str(i)][data_clean["任務(wù)執(zhí)行情況"] == 0], color='red',label="未完成")
plt.title("circle_proportion"+str(i))
plt.show()
10.3 散點(diǎn)圖
# Scatter plot of cluster members plus a black '+' at each centroid.
# `x0` and `median` come from the clustering loop earlier in the notes —
# presumably they should be re-computed per `i`; verify against that loop.
color = ["red","yellow","green","blue","gray"]
for i in range(len(color)):
    plt.scatter(x0[:,0], x0[:, 1], c=color[i], label='label'+str(i))
    plt.scatter(median[0],median[1],color = "black" , marker='+',s=100)
plt.xlim([22, 24])
plt.ylim([112.2, 114.75])
plt.show()
10.4 畫圈
# Draw five concentric circles of increasing radius around the point
# (x0[0,0], x0[0,1]); `x0` must be defined earlier in the notes.
color = ["y","blue","red","gray","green"]
r0 = 3000.0   # innermost radius
add = 1000    # radius increment per ring
for i in range(5):
    r = r0 + i* add
    a, b = (x0[0,0], x0[0,1])
    # upper and lower half of the circle of radius r centered at (a, b)
    x = np.arange(a-r, a+r, 0.01)
    y = b + np.sqrt(r**2 - (x - a)**2)
    y1 = b - np.sqrt(r**2 - (x - a)**2)
    plt.plot(x,y,c = color[i])
    plt.plot(x,y1,c=color[i])
plt.show()
#plt.plot(x,y)
#axes.plot(x, y) # upper half
#axes.plot(x, -y) # lower half
10.5 直方圖
# Box plot of completion rate vs. task price, plus a count plot of prices
# split by completion status; `price_rate`, `ax` and seaborn `sns` must be
# defined by the surrounding notebook.
sns.boxplot(data=price_rate,x="任務(wù)標(biāo)價",y="rate",ax=ax)
sns.countplot(data=price_rate, x="任務(wù)標(biāo)價", ax=ax, hue="任務(wù)執(zhí)行情況")
10.6 時序圖繪制
# Three-day traffic scatter plot built from road-speed records.
# `data` must hold columns "DataAsOf" and "Speed" (loaded elsewhere).
data["Speed"] = data.groupby("DataAsOf")["Speed"].transform("mean") / 12
# BUG FIX: drop_duplicates returns a new frame; the original discarded it.
data = data.drop_duplicates(subset=["DataAsOf"])
dataSet = data[['Speed']]
dataSet.index = pd.to_datetime(data.DataAsOf)
dataSet = dataSet["2016-12-01":"2016-12-03"]
dataSet = dataSet.resample('5min').mean()
dataSet["Speed"] = (5.1 - dataSet["Speed"]) * 1.2  # convert speed into a flow proxy
plt.scatter(dataSet.index,dataSet["Speed"],marker=".")
plt.xlim((pd.to_datetime("2016-12-01 00:00:00"), pd.to_datetime("2016-12-03 23:59:59")))
# dashed vertical separators at day boundaries
plt.vlines(x = pd.to_datetime("2016-12-02 00:00:00"),ymin= 0,ymax=100, colors="c", linestyles="dashed")
plt.vlines(x = pd.to_datetime("2016-12-03 00:00:00"),ymin= 0,ymax=100, colors="c", linestyles="dashed")
plt.ylim((0,5))
plt.xticks(rotation=90) # rotate the tick labels
plt.xlabel("時間")
plt.ylabel('流量')
plt.title("某地區(qū)道路三日時間和流量散點(diǎn)圖")
plt.show()
11 百度api的調(diào)用
def getlnglat(data):
    """Query the Baidu Maps traffic-around API for one point.

    `data` is a sequence whose entries [1] and [2] are the latitude and
    longitude; returns the parsed JSON response as a dict.
    """
    from urllib.request import urlopen
    import json
    print(data)
    # NOTE(review): a real API key is hard-coded here — move it to
    # configuration before sharing/publishing.
    ak = "MtmcuLOtlGEt477UpbfTv2mb6h52qWoP"
    # BUG FIX: the original URL contained the mojibake '¢er=' — the
    # substring '&cent' of '&center=' had been rendered as the HTML
    # entity '¢'. Restored to the documented 'center' parameter.
    url = 'http://api.map.baidu.com/traffic/v1/around?ak='+ ak +'&center=' + str(data[1])+","+str(data[2]) \
        +'&radius=200'
    print(url)
    req = urlopen(url)
    res = req.read().decode()
    temp = json.loads(res)
    print(temp)
    return temp
# Query the API once per row of `data` (a DataFrame defined elsewhere).
for i in data.values:
    getlnglat(i)
12 熵權(quán)法
# Problem 1: entropy weight method — derive indicator weights from the
# dispersion of each column of `data` (a DataFrame loaded elsewhere).
m,n=data.shape
# Step 1: min-max normalize each column (skip if already standardized)
data1=(data - data.min()) / (data.max() - data.min())
# BUG FIX: DataFrame.as_matrix() was removed in pandas 1.0 — use to_numpy().
data1=data1.to_numpy()
k=1/np.log(m)
yij=data1.sum(axis=0)
# Step 2: proportions p_ij of each sample within its indicator column
pij=data1/yij
# 0*log(0) yields NaN; nan_to_num zeroes those terms below
test=pij*np.log(pij)
test=np.nan_to_num(test)
# information entropy of each indicator
ej=-k*(test.sum(axis=0))
# weights: normalized redundancy (1 - entropy)
wi=(1-ej)/np.sum(1-ej)
w = pd.DataFrame()
list_w = ["距離","流量","小區(qū)內(nèi)路網(wǎng)密度","速度","主干道數(shù)目","小區(qū)道路數(shù)"]
for i in range(len(list_w)):
    w[list_w[i]] = [wi[i]]
13 灰色分析
# Grey relational analysis: relate the reference series 流量 (flow) to five
# comparison indicators. `gray` is a DataFrame loaded elsewhere; `zeros`,
# `amax`, `amin`, `mean` presumably come from `from numpy import *` — verify.
gray=(gray - gray.min()) / (gray.max() - gray.min())
# min-max normalization
std=gray[["流量"]].iloc[:,0]# reference series
ce=gray[["距離","小區(qū)內(nèi)路網(wǎng)密度","速度","主干道數(shù)目","小區(qū)道路數(shù)"]].iloc[:,:]# comparison series
n=ce.shape[0]
m=ce.shape[1]# sample / indicator counts
# absolute difference between each comparison series and the reference
a=zeros([m,n])
for i in range(m):
    for j in range(n):
        a[i, j] = abs(ce.iloc[j, i] - std[j])
# global max and min of the difference matrix
c=amax(a)
d=amin(a)
# grey relational coefficients (resolution coefficient 0.5)
result=zeros([m,n])
for i in range(m):
    for j in range(n):
        result[i,j]=(d+0.5*c)/(a[i,j]+0.5*c)
# average over samples -> grey relational grade per indicator
result2=zeros(m)
for i in range(m):
    result2[i]=mean(result[i,:])
RT=pd.DataFrame(result2)
RT.index = ["距離","小區(qū)內(nèi)路網(wǎng)密度","速度","主干道數(shù)目","小區(qū)道路數(shù)"]
print(RT)
14 線性回歸
# Degree-4 polynomial fit of 流量 (flow) against 距離 (distance), then
# evaluate the fitted polynomial at each distance.
reg = np.polyfit(data["距離"], data["流量"], 4)
data["Q_distance"] = np.polyval(reg, data["距離"])
15 方差分析
15.1 單因素方差分析
# One-way ANOVA of three synthetic responses (A, B, C) on `group`.
from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import warnings
import pandas as pd
import numpy as np
warnings.filterwarnings("ignore")
import itertools
'''構(gòu)造虛假數(shù)據(jù)'''
df=pd.DataFrame()
df["group"] = np.random.randint(0,8,10)
df["A"] = np.random.randint(-3,5,10)
df["B"] = np.random.randint(-7,2,10)
df["C"] = np.random.randint(5,8,10)
print(df)
# NOTE(review): `group` enters each formula as a numeric regressor; wrap
# it as C(group) to treat it as a categorical factor — confirm intent.
#A
anova_reA= anova_lm(ols('A~group',data=df[['group','A']]).fit())
print(anova_reA)
#B
anova_reB= anova_lm(ols('B~group',data=df[['group','B']]).fit())
print(anova_reB)
#C
anova_reC= anova_lm(ols('C~group',data=df[['group','C']]).fit())
print(anova_reC)
15.2 多因素方差分析
在方差分析中,若把飲料的顏色看做影響銷量的因素A,把銷售地區(qū)看做影響因素B。同時對因素A和因素B進(jìn)行分析,就稱為雙因素方差分析。
# Two-way ANOVA: model `group` on factors A and B jointly.
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
formula = 'group ~ A + B '
anova_results = anova_lm(ols(formula,df).fit())
print(anova_results)
              df  sum_sq  mean_sq        F    PR(>F)
    a        4.0  335.36    83.84  3.874307  0.021886
    b        4.0  199.36    49.84  2.303142  0.103195
    Residual 16.0 346.24    21.64       NaN       NaN
因素A的p值0.021886<0.05,拒絕原假設(shè),說明飲料顏色對銷量有顯著影響;
而因素B的p值0.103195>0.05,不能拒絕原假設(shè),因此沒有充分的理由說明銷售地區(qū)對銷量有顯著影響。
然而,我們知道了顏色對銷量有顯著影響,那么是哪種顏色呢?
使用tukey方法對顏色進(jìn)行多重比較
# Tukey HSD multiple comparison between the levels of factor `a`.
# NOTE(review): the `df` built above has no column 'a' — this line comes
# from a different worked example; verify the column name before running.
from statsmodels.stats.multicomp import pairwise_tukeyhsd
print(pairwise_tukeyhsd(df['group'], df['a']))
16. 置信區(qū)間
'''
抽取樣本, 樣本量為200
'''
np.random.seed(42)
coffee_full = pd.read_csv('coffee_dataset.csv')
coffee_red = coffee_full.sample(200) #this is the only data you might actually get in the real world.
coffee_red.head()
'''
計算樣本中喝咖啡的均值
'''
# BUG FIX: the original line began with an unmatched '(' — a syntax error.
coffee_red[coffee_red['drinks_coffee'] == True]['height'].mean()
'''
重複抽取樣本,計算其他樣本中喝咖啡的均值,得到抽樣分布
繪制抽樣分布
'''
# Bootstrap: resample with replacement 10000 times and record the mean
# height of the non-coffee-drinkers in each resample.
boot_means = []
for _ in range(10000):
    bootsample = coffee_full.sample(200, replace=True)
    mean = bootsample[bootsample['drinks_coffee'] == False]['height'].mean()
    boot_means.append(mean)
'''
計算抽樣分布的置信區(qū)間以估計總體均值, 置信度95%
'''
# 95% percentile interval of the bootstrap distribution
np.percentile(boot_means, 2.5), np.percentile(boot_means, 97.5)
16.1 繪制置信區(qū)間圖像
# Two side-by-side error-bar series (confidence intervals) at slightly
# offset x positions, black squares vs. gray squares.
import matplotlib.pyplot as plt
import numpy as np
labels = ("Thing 1", "Thing 2", "Other thing", "Yet another thing")
base_x = np.arange(1, 5)
shifted_x = base_x + 0.1
value_pool = np.arange(1, 7, 0.1)
error_pool = np.arange(0.5, 3, 0.1)
first_y = np.random.choice(value_pool, 4)
second_y = np.random.choice(value_pool, 4)
first_err = np.random.choice(error_pool, 4)
second_err = np.random.choice(error_pool, 4)
plt.errorbar(x=base_x, y=first_y, yerr=first_err, color="black", capsize=3,
             linestyle="None",
             marker="s", markersize=7, mfc="black", mec="black")
plt.errorbar(x=shifted_x, y=second_y, yerr=second_err, color="gray", capsize=3,
             linestyle="None",
             marker="s", markersize=7, mfc="gray", mec="gray")
plt.xticks(base_x, labels, rotation=90)
plt.tight_layout()
plt.show()
排隊論
MMS
function MMS(s,mu,lambda)
% M/M/s queue performance measures (Erlang-C based).
%   s      - number of servers
%   mu     - service rate of one server (customers per hour)
%   lambda - arrival rate (customers per hour)
% BUG FIX: the original unconditionally overwrote all three inputs,
% making the parameters useless; they now default only when omitted.
if nargin < 1, s = 2; end       % number of servers
if nargin < 2, mu = 4; end      % customers one server serves per hour
if nargin < 3, lambda = 3; end  % customers arriving per hour
ro=lambda/mu;        % offered load (Erlangs)
ros=ro/s;            % per-server utilization; must be < 1 for stability
sum1=0;
for i=0:(s-1)
    sum1=sum1+ro.^i/factorial(i);
end
sum2=ro.^s/factorial(s)/(1-ros);
p0=1/(sum1+sum2);                    % probability the system is empty
p=ro.^s.*p0/factorial(s)/(1-ros);    % probability of waiting (Erlang C)
Lq=p.*ros/(1-ros);                   % mean queue length
L=Lq+ro;                             % mean number in the system
W=L/lambda;                          % mean time in system (hours)
Wq=Lq/lambda;                        % mean waiting time (hours)
fprintf('排隊等待的平均人數(shù)為%5.2f人\n',Lq)
fprintf('系統(tǒng)內(nèi)的平均人數(shù)為%5.2f人\n',L)
fprintf('平均逗留時間為%5.2f分鐘\n',W*60)
fprintf('平均等待時間為%5.2f分種\n',Wq*60)
MM1
function MM1()
% Discrete-event simulation of an M/M/1 queue with a finite waiting room.
% Row meanings of `events`: 1 arrival time, 2 service time, 3 waiting
% time, 4 departure time, 5 admission flag / number in system at entry.
clear
clc
%*****************************************
% Initialize the customer source
%*****************************************
% Total simulated time (hours)
Total_time = 10;
% Maximum queue length
N = 100000;
% Arrival rate and service rate
lambda = 10; % customers arriving per unit time (one hour)
mu = 6; % customers one server serves per hour
% Mean interarrival time and mean service time
arr_mean = 1/lambda;
ser_mean = 1/mu;
arr_num = round(Total_time*lambda*2); % generate ~2x the expected arrivals
events = [];
% Exponentially distributed interarrival gaps
events(1,:) = exprnd(arr_mean,1,arr_num);
% Arrival instants = cumulative sum of the gaps
events(1,:) = cumsum(events(1,:));
% Exponentially distributed service times
events(2,:) = exprnd(ser_mean,1,arr_num);
% Number of simulated customers, i.e. arrivals inside the horizon
len_sim = sum(events(1,:)<= Total_time);
%*****************************************
% Handle the first customer
%*****************************************
% Customer 1 is served immediately, no waiting
events(3,1) = 0;
% Departure time = arrival time + service time
events(4,1) = events(1,1)+events(2,1);
% Definitely admitted; one customer in the system, so flag = 1
events(5,1) = 1;
% Member list of admitted customers starts with customer 1
member = [1];
for i = 2:arr_num
    % Stop once customer i arrives after the simulation horizon
    if events(1,i)>Total_time
        break;
    else
        number = sum(events(4,member) > events(1,i));
        % System full: reject customer i, flag = 0
        if number >= N+1
            events(5,i) = 0;
        % System empty: customer i is served immediately
        else
            if number == 0
                % waiting time is 0
                events(3,i) = 0;
                % departure = arrival + service time
                events(4,i) = events(1,i)+events(2,i);
                % admitted, flag = 1
                events(5,i) = 1;
                member = [member,i];
            % Server busy but waiting room not full: customer i joins
            else len_mem = length(member);
                % waiting time = previous customer's departure - own arrival
                events(3,i)=events(4,member(len_mem))-events(1,i);
                % departure = previous customer's departure + own
                % service time
                events(4,i)=events(4,member(len_mem))+events(2,i);
                % flag records the number in the system at entry
                events(5,i) = number+1;
                member = [member,i];
            end
        end
    end
end
% Total customers admitted during the simulation
len_mem = length(member);
%*****************************************
% Output
%*****************************************
% Staircase plot of arrival and departure instants of all admitted
% customers (stairs: 2-D step plot)
stairs([0 events(1,member)],0:len_mem);
hold on;
stairs([0 events(4,member)],0:len_mem,'.-r');
legend('到達(dá)時間 ','離開時間 ');
hold off;
grid on;
% Waiting-time and sojourn-time curves of all admitted customers
% (plot: 2-D line plot)
figure;
plot(1:len_mem,events(3,member),'r-*',1: len_mem,events(2,member)+events(3,member),'k-');
legend('等待時間 ','停留時間 ');
grid on;