In data processing we often need a quick way to see how variables relate to each other, and to ask whether history can be used to predict the future.
This process of analyzing the "past" to predict the "future" is called regression; in some contexts it is also called fitting.
Common approaches split into linear and nonlinear regression. In this article, ridge regression represents the linear family and Gaussian process regression represents the nonlinear family; a neural-network-based regressor is also introduced along the way.
Data preparation
For simplicity, the base data for the analysis is a one-dimensional sine function with Gaussian noise; the construction code is as follows:
import numpy as np
import matplotlib.pyplot as plt
# define the data: a noisy sine over [0, 10]
# x is reshaped to (n_samples, 1) because sklearn estimators expect 2-D inputs
x_data = np.linspace(0, 10, 100).reshape(-1, 1)
y_data = np.sin(x_data).ravel() + np.random.normal(0, 0.1, 100)
Training on all of the data risks overfitting, so we split off 80% of the points as a training set and keep the remaining 20% as a test set:
rng = np.random.RandomState(0)
# draw 80 of the 100 points (without replacement) as the training set
training_sample_indices = rng.choice(np.arange(0, 100), size=80, replace=False)
training_data = x_data[training_sample_indices]
# add extra measurement noise to the training targets
training_noisy_target = y_data[training_sample_indices] + 0.2 * rng.randn(
    len(training_sample_indices)
)
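The remaining 20% of the points can be collected explicitly as a hold-out set. A minimal sketch (the test_mask / test_data names are our own, introduced for illustration):
test_mask = np.ones(len(x_data), dtype=bool)
test_mask[training_sample_indices] = False
test_data = x_data[test_mask]    # held-out inputs, never used for fitting
test_target = y_data[test_mask]  # held-out targets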
Plotting the raw data:
plt.figure("Raw data")
plt.plot(x_data, y_data, 'bo', label='gt signal')
plt.scatter(
training_data,
training_noisy_target,
color="black",
label="Noisy measurements",
)
plt.legend()
plt.xlabel("data")
plt.ylabel("target")
_ = plt.title("Raw data with noisy training measurements")
plt.show()
Linear regression
Ridge regression
#%% linear model
from sklearn.linear_model import Ridge
rng = np.random.RandomState(0)
# re-draw a smaller training set (60 points) for this section
training_sample_indices = rng.choice(np.arange(0, 100), size=60, replace=False)
training_data = x_data[training_sample_indices]
training_noisy_target = y_data[training_sample_indices] + 0.2 * rng.randn(
len(training_sample_indices)
)
ridge = Ridge().fit(training_data, training_noisy_target)  # plain L2-regularized linear fit
plt.figure("Ridge Regression")
plt.plot(x_data, y_data, 'bo', label='gt signal')
plt.scatter(
training_data,
training_noisy_target,
color="black",
label="Noisy measurements",
)
plt.plot(x_data, ridge.predict(x_data), label="Ridge regression")
plt.legend()
plt.xlabel("data")
plt.ylabel("target")
_ = plt.title("Limitation of a linear model such as ridge")
plt.show()
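To make the limitation concrete, the straight-line fit can be scored on the points not used for training. A quick check (recomputing the hold-out mask for this section's 60-point draw; mean_squared_error is from sklearn.metrics):
from sklearn.metrics import mean_squared_error
# points not used to train this ridge model
test_mask = np.ones(len(x_data), dtype=bool)
test_mask[training_sample_indices] = False
mse_ridge = mean_squared_error(y_data[test_mask], ridge.predict(x_data[test_mask]))
print(f"Ridge hold-out MSE: {mse_ridge:.3f}")
A straight line cannot follow the sine, so this error stays large regardless of regularization.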
Adding a kernel gives linear regression a qualitative leap in capability:
#%% kernel ridge
from sklearn.gaussian_process.kernels import ExpSineSquared
from sklearn.kernel_ridge import KernelRidge
rng = np.random.RandomState(0)
training_sample_indices = rng.choice(np.arange(0, 100), size=60, replace=False)
training_data = x_data[training_sample_indices]
training_noisy_target = y_data[training_sample_indices] + 0.2 * rng.randn(
len(training_sample_indices)
)
kernel_ridge = KernelRidge(kernel=ExpSineSquared())  # a periodic kernel suits a sine signal
kernel_ridge.fit(training_data, training_noisy_target)
plt.figure("Kernal Ridge Regression")
plt.plot(x_data, y_data, 'bo', label='gt signal')
plt.scatter(
training_data,
training_noisy_target,
color="black",
label="Noisy measurements",
)
plt.plot(
x_data,
kernel_ridge.predict(x_data),
label="Kernel ridge",
linewidth=2,
linestyle="dashdot",
)
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV
# the default kernel hyperparameters are rarely optimal; search over them randomly
param_distributions = {
"alpha": loguniform(1e0, 1e3),
"kernel__length_scale": loguniform(1e-2, 1e2),
"kernel__periodicity": loguniform(1e0, 1e1),
}
kernel_ridge_tuned = RandomizedSearchCV(
kernel_ridge,
param_distributions=param_distributions,
n_iter=500,
random_state=0,
)
kernel_ridge_tuned.fit(training_data, training_noisy_target)
predictions_kr = kernel_ridge_tuned.predict(x_data)
plt.plot(
x_data,
predictions_kr,
label="Kernel ridge tuned hyperparameters",
linewidth=2,
linestyle="dashdot",
)
plt.legend(loc="lower right")
plt.xlabel("data")
plt.ylabel("target")
_ = plt.title("Kernel ridge regression with an exponential sine squared kernel")
plt.show()
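The winning hyperparameters can be read back from the search object via RandomizedSearchCV's standard attributes:
# inspect the best hyperparameters and cross-validated score
print(kernel_ridge_tuned.best_params_)
print(f"best CV score (R^2): {kernel_ridge_tuned.best_score_:.3f}")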
Nonlinear regression
Gaussian process regression
Gaussian process regression is also built on a kernel: the sampled points are treated as observations of a Gaussian process, which yields both a mean prediction and a standard deviation (an uncertainty band) at every query point.
#%% gaussian process
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel
kernel = 1.0 * ExpSineSquared(1.0, 5.0, periodicity_bounds=(1e-2, 1e1)) + WhiteKernel(
    1e-1
)  # a periodic kernel plus a white-noise term for the measurement noise
gaussian_process = GaussianProcessRegressor(kernel=kernel)
gaussian_process.fit(training_data, training_noisy_target)
mean_predictions_gpr, std_predictions_gpr = gaussian_process.predict(
x_data,
return_std=True,
)
plt.figure("Kernal Gaussian Regression")
plt.plot(x_data, y_data, label="True signal", linewidth=2, linestyle="dashed")
plt.scatter(
training_data,
training_noisy_target,
color="black",
label="Noisy measurements",
)
# Plot the predictions of the gaussian process regressor
plt.plot(
x_data,
mean_predictions_gpr,
label="Gaussian process regressor",
linewidth=2,
linestyle="dotted",
)
plt.fill_between(
x_data.ravel(),
mean_predictions_gpr - std_predictions_gpr,
mean_predictions_gpr + std_predictions_gpr,
color="tab:green",
alpha=0.2,
)
plt.legend(loc="lower right")
plt.xlabel("data")
plt.ylabel("target")
_ = plt.title("gaussian process regressor")
Neural network
Here we use sklearn's built-in multilayer perceptron (MLP) model and show its results.
#%% MLP
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
scaler = StandardScaler()
scaler.fit(training_data)  # fit the scaler on the training inputs only
training_data_scaled = scaler.transform(training_data)
# define the model
mlp = MLPRegressor(hidden_layer_sizes=(100,),  # one hidden layer with 100 neurons
                   max_iter=1000,              # maximum number of iterations
                   activation='relu',          # ReLU activation
                   solver='adam',              # Adam optimizer
                   alpha=0.00001,              # L2 regularization strength
                   random_state=42)
# train
mlp.fit(training_data_scaled, training_noisy_target)
# predict
y_pred = mlp.predict(scaler.transform(x_data))
# plot result
plt.figure("MLP regression")
plt.plot(x_data, y_data, 'bo', label='gt signal')
plt.plot(x_data, y_pred, 'r-', label='pred result')
plt.scatter(
training_data,
training_noisy_target,
color="black",
label="Noisy measurements",
)
plt.legend()
plt.xlabel("data")
plt.ylabel("target")
_ = plt.title("MLP training result")
plt.show()
The computed result is shown below:
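As a rough quantitative complement to the plots, the fitted models can be compared against the noise-free sine in one pass. A sketch, assuming all of the estimators above were fit in the same session (the predictions dict is purely illustrative):
from sklearn.metrics import mean_squared_error
# compare each model's full-curve prediction with the clean sine
predictions = {
    "ridge": ridge.predict(x_data),
    "kernel ridge (tuned)": kernel_ridge_tuned.predict(x_data),
    "gaussian process": mean_predictions_gpr,
    "mlp": mlp.predict(scaler.transform(x_data)),
}
for name, pred in predictions.items():
    mse = mean_squared_error(np.sin(x_data).ravel(), pred)
    print(f"{name:22s} MSE vs clean sine: {mse:.4f}")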