Scrapy Crawlers and Machine Learning, Part 3: Predicting a Home's Listing Price
Building on the earlier crawl of all 2,453 second-hand listings that a real-estate agency carries for one district, this article uses machine-learning regression models, starting from linear regression, to predict the price at which a friend plans to list his home. Comparing several models on the absolute-error metric MAE, the best performer was the Gradient Boosting Regressor, with MAE = 1,837 yuan per square meter. Feeding the friend's home details into this model gives a predicted listing price of 75,064.67 yuan per square meter, roughly 10% below last year's market listing price.
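A note on the metric before the code: what the notebook prints as "MAE" is scikit-learn's median_absolute_error evaluated on a held-out test set. As a minimal sketch of the comparison loop (the fitted_models dictionary and its names are illustrative, not variables from the notebook):

from sklearn.metrics import median_absolute_error

def compare_models_by_mae(fitted_models, X_test, y_test):
    # return {model name: median absolute error on the held-out set}
    return {name: median_absolute_error(y_test, model.predict(X_test))
            for name, model in fitted_models.items()}

# e.g. compare_models_by_mae({'ridge': ridge, 'gboost': GBoost}, X_test, y_test)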
Code:
# import some necessary libraries
import numpy as np   # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt  # Matlab-style plotting
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn #ignore annoying warning (from sklearn and seaborn)
from scipy import stats
from scipy.stats import norm, skew #for some statistics
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #Limiting floats output to 3 decimal points
from subprocess import check_output
# Read in the data file
dftrain = pd.read_csv(r'C:\Users\Guoli\Desktop\scrapyfolder\new_env\ajk\ajk0115.csv',encoding="gbk")
# Drop the total-price column
dftrain=dftrain.drop('totalprice',axis=1)
dftrain.head()
print("keys of df dataset:\n{}".format(dftrain.keys()))
from sklearn.model_selection import train_test_split
print("type of data:{}".format(type(dftrain['unitprice'])))
fig, ax = plt.subplots()
ax.scatter(x = dftrain['floorsize'], y = dftrain['unitprice'])
plt.ylabel('unitprice', fontsize=13)
plt.xlabel('floorsize', fontsize=13)
plt.show()
# Remove outliers: floor size above 400 square meters
dftrain = dftrain.drop(dftrain[(dftrain['floorsize']>400)].index)
#Check the graphic again
fig, ax = plt.subplots()
ax.scatter(dftrain['floorsize'], dftrain['unitprice'])
plt.ylabel('unitprice', fontsize=13)
plt.xlabel('floorsize', fontsize=13)
plt.show()
# Remove rows with a unit price above 100,000
dftrain = dftrain.drop(dftrain[(dftrain['unitprice']>100000)].index)
# Plot floor size against unit price again after filtering
fig, ax = plt.subplots()
ax.scatter(dftrain['floorsize'], dftrain['unitprice'])
plt.ylabel('unitprice', fontsize=13)
plt.xlabel('floorsize', fontsize=13)
plt.show()
# Examine the distribution of unit price
sns.distplot(dftrain['unitprice'] , fit=norm);
# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(dftrain['unitprice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
#Now plot the distribution
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('unitprice distribution')
#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(dftrain['unitprice'], plot=plt)
plt.show()
# Correlation coefficients
# Correlation map to see how features are correlated with unitprice
corrmat = dftrain.corr()
plt.subplots(figsize=(12,9))
sns.heatmap(corrmat, vmax=0.9, square=True)
dftrain.corr()#correlations of features
# One-hot encode the categorical features
dftraindummy = pd.get_dummies(dftrain)
# Bin floorsize into quantile-based groups
bins = [0,0.1,0.5,0.9,1]
groupnames = ['floorsize1','floorsize2','floorsize3','floorsize4']
floorsize_binned = pd.get_dummies(pd.qcut(dftraindummy['floorsize'],bins,labels=groupnames))
dftrainbined = pd.concat([dftraindummy,floorsize_binned],axis=1)
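Passing a list of quantiles to pd.qcut splits floorsize at the 10th, 50th and 90th percentiles rather than at fixed sizes. A quick way to check which bin edges were actually used (a small sketch against the same dftraindummy frame) is:

# inspect the quantile bin edges used for floorsize
_, floorsize_edges = pd.qcut(dftraindummy['floorsize'], q=[0, 0.1, 0.5, 0.9, 1], retbins=True)
print(floorsize_edges)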
# Correlation map to see how features are correlated with unitprice
corrmat = dftrainbined.corr()
plt.subplots(figsize=(12,9))
sns.heatmap(corrmat, vmax=0.9, square=True)
# Prepare the features and target for the train/test split
X = dftrainbined.iloc[:,dftrainbined.columns!='unitprice']
y = dftrainbined['unitprice'].values
# Predictive modeling
import numpy as np
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
# Linear regression model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)  # with random_state=42, the test score is 0.79
lr = LinearRegression().fit(X_train, y_train)
# print("Training set score:{:.2f}".format(lr.score(X_train,y_train)))
# print('Test train score:{:.2f}'.format(grid_search.score(X_train,y_train)))
print("Train set score:{:.2f}".format(lr.score(X_train,y_train)))
print("Test set score:{:.2f}".format(lr.score(X_test,y_test)))
mse = mean_squared_error(y_test, lr.predict(X_test))
print("MSE: %.4f" % mse)
# Ridge model
from sklearn.linear_model import Ridge
ridge = Ridge().fit(X_train, y_train)#alpha=1.0 by default
print("Ridge Training set score:{:.2f}".format(ridge.score(X_train, y_train)))
print("Ridge Test set score:{:.2f}".format(ridge.score(X_test, y_test)))
from sklearn.model_selection import cross_val_score
# Ridge tuning with cross-validated grid search
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)  # with random_state=42, the test score is 0.79
ridge = Ridge()
param_grid={'alpha':[0.001,0.01,0.1,1]}
grid_search = GridSearchCV(ridge, param_grid, cv=5)
grid_search.fit(X_train,y_train)
print('Test set score:{:.2f}'.format(grid_search.score(X_test,y_test)))
print("Best parameters:{}".format(grid_search.best_params_))
print('Train set score:{:.2f}'.format(grid_search.score(X_train,y_train)))
print('Best cross-validation score:{:.2f}'.format(grid_search.best_score_))
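Beyond the single best alpha, it can help to see how the cross-validated score moves across the whole grid. One option (a sketch that reuses the grid_search object fitted above) is to inspect cv_results_:

# per-alpha cross-validation results from the grid search
cv_results = pd.DataFrame(grid_search.cv_results_)
print(cv_results[['param_alpha', 'mean_test_score', 'std_test_score']])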
# Lasso model
# L1 regularization, some coefficients are exactly zero.
from sklearn.linear_model import Lasso
lasso = Lasso(max_iter=1000000)
from sklearn.model_selection import GridSearchCV
param_grid={'alpha':[0.001,0.01,0.1,1]}
grid_search = GridSearchCV(lasso, param_grid,cv=5)
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)
grid_search.fit(X_train,y_train)
print('Test set score:{:.2f}'.format(grid_search.score(X_test,y_test)))
print("Best parameters:{}".format(grid_search.best_params_))
print('Train set score:{:.2f}'.format(grid_search.score(X_train,y_train)))
print('Best cross-validation score:{:.2f}'.format(grid_search.best_score_))
from sklearn.metrics import median_absolute_error
lasso = grid_search.fit(X_train, y_train)
actuals = y_test
predictions = lasso.predict(X_test)
maelasso = median_absolute_error(actuals, predictions)
print("MAE: %.4f" % maelasso)  # median absolute error
# Lasso zeroes out some features and its score is not as good as Ridge, so let's try another combination
# ElasticNet model
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
ENet = ElasticNet(l1_ratio=.9,max_iter=1000000, random_state=42)
param_grid={'alpha':[0.001,0.01,0.1,1]}
grid_search = GridSearchCV(ENet, param_grid,cv=5)
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)
grid_search.fit(X_train,y_train)
print('Train set score:{:.2f}'.format(grid_search.score(X_train,y_train)))
print('Test set score:{:.2f}'.format(grid_search.score(X_test,y_test)))
enet = grid_search.fit(X_train, y_train)
actuals = y_test
predictions = enet.predict(X_test)
maeenet = median_absolute_error(actuals, predictions)
print("MAE: %.4f" % maeenet)  # median absolute error
# Gradient Boosting model
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kfold = KFold(n_splits=5)
GBoost = GradientBoostingRegressor(n_estimators=500, learning_rate=0.1, max_depth=10, max_features='sqrt',
                                   min_samples_leaf=5, min_samples_split=5,
                                   loss='huber', random_state=42)
# param_grid={'n_estimators':[100,500,1000,2000,30000]}
# grid_search = GridSearchCV(GBoost, param_grid,cv=5)
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)
GBoost.fit(X_train,y_train)
print("cross-validation train scores:\n{}".format(np.mean(cross_val_score(GBoost,X_train,y_train,cv=kfold))))
print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(GBoost,X_test,y_test,cv=kfold))))
mse = mean_squared_error(y_test, GBoost.predict(X_test))
print("MSE: %.4f" % mse)
from sklearn.metrics import mean_squared_log_error
msle = mean_squared_log_error(y_test, GBoost.predict(X_test))
print("MSLE: %.4f" % msle)
from sklearn.metrics import median_absolute_error
mae = median_absolute_error(y_test, GBoost.predict(X_test))
print("MAE: %.4f" % mae)#平均絕對差
# GradientBoostingRegressor parameter tuning
# After tuning, the GradientBoostingRegressor reaches MAE = 1795.62 yuan/m2, the best of all the models tried
params = {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 5,
          'learning_rate': 0.1, 'loss': 'ls'}
clfr = ensemble.GradientBoostingRegressor(**params)
clfr.fit(X_train, y_train)
print("cross-validation train scores:\n{}".format(np.mean(cross_val_score(clfr,X_train,y_train,cv=kfold))))
print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clfr,X_test,y_test,cv=kfold))))
mse = mean_squared_error(y_test, clfr.predict(X_test))
print("MSE: %.4f" % mse)
mae = median_absolute_error(y_test, clfr.predict(X_test))
print("MAE: %.4f" % mae)#平均絕對差
# compute test set deviance
test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
for i, y_pred in enumerate(clfr.staged_predict(X_test)):
    test_score[i] = clfr.loss_(y_test, y_pred)
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title('Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, clfr.train_score_, 'b-',
         label='Training Set Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')
# #############################################################################
# Plot feature importance
feature_importance = clfr.feature_importances_
# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.subplot(1, 2, 2)
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, X.columns[sorted_idx])  # label with the feature columns actually used for fitting (unitprice excluded)
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()
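A quick textual check of the most important features (a sketch reusing the fitted clfr and the feature frame X) can complement the chart:

# top 10 features by importance, highest first
top_features = pd.Series(clfr.feature_importances_, index=X.columns).sort_values(ascending=False).head(10)
print(top_features)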
# <h1>預(yù)測房屋的擬掛牌價格76293元/平方米先蒋,誤差在1795.62元/平方米</h1>
# In[259]:
import numpy as np
X2 =np.array([3,2004,155,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0])
X3=X2.reshape(1, -1)
pred_clfr=clfr.predict(X3)
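The hand-built X2 array has to match the column order of the training frame exactly, which is easy to get wrong. A less fragile alternative (a sketch assuming the same fitted clfr and feature frame X; the field values shown are placeholders for the ones you actually know about the home) is to build a one-row DataFrame with the training columns:

# start from an all-zero row with the training feature columns, then fill in the known fields
row = pd.DataFrame([[0] * X.shape[1]], columns=X.columns)
row.loc[0, 'floorsize'] = 155   # placeholder value; the remaining dummy columns stay 0
print(clfr.predict(row))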
# LightGBM model
import lightgbm as lgb
gbm1 = lgb.LGBMRegressor(objective='regression',
                         num_leaves=31,
                         learning_rate=0.05,
                         n_estimators=1000)
# note: lightgbm >= 4.0 removed early_stopping_rounds from fit(); use callbacks=[lgb.early_stopping(5)] there instead
gbm1.fit(X_train, y_train,
         eval_set=[(X_test, y_test)],
         eval_metric='l1',
         early_stopping_rounds=5)
print('Start predicting...')
y_pred = gbm1.predict(X_test)
# eval
#print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
maelgbm = median_absolute_error(y_test, y_pred)
print("MAE: %.4f" % maelgbm)#平均絕對差
print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(gbm1,X_test,y_test,cv=5))))
# XGBoost model
import pickle
import xgboost as xgb
import numpy as np
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
kf = KFold(n_splits=2, shuffle=True, random_state=42)
for train_index, test_index in kf.split(X):
    # fit on the training fold and evaluate on the held-out fold
    xgb_model = xgb.XGBRegressor().fit(X.iloc[train_index], y[train_index])
    predictions = xgb_model.predict(X.iloc[test_index])
    actuals = y[test_index]
    print(mean_squared_error(actuals, predictions))
print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(xgb_model,X_test,y_test,cv=kfold))))
maelxgb_model = median_absolute_error(actuals, predictions)
print("MAE: %.4f" % maelxgb_model)#平均絕對差
# early stopping needs a validation set to monitor, passed via eval_set
xgb_model2 = xgb.XGBRegressor(n_estimators=1000, early_stopping_rounds=5, learning_rate=0.05, n_jobs=-1)
clf = xgb_model2.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
predictions = clf.predict(X_test)
actuals = y_test
print(mean_squared_error(actuals, predictions))
print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clf,X_test,y_test,cv=kfold))))
maeclf = median_absolute_error(actuals, predictions)
print("MAE: %.4f" % maeclf)#平均絕對差
xgb_model3 = xgb.XGBRegressor(n_estimators=2000, learning_rate=0.01, max_depth=2, n_jobs=-1)  # a smaller learning rate gives a higher score
clf2 = xgb_model3.fit(X_train,y_train)
predictions = clf2.predict(X_test)
actuals = y_test
print(mean_squared_error(actuals, predictions))
print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clf2,X_test,y_test,cv=kfold))))
maeclf2 = median_absolute_error(actuals, predictions)
print("MAE: %.4f" % maeclf2)#平均絕對差
xgb_model4 = xgb.XGBRegressor(n_estimators= 1000,learning_rate=0.1,max_depth=3,gamma=0.01,n_jobs=-1)
clf3 = xgb_model4.fit(X_train,y_train)
predictions = clf3.predict(X_test)
actuals = y_test
print(mean_squared_error(actuals, predictions))
print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clf3,X_test,y_test,cv=5))))
maeclf3 = median_absolute_error(actuals, predictions)
print("MAE: %.4f" % maeclf3)#平均絕對差
# Blend the tuned GradientBoosting prediction with the LightGBM prediction for the target home
pred_gbm1 = gbm1.predict(X3)
ensemble3 = pred_clfr*0.3 + pred_gbm1*0.7
ensemble3
# Save the best-performing GBM model
from sklearn.externals import joblib  # on newer scikit-learn versions, use `import joblib` instead
model1=clfr
filename = 'GBMclfr_finalized_model0115.sav'
joblib.dump(model1, filename)
# some time later...
# load the model from disk
# loaded_model = joblib.load(filename)
# result = loaded_model.score(X_test, Y_test)
# print(result)
# #The sklearn API models are picklable
# print("Pickling sklearn API models")
# # must open in binary format to pickle
# pickle.dump(clf3, open("XGB0918NYCTaxi.pkl", "wb"))
# # clf2 = pickle.load(open("best_boston.pkl", "rb"))
# # print(np.allclose(clf.predict(X), clf2.predict(X)))