Scrapy Crawlers and Machine Learning, Part 3: Predicting a Home's Listing Price
Building on the earlier crawl of all 2,453 second-hand listings that a real-estate agency carries for one district, this article uses machine-learning regression models, starting from linear regression, to predict the price at which a friend plans to list his home. Comparing several models on the absolute-error metric MAE, the best performer was the Gradient Boosting Regressor, with MAE = 1,837 yuan per square meter. Feeding the friend's home details into this model gives a predicted listing price of 75,064.67 yuan per square meter, roughly 10% below last year's market listing price.
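A note on the metric before the code: what the notebook prints as "MAE" is scikit-learn's median_absolute_error evaluated on a held-out test set. As a minimal sketch of the comparison loop (the fitted_models dictionary and its names are illustrative, not variables from the notebook):

from sklearn.metrics import median_absolute_error

def compare_models_by_mae(fitted_models, X_test, y_test):
    # return {model name: median absolute error on the held-out set}
    return {name: median_absolute_error(y_test, model.predict(X_test))
            for name, model in fitted_models.items()}

# e.g. compare_models_by_mae({'ridge': ridge, 'gboost': GBoost}, X_test, y_test)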
Code:
# import some necessary libraries
import numpy as np   # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt  # Matlab-style plotting
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn #ignore annoying warning (from sklearn and seaborn)
from scipy import stats
from scipy.stats import norm, skew #for some statistics
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #Limiting floats output to 3 decimal points
from subprocess import check_output
# Read in the data file
dftrain = pd.read_csv(r'C:\Users\Guoli\Desktop\scrapyfolder\new_env\ajk\ajk0115.csv',encoding="gbk")
# Drop the total-price column
dftrain=dftrain.drop('totalprice',axis=1)
dftrain.head()
print("keys of df dataset:\n{}".format(dftrain.keys()))
from sklearn.model_selection import train_test_split
print("type of data:{}".format(type(dftrain['unitprice'])))
fig, ax = plt.subplots()
ax.scatter(x = dftrain['floorsize'], y = dftrain['unitprice'])
plt.ylabel('unitprice', fontsize=13)
plt.xlabel('floorsize', fontsize=13)
plt.show()
# Remove outliers: floor size above 400 square meters
dftrain = dftrain.drop(dftrain[(dftrain['floorsize']>400)].index)
#Check the graphic again
fig, ax = plt.subplots()
ax.scatter(dftrain['floorsize'], dftrain['unitprice'])
plt.ylabel('unitprice', fontsize=13)
plt.xlabel('floorsize', fontsize=13)
plt.show()
# Remove rows with a unit price above 100,000
dftrain = dftrain.drop(dftrain[(dftrain['unitprice']>100000)].index)
# Plot floor size against unit price again after filtering
fig, ax = plt.subplots()
ax.scatter(dftrain['floorsize'], dftrain['unitprice'])
plt.ylabel('unitprice', fontsize=13)
plt.xlabel('floorsize', fontsize=13)
plt.show()
# Examine the distribution of unit price
sns.distplot(dftrain['unitprice'] , fit=norm);
# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(dftrain['unitprice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
#Now plot the distribution
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('unitprice distribution')
#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(dftrain['unitprice'], plot=plt)
plt.show()
# Correlation coefficients
# Correlation map to see how features are correlated with unitprice
corrmat = dftrain.corr()
plt.subplots(figsize=(12,9))
sns.heatmap(corrmat, vmax=0.9, square=True)
dftrain.corr()#correlations of features
# One-hot encode the categorical features
dftraindummy = pd.get_dummies(dftrain)
# Bin floorsize into quantile-based groups
bins = [0,0.1,0.5,0.9,1]
groupnames = ['floorsize1','floorsize2','floorsize3','floorsize4']
floorsize_binned = pd.get_dummies(pd.qcut(dftraindummy['floorsize'],bins,labels=groupnames))
dftrainbined = pd.concat([dftraindummy,floorsize_binned],axis=1)
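Passing a list of quantiles to pd.qcut splits floorsize at the 10th, 50th and 90th percentiles rather than at fixed sizes. A quick way to check which bin edges were actually used (a small sketch against the same dftraindummy frame) is:

# inspect the quantile bin edges used for floorsize
_, floorsize_edges = pd.qcut(dftraindummy['floorsize'], q=[0, 0.1, 0.5, 0.9, 1], retbins=True)
print(floorsize_edges)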
# Correlation map to see how features are correlated with unitprice
corrmat = dftrainbined.corr()
plt.subplots(figsize=(12,9))
sns.heatmap(corrmat, vmax=0.9, square=True)
# Prepare the features and target for the train/test split
X = dftrainbined.iloc[:,dftrainbined.columns!='unitprice']
y = dftrainbined['unitprice'].values
# Predictive modeling
import numpy as np
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
# Linear regression model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)  # with random_state=42, the test score is 0.79
lr = LinearRegression().fit(X_train, y_train)
# print("Training set score:{:.2f}".format(lr.score(X_train,y_train)))
# print('Test train score:{:.2f}'.format(grid_search.score(X_train,y_train)))
print("Train set score:{:.2f}".format(lr.score(X_train,y_train)))
print("Test set score:{:.2f}".format(lr.score(X_test,y_test)))
mse = mean_squared_error(y_test, lr.predict(X_test))
print("MSE: %.4f" % mse)
# Ridge model
from sklearn.linear_model import Ridge
ridge = Ridge().fit(X_train, y_train)#alpha=1.0 by default
print("Ridge Training set score:{:.2f}".format(ridge.score(X_train, y_train)))
print("Ridge Test set score:{:.2f}".format(ridge.score(X_test, y_test)))
from sklearn.model_selection import cross_val_score
# Ridge tuning with cross-validated grid search
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)  # with random_state=42, the test score is 0.79
ridge = Ridge()
param_grid={'alpha':[0.001,0.01,0.1,1]}
grid_search = GridSearchCV(ridge, param_grid, cv=5)
grid_search.fit(X_train,y_train)
print('Test set score:{:.2f}'.format(grid_search.score(X_test,y_test)))
print("Best parameters:{}".format(grid_search.best_params_))
print('Train set score:{:.2f}'.format(grid_search.score(X_train,y_train)))
print('Best cross-validation score:{:.2f}'.format(grid_search.best_score_))
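Beyond the single best alpha, it can help to see how the cross-validated score moves across the whole grid. One option (a sketch that reuses the grid_search object fitted above) is to inspect cv_results_:

# per-alpha cross-validation results from the grid search
cv_results = pd.DataFrame(grid_search.cv_results_)
print(cv_results[['param_alpha', 'mean_test_score', 'std_test_score']])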
# Lasso model
# L1 regularization, some coefficients are exactly zero.
from sklearn.linear_model import Lasso
lasso = Lasso(max_iter=1000000)
from sklearn.model_selection import GridSearchCV
param_grid={'alpha':[0.001,0.01,0.1,1]}
grid_search = GridSearchCV(lasso, param_grid,cv=5)
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)
grid_search.fit(X_train,y_train)
print('Test set score:{:.2f}'.format(grid_search.score(X_test,y_test)))
print("Best parameters:{}".format(grid_search.best_params_))
print('Train set score:{:.2f}'.format(grid_search.score(X_train,y_train)))
print('Best cross-validation score:{:.2f}'.format(grid_search.best_score_))
from sklearn.metrics import median_absolute_error
lasso = grid_search.fit(X_train, y_train)
actuals = y_test
predictions = lasso.predict(X_test)
maelasso = median_absolute_error(actuals, predictions)
print("MAE: %.4f" % maelasso)  # median absolute error
# Lasso zeroes out some features and its score is not as good as Ridge, so let's try another combination
# ElasticNet model
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
ENet = ElasticNet(l1_ratio=.9,max_iter=1000000, random_state=42)
param_grid={'alpha':[0.001,0.01,0.1,1]}
grid_search = GridSearchCV(ENet, param_grid,cv=5)
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)
grid_search.fit(X_train,y_train)
print('Train set score:{:.2f}'.format(grid_search.score(X_train,y_train)))
print('Test set score:{:.2f}'.format(grid_search.score(X_test,y_test)))
enet = grid_search.fit(X_train, y_train)
actuals = y_test
predictions = enet.predict(X_test)
maeenet = median_absolute_error(actuals, predictions)
print("MAE: %.4f" % maeenet)  # median absolute error
# Gradient Boosting model
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kfold = KFold(n_splits=5)
GBoost = GradientBoostingRegressor(n_estimators=500, learning_rate=0.1, max_depth=10, max_features='sqrt',
                                   min_samples_leaf=5, min_samples_split=5,
                                   loss='huber', random_state=42)
# param_grid={'n_estimators':[100,500,1000,2000,30000]}
# grid_search = GridSearchCV(GBoost, param_grid,cv=5)
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)
GBoost.fit(X_train,y_train)
print("cross-validation train scores:\n{}".format(np.mean(cross_val_score(GBoost,X_train,y_train,cv=kfold))))
print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(GBoost,X_test,y_test,cv=kfold))))
mse = mean_squared_error(y_test, GBoost.predict(X_test))
print("MSE: %.4f" % mse)
from sklearn.metrics import mean_squared_log_error
msle = mean_squared_log_error(y_test, GBoost.predict(X_test))
print("MSLE: %.4f" % msle)
from sklearn.metrics import median_absolute_error
mae = median_absolute_error(y_test, GBoost.predict(X_test))
print("MAE: %.4f" % mae)#平均絕對差
# GradientBoostingRegressor parameter tuning
# After tuning, the GradientBoostingRegressor reaches MAE = 1795.62 yuan/m2, the best of all the models tried
params = {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 5,
          'learning_rate': 0.1, 'loss': 'ls'}
clfr = ensemble.GradientBoostingRegressor(**params)
clfr.fit(X_train, y_train)
print("cross-validation train scores:\n{}".format(np.mean(cross_val_score(clfr,X_train,y_train,cv=kfold))))
print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clfr,X_test,y_test,cv=kfold))))
mse = mean_squared_error(y_test, clfr.predict(X_test))
print("MSE: %.4f" % mse)
mae = median_absolute_error(y_test, clfr.predict(X_test))
print("MAE: %.4f" % mae)#平均絕對差
# compute test set deviance
test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
for i, y_pred in enumerate(clfr.staged_predict(X_test)):
    test_score[i] = clfr.loss_(y_test, y_pred)
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title('Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, clfr.train_score_, 'b-',
         label='Training Set Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')
# #############################################################################
# Plot feature importance
feature_importance = clfr.feature_importances_
# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.subplot(1, 2, 2)
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, X.columns[sorted_idx])  # label with the feature columns actually used for fitting (unitprice excluded)
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()
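A quick textual check of the most important features (a sketch reusing the fitted clfr and the feature frame X) can complement the chart:

# top 10 features by importance, highest first
top_features = pd.Series(clfr.feature_importances_, index=X.columns).sort_values(ascending=False).head(10)
print(top_features)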
# <h1>預(yù)測房屋的擬掛牌價格76293元/平方米先蒋,誤差在1795.62元/平方米</h1>
# In[259]:
import numpy as np
X2 =np.array([3,2004,155,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0])
X3=X2.reshape(1, -1)
pred_clfr=clfr.predict(X3)
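The hand-built X2 array has to match the column order of the training frame exactly, which is easy to get wrong. A less fragile alternative (a sketch assuming the same fitted clfr and feature frame X; the field values shown are placeholders for the ones you actually know about the home) is to build a one-row DataFrame with the training columns:

# start from an all-zero row with the training feature columns, then fill in the known fields
row = pd.DataFrame([[0] * X.shape[1]], columns=X.columns)
row.loc[0, 'floorsize'] = 155   # placeholder value; the remaining dummy columns stay 0
print(clfr.predict(row))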
# LightGBM model
import lightgbm as lgb
gbm1 = lgb.LGBMRegressor(objective='regression',
                         num_leaves=31,
                         learning_rate=0.05,
                         n_estimators=1000)
# note: lightgbm >= 4.0 removed early_stopping_rounds from fit(); use callbacks=[lgb.early_stopping(5)] there instead
gbm1.fit(X_train, y_train,
         eval_set=[(X_test, y_test)],
         eval_metric='l1',
         early_stopping_rounds=5)
print('Start predicting...')
y_pred = gbm1.predict(X_test)
# eval
#print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
maelgbm = median_absolute_error(y_test, y_pred)
print("MAE: %.4f" % maelgbm)#平均絕對差
print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(gbm1,X_test,y_test,cv=5))))
# XGBoost model
import pickle
import xgboost as xgb
import numpy as np
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
kf = KFold(n_splits=2, shuffle=True, random_state=42)
for train_index, test_index in kf.split(X):
    # fit on the training fold and evaluate on the held-out fold
    xgb_model = xgb.XGBRegressor().fit(X.iloc[train_index], y[train_index])
    predictions = xgb_model.predict(X.iloc[test_index])
    actuals = y[test_index]
    print(mean_squared_error(actuals, predictions))
print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(xgb_model,X_test,y_test,cv=kfold))))
maelxgb_model = median_absolute_error(actuals, predictions)
print("MAE: %.4f" % maelxgb_model)#平均絕對差
# early stopping needs a validation set to monitor, passed via eval_set
xgb_model2 = xgb.XGBRegressor(n_estimators=1000, early_stopping_rounds=5, learning_rate=0.05, n_jobs=-1)
clf = xgb_model2.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
predictions = clf.predict(X_test)
actuals = y_test
print(mean_squared_error(actuals, predictions))
print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clf,X_test,y_test,cv=kfold))))
maeclf = median_absolute_error(actuals, predictions)
print("MAE: %.4f" % maeclf)#平均絕對差
xgb_model3 = xgb.XGBRegressor(n_estimators=2000, learning_rate=0.01, max_depth=2, n_jobs=-1)  # a smaller learning rate gives a higher score
clf2 = xgb_model3.fit(X_train,y_train)
predictions = clf2.predict(X_test)
actuals = y_test
print(mean_squared_error(actuals, predictions))
print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clf2,X_test,y_test,cv=kfold))))
maeclf2 = median_absolute_error(actuals, predictions)
print("MAE: %.4f" % maeclf2)#平均絕對差
xgb_model4 = xgb.XGBRegressor(n_estimators= 1000,learning_rate=0.1,max_depth=3,gamma=0.01,n_jobs=-1)
clf3 = xgb_model4.fit(X_train,y_train)
predictions = clf3.predict(X_test)
actuals = y_test
print(mean_squared_error(actuals, predictions))
print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clf3,X_test,y_test,cv=5))))
maeclf3 = median_absolute_error(actuals, predictions)
print("MAE: %.4f" % maeclf3)#平均絕對差
# Blend the tuned GradientBoosting prediction with the LightGBM prediction for the target home
pred_gbm1 = gbm1.predict(X3)
ensemble3 = pred_clfr*0.3 + pred_gbm1*0.7
ensemble3
# Save the best-performing GBM model
from sklearn.externals import joblib  # on newer scikit-learn versions, use `import joblib` instead
model1=clfr
filename = 'GBMclfr_finalized_model0115.sav'
joblib.dump(model1, filename)
# some time later...
# load the model from disk
# loaded_model = joblib.load(filename)
# result = loaded_model.score(X_test, Y_test)
# print(result)
# #The sklearn API models are picklable
# print("Pickling sklearn API models")
# # must open in binary format to pickle
# pickle.dump(clf3, open("XGB0918NYCTaxi.pkl", "wb"))
# # clf2 = pickle.load(open("best_boston.pkl", "rb"))
# # print(np.allclose(clf.predict(X), clf2.predict(X)))