# -*- coding: utf-8 -*-
"""
Created on Thu Jan 9 11:36:00 2020
@author: QIAOQICHAO258
"""
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
import codecs
import csv
import os
from math import radians, cos, sin, asin, sqrt
import math
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import minmax_scale
from sklearn.metrics import mean_absolute_error
# Maps the raw `bazhuayu_case.*` export column names to the Chinese column
# labels used by the rest of this script.  Every source column shares the same
# table prefix, so only the field part is spelled out below.
cols_dict = {
    'bazhuayu_case.' + field: label
    for field, label in (
        ('case_type', '案例類型'),
        ('case_source', '案例來源'),
        ('district', '區(qū)域'),
        ('community_sources', '案例源小區(qū)名稱'),
        ('flr_total_ind', '總樓層'),
        ('checked_time', '看房次數(shù)'),
        ('attention_time', '關(guān)注量'),
        ('browse_time', '瀏覽次數(shù)'),
        ('list_time', '掛牌時間'),
        ('transaction_time', '成交時間'),
        ('list_totalprice', '掛牌總價(萬元)'),
        ('list_unitprice', '掛牌單價(元/㎡)'),
        ('transaction_price', '成交總價(萬元)'),
        ('transaction_avg_price', '成交單價(元/㎡)'),
        ('community_price', '案例源小區(qū)均價(元/㎡)'),
        ('transaction_cycle', '成交周期'),
        ('price_adjustment_times', '調(diào)價次數(shù)'),
        ('longitude', '經(jīng)度'),
        ('latitude', '緯度'),
    )
}
# Module-level alias of math.pi, used by rad().
pi = math.pi
# Earth radius in kilometres (distance helpers multiply by 1000 to get metres).
# The identifier keeps its original spelling because other code refers to it.
EARTH_REDIUS = 6378.137
# Longitude coefficient: degrees of longitude corresponding to one metre.
LONGITUDE_COEF = 1.1e-05
# Latitude coefficient: degrees of latitude corresponding to one metre.
LATITUDE_COEF = 9e-06
# ---------------------------------------------------------------------------
# Load input data.  Everything below runs at import time and reads files via
# paths relative to the current working directory.
# ---------------------------------------------------------------------------
# City to process.
city = '南昌市'

# POI data file for the city.
path_poi = '高德POI/%s.csv' % city
with open(path_poi, 'r', encoding='utf-8') as f:
    poi = pd.read_csv(f)

# Community master list, restricted to the selected city.
path_community = '小區(qū)清單/八爪魚小區(qū)清單20191230.csv'
community = pd.read_csv(path_community, engine='python', encoding='utf-8')
community = community[community['城市'] == city]

# Top-100 developer and top-100 property-management company lists.
developer = pd.read_excel('百強物業(yè)開發(fā)商/百強開發(fā)商.xlsx')
property = pd.read_excel('百強物業(yè)開發(fā)商/百強物業(yè).xlsx')

# Mapping table for the city (read by CaseValue to map case rows to IDs).
yingshe = pd.read_excel('映射表/%s.xlsx' % city)

# Listing/transaction cases: the first directory entry returned by
# os.listdir is used; it is a '$'-separated text file and only the columns
# named in cols_dict are loaded.
txt_name = os.listdir('城市八爪魚數(shù)據(jù)/%s' % city)[0]
with open('城市八爪魚數(shù)據(jù)/%s/%s' % (city, txt_name), 'r', encoding='utf-8') as f:
    case_data = pd.read_csv(
        f,
        sep='$',
        error_bad_lines=False,
        usecols=cols_dict.keys(),
        low_memory=True,
    )

# Judicial-auction data.
fapai = pd.read_csv(
    '法拍數(shù)據(jù)/北京重慶大連哈爾濱_京東阿里法拍.csv',
    engine='python',
    encoding='utf8',
)
# ---------------------------------------------------------------------------
####################################################################################################
def mape(estimatory, X, y):
    """Return the mean absolute percentage error of ``estimatory`` on ``(X, y)``.

    Parameters
    ----------
    estimatory : object
        Fitted model exposing ``predict(X)``.
    X : array-like
        Feature matrix passed straight to ``predict``.
    y : array-like
        True target values (must be non-zero).

    Returns
    -------
    float
        ``mean(|y - y_pred| / |y|)``.  This is an error measure, so smaller
        values are better.
    """
    y_pred = estimatory.predict(X)
    # Divide by |y| so that negative targets cannot flip the sign of the
    # individual errors and cancel positive ones in the mean.
    return (np.abs(y - y_pred) / np.abs(y)).mean()
def rad(d):
    """Convert an angle ``d`` given in degrees to radians."""
    half_turn_degrees = 180.0
    return d * pi / half_turn_degrees
# Distance between two latitude/longitude points.
def getDistance1(lat1, lng1, lat2, lng2):
    """Return the haversine distance between two points, in metres.

    All arguments are decimal degrees; note the (latitude, longitude)
    argument order.  The arc length is scaled by ``EARTH_REDIUS`` and then
    by 1000.
    """
    lat1_rad = rad(lat1)
    lat2_rad = rad(lat2)
    lat_delta = lat1_rad - lat2_rad
    lng_delta = rad(lng1) - rad(lng2)
    haversine = (
        math.pow(math.sin(lat_delta / 2), 2)
        + math.cos(lat1_rad) * math.cos(lat2_rad) * math.pow(math.sin(lng_delta / 2), 2)
    )
    central_angle = 2 * math.asin(math.sqrt(haversine))
    return central_angle * EARTH_REDIUS * 1000
def getDistance(lon1, lat1, lon2, lat2):
    """Return the haversine distance between two points, in metres.

    All arguments are decimal degrees; note the (longitude, latitude)
    argument order.
    """
    # Convert decimal degrees to radians.
    lon1, lat1, lon2, lat2 = (radians(v) for v in (lon1, lat1, lon2, lat2))
    lon_delta = lon2 - lon1
    lat_delta = lat2 - lat1
    # Haversine formula.
    a = sin(lat_delta / 2) ** 2 + cos(lat1) * cos(lat2) * sin(lon_delta / 2) ** 2
    r = 6378.137  # Earth radius in kilometres
    return 2 * asin(sqrt(a)) * r * 1000
def select_facter_poi(temp_comm_longitude, temp_comm_latitude, poi_data, len_coef):
    """Return the rows of ``poi_data`` inside a box centred on a community.

    Parameters
    ----------
    temp_comm_longitude, temp_comm_latitude : float
        Centre of the box, in decimal degrees.
    poi_data : pandas.DataFrame
        Must contain numeric ``longitude`` and ``latitude`` columns.
    len_coef : float
        Half-width of the box in metres; converted to degrees with
        ``LONGITUDE_COEF`` / ``LATITUDE_COEF`` (degrees per metre).

    Returns
    -------
    pandas.DataFrame
        The rows strictly inside the box (boundaries excluded).
    """
    lon_half_width = LONGITUDE_COEF * len_coef
    # Use the latitude coefficient for the latitude band; the original code
    # reused LONGITUDE_COEF here and left LATITUDE_COEF unused.
    lat_half_width = LATITUDE_COEF * len_coef

    poi_lon = poi_data['longitude']
    poi_lat = poi_data['latitude']
    inside_box = (
        (temp_comm_longitude + lon_half_width > poi_lon)
        & (poi_lon > temp_comm_longitude - lon_half_width)
        & (temp_comm_latitude + lat_half_width > poi_lat)
        & (poi_lat > temp_comm_latitude - lat_half_width)
    )
    return poi_data[inside_box]
# Nearest-facility distance per POI category.
def get_turth_distance(poi_data_all, read_community_data):
    """Add one nearest-POI distance column per facility category.

    For every category in the table below, the POIs whose ``sub_category``
    matches one of the listed values are selected (government offices are
    additionally restricted by a name suffix), and for every community the
    minimum ``getDistance`` to those POIs is stored in a column named after
    the category.

    Parameters
    ----------
    poi_data_all : pandas.DataFrame
        POI table with ``sub_category``, ``name``, ``longitude`` and
        ``latitude`` columns.
    read_community_data : pandas.DataFrame
        Community table with the two coordinate columns read below.  It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        ``read_community_data`` with the added distance columns (metres, as
        returned by ``getDistance``).

    Notes
    -----
    When a category has no matching POI, every community gets ``0`` for it
    (behaviour kept from the original code).
    """
    dict_temp = {
        '體育館': ["['綜合體育館']"],
        '幼兒園': ["['幼兒園']"],
        '小學': ["['小學']", "['中學', '小學']", "['小學', '中學']"],
        '購物中心': ["['購物中心']"],
        '三級甲等醫(yī)院': ["['三級甲等醫(yī)院']"],
        '政府機關(guān)': ["['區(qū)縣級政府及事業(yè)單位']", "['省直轄市級政府及事業(yè)單位']"],
        '火車站': ["['火車站']"],
        '景區(qū)': ["['國家級景點']", "['省級景點']"],
        '公園': ["['公園']"],
        '高等院校': ["['高等院校']"],
        '地鐵站': ["['地鐵站']"],
        '飛機場': ["['飛機場']"],
        '城市中心': ["['城市中心']"],
        '城市廣場': ["['城市廣場']"],
    }

    # Community coordinates do not change between categories: parse them once.
    community_coords = [
        (float(lon), float(lat))
        for lon, lat in read_community_data[['高德經(jīng)度', '高德緯度']].values
    ]

    for key, value in dict_temp.items():
        # Every category is selected by membership in its sub-category list.
        category_mask = poi_data_all['sub_category'].isin(value)
        if key == '政府機關(guān)':
            # Government offices: keep only POIs whose name has this suffix.
            category_mask = category_mask & poi_data_all['name'].str.endswith('人民政府')
        elif len(value) == 1:
            # The original code echoed the filter only for single-value
            # categories; kept so the console output is unchanged.
            print(value)
        poi_data = poi_data_all[category_mask]
        print(value, 'poi數(shù)據(jù):', len(poi_data), '小區(qū)清單列表:', len(read_community_data))

        # Hoisted out of the community loop: the POI coordinates are the same
        # for every community in this category.
        poi_coords = [
            (float(lon), float(lat))
            for lon, lat in poi_data[['longitude', 'latitude']].values
        ]

        list_distance_min = []
        for comm_lon, comm_lat in tqdm(community_coords):
            nearest = min(
                (getDistance(comm_lon, comm_lat, poi_lon, poi_lat)
                 for poi_lon, poi_lat in poi_coords),
                default=0,
            )
            list_distance_min.append(nearest)
        read_community_data[key] = list_distance_min
    return read_community_data
# POI counts around each community.
def get_turth_number(poi_data_all, read_community_data):
    """Add one nearby-POI count column per category.

    For each category, POIs whose ``sub_category`` equals ``"['<name>']"``
    are pre-filtered with ``select_facter_poi`` (coefficient 2000) and those
    whose ``getDistance`` to the community is at most 1000 are counted.  The
    count is stored in a column named after the category;
    ``read_community_data`` is modified in place and returned.
    """
    category_names = ('商務(wù)寫字樓', '喪葬設(shè)施', '商場', '路口名', '公交車站相關(guān)', '工廠')
    community_rows = read_community_data[['省份', '樓盤名稱', '高德經(jīng)度', '高德緯度']].values

    for key in category_names:
        # Each category matches exactly one sub_category string.
        value = ["['%s']" % key]
        print(value)
        poi_data = poi_data_all[poi_data_all['sub_category'] == value[0]]
        print(value, 'poi數(shù)據(jù):', len(poi_data), '小區(qū)清單列表:', len(read_community_data))

        list_number = []
        for community_row in tqdm(community_rows):
            comm_lon = float(community_row[2])
            comm_lat = float(community_row[3])
            # Cheap bounding-box filter before the exact distance test.
            nearby = select_facter_poi(comm_lon, comm_lat, poi_data, 2000)
            candidates = nearby[['mid_category', 'sub_category', 'longitude', 'latitude', 'name']].values
            within_range = sum(
                1
                for poi_row in candidates
                if getDistance(comm_lon, comm_lat, float(poi_row[2]), float(poi_row[3])) <= 1000
            )
            list_number.append(within_range)
        read_community_data[key] = list_number
    return read_community_data
class CaseValue:
    """Aggregate listing/transaction cases into per-community statistics.

    Construction runs the whole pipeline: the module-level ``case_data`` is
    cleaned and mapped to community IDs (``self.df``), aggregated per
    community (``self.data``), left-joined onto the module-level
    ``community`` list and gap-filled (``self.result``).
    """

    def __init__(self, city_name):
        self.city_name = city_name
        self.df = self.read_and_process_data()
        self.data = self.get_info(self.df)
        self.result = self.match_c()
        self.result = self.fill_nan(self.result)

    def read_and_process_data(self):
        """Clean the raw case table and return one row per (community, price).

        Reads the module-level ``case_data``, ``cols_dict`` and ``yingshe``.
        Rows that cannot be mapped to a community ID, that have no usable
        date, or that are not later than the first day of 2019-01 are
        dropped.
        """
        year, month = 2019, 1

        df = case_data.copy()
        # The module-level cols_dict is the same mapping that selected the
        # columns when the file was read, so it is reused here.
        df.columns = df.columns.map(cols_dict)

        # Strip commas from community names.
        df['案例源小區(qū)名稱'] = df['案例源小區(qū)名稱'].astype(str).str.replace(',', '')

        # Map (district + community name) to the community ID.  The lookup
        # key is built locally so the shared ``yingshe`` frame is not mutated.
        mapping_key = yingshe['district'] + yingshe['community_sources']
        d = dict(zip(mapping_key, yingshe['man_community_id']))
        df['映射字段'] = df['區(qū)域'] + df['案例源小區(qū)名稱']
        df['匹配ID'] = df['映射字段'].map(d)
        n = df.shape[0]
        n_nan = int(df['匹配ID'].isna().sum())
        print('總數(shù)據(jù)量:%s, 可映射數(shù)據(jù)%s, 不可映射數(shù)據(jù)%s'%(n, n-n_nan, n_nan))
        df = df[df['匹配ID'].notna()].copy()

        # Case date: listing date when present, otherwise transaction date.
        when = df['掛牌時間'].fillna(df['成交時間'])
        # Values whose text form is longer than 10 characters are discarded.
        when = when.map(lambda x: np.nan if len(str(x)) > 10 else x)
        df['時間'] = pd.to_datetime(when, errors='coerce')
        df = df[df['時間'].notna()]
        df = df[df['時間'] > pd.to_datetime('%s-%s-01' % (year, month))].copy()
        # Reduce the date to a (year, month) bucket.
        df['時間'] = df['時間'].map(lambda x: (x.year, x.month))

        cols = ['案例源小區(qū)均價(元/㎡)', '掛牌總價(萬元)', '掛牌單價(元/㎡)', '成交總價(萬元)',
                '成交單價(元/㎡)', '總樓層']
        for col in cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # Price: transaction unit price, else 95% of the listing unit price.
        df['價格'] = df['成交單價(元/㎡)'].fillna(df['掛牌單價(元/㎡)'] * 0.95)

        df = df[['匹配ID', '案例源小區(qū)均價(元/㎡)', '掛牌總價(萬元)', '掛牌單價(元/㎡)', '成交總價(萬元)',
                 '成交單價(元/㎡)', '總樓層', '時間', '價格']]
        # Reassign instead of ``inplace=True`` so a slice is never modified.
        return df.drop_duplicates(subset=['匹配ID', '價格'])

    def get_info(self, df):
        """Return one row of statistics per community ID found in ``df``.

        Columns: community ID, number of rows with a transaction unit price,
        number of rows with a listing unit price, median floor count,
        community price and the mean relative month-to-month price change.
        """
        rows = []
        # Group by the scalar key (not a one-element list) so ``c_id`` is the
        # ID itself on every pandas version rather than a 1-tuple.
        for c_id, group in tqdm(df.groupby('匹配ID'), desc='掛牌數(shù)據(jù)計算中'):
            n_chengjiao = int(group['成交單價(元/㎡)'].notna().sum())
            n_guapai = int(group['掛牌單價(元/㎡)'].notna().sum())
            n_floor = group['總樓層'].median()

            # Prefer the source-reported community price; fall back to the
            # median case price.  Values below 1000 are treated as missing.
            price = group['案例源小區(qū)均價(元/㎡)'].median()
            if pd.isna(price):
                price = group['價格'].median()
            if price < 1000:
                price = np.nan

            # Only the price column is averaged per month; averaging the
            # whole frame fails on non-numeric columns in recent pandas.
            monthly_price = group.groupby('時間')['價格'].mean()
            rate = (monthly_price.diff() / monthly_price).mean()
            rows.append([c_id, n_chengjiao, n_guapai, n_floor, price, rate])
        return pd.DataFrame(rows, columns=['樓盤ID', '成交數(shù)量', '掛牌數(shù)量', '總樓層', '小區(qū)均價', '抗跌率'])

    def match_c(self):
        """Left-join the statistics onto the module-level community list."""
        return pd.merge(community[['樓盤ID']], self.data, on='樓盤ID', how='left')

    def fill_nan(self, result):
        """Fill gaps for communities without cases and return ``result``.

        Counts default to 0; floor count and price-change rate default to
        their 40th percentile.  The community price is left untouched.
        """
        # Assign the filled column back instead of using chained
        # ``fillna(inplace=True)``, which may act on a temporary copy.
        result['成交數(shù)量'] = result['成交數(shù)量'].fillna(0)
        result['掛牌數(shù)量'] = result['掛牌數(shù)量'].fillna(0)
        result['總樓層'] = result['總樓層'].fillna(result['總樓層'].quantile(0.4))
        result['抗跌率'] = result['抗跌率'].fillna(result['抗跌率'].quantile(0.4))
        return result
# Fill missing values in the community table.
def _first_mode(series):
    """Return the most frequent non-null value of ``series`` or ``None``."""
    modes = series.mode()
    return None if modes.empty else modes.iloc[0]


def community_data(df):
    """Return a copy of ``df`` with missing attributes filled in.

    Steps:

    1. Missing housing type is filled with the most frequent type.
    2. Missing greening rate / plot ratio is filled with the mean of the
       row's housing-type group.
    3. Land area and the two parking-space columns are coerced to numbers.
    4. Remaining gaps are filled: management form and build year with their
       most frequent value, land area / household count / greening rate /
       plot ratio with the column mean, parking spaces with 0.
    5. A parking ratio column, (above-ground + underground spaces) divided
       by household count, is added.

    The input frame is not modified.
    """
    df = df.copy()
    gp_col = '房屋類型(小類)'

    # Step 1.  The original took ``str()`` of the whole mode Series and
    # discarded the ``fillna`` result, so nothing was filled.
    housing_type_fill = _first_mode(df[gp_col])
    if housing_type_fill is not None:
        df[gp_col] = df[gp_col].fillna(housing_type_fill)

    # Step 2: group mean aligned to the rows, used only where the value is
    # missing.
    for col in ('綠化率', '容積率'):
        group_mean = df.groupby(gp_col)[col].transform('mean')
        df[col] = df[col].fillna(group_mean)

    # Step 3.
    for col in ('占地面積', '地上車位', '地下車位'):
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Step 4.
    fill_values = {
        '占地面積': df['占地面積'].mean(),
        '總戶數(shù)': df['總戶數(shù)'].mean(),
        '綠化率': df['綠化率'].mean(),
        '容積率': df['容積率'].mean(),
        '地上車位': 0,
        '地下車位': 0,
    }
    build_date_fill = _first_mode(df['建成年份'])
    if build_date_fill is not None:
        fill_values['建成年份'] = int(build_date_fill)
    manage_type_fill = _first_mode(df['管理形式'])
    if manage_type_fill is not None:
        fill_values['管理形式'] = manage_type_fill
    # A column with no usable data has no fill value; leave its gaps alone.
    fill_values = {k: v for k, v in fill_values.items() if pd.notna(v)}
    column_list = df.fillna(fill_values)

    # Step 5 (vectorised).
    column_list['車位比'] = (
        (column_list['地上車位'] + column_list['地下車位']) / column_list['總戶數(shù)']
    )
    return column_list
def value2score(data):
    """Convert raw indicator columns into 1-5 scores and category totals.

    For every column of ``data`` listed in ``usecols`` a ``<column>分數(shù)``
    score column is written, based on the column's own 15/30/50/75 %
    quantiles, and the score is added to the running total of the column's
    category (the category column is created on first use).  ``data`` is
    modified in place and returned.

    Score bands (flag 0: larger value scores higher; flag 1: smaller value
    scores higher):

        value >= q75        -> 5 (flag 0) / 1 (flag 1)
        q50 <= value < q75  -> 4 / 2
        q30 <= value < q50  -> 3 / 3
        q15 <= value < q30  -> 2 / 4
        value < q15         -> 1 / 5

    Missing values receive no score (NaN).
    """
    # column -> (flag, category); flag 1: smaller is better, 0: larger is better
    usecols = {
        '地鐵站': (1, '生活配套'),
        '建成年份': (0, '樓盤品質(zhì)'),
        '綠化率': (0, '樓盤品質(zhì)'),
        '容積率': (0, '宜居程度'),
        '體育館': (1, '宜居程度'),
        '幼兒園': (1, '宜居程度'),
        '總戶數(shù)': (0, '樓盤品質(zhì)'),
        '占地面積': (0, '樓盤品質(zhì)'),
        '小學': (1, '生活配套'),
        '購物中心': (1, '宜居程度'),
        '三級甲等醫(yī)院': (1, '生活配套'),
        '政府機關(guān)': (0, '生活配套'),
        '景區(qū)': (1, '宜居程度'),
        '公園': (1, '宜居程度'),
        '高等院校': (1, '區(qū)位狀況'),
        '城市中心': (1, '區(qū)位狀況'),
        '城市廣場': (0, '區(qū)位狀況'),
        '商務(wù)寫字樓': (0, '區(qū)位狀況'),
        '喪葬設(shè)施': (1, '不利因素'),
        '商場': (0, '生活配套'),
        '路口名': (0, '區(qū)位狀況'),
        '公交車站相關(guān)': (0, '區(qū)位狀況'),
        '工廠': (0, '不利因素'),
        '成交數(shù)量': (0, '活躍程度'),
        '掛牌數(shù)量': (0, '活躍程度'),
        '總樓層': (0, '樓盤品質(zhì)'),
        '抗跌率': (0, '活躍程度'),
        '車位比': (0, '樓盤品質(zhì)'),
        '加油站': (0, '不利因素'),
        '火車站': (0, '不利因素'),
        '機場': (0, '不利因素'),
        '行政區(qū)分數(shù)': (0, '區(qū)位狀況'),
    }

    # Build year: coerce to numbers and fill gaps with the median.  The
    # filled column is assigned back rather than filled through a chained
    # ``inplace=True`` call.
    data['建成年份'] = pd.to_numeric(data['建成年份'], errors='coerce')
    data['建成年份'] = data['建成年份'].fillna(data['建成年份'].quantile(0.5))

    # District value: mean community price of the row's district (0 when the
    # district has no price at all).
    distruct_value = data[['小區(qū)均價', '行政區(qū)']].groupby('行政區(qū)').mean().fillna(0)
    dv_dict = dict(zip(distruct_value.index, distruct_value['小區(qū)均價']))
    data['行政區(qū)分數(shù)'] = data['行政區(qū)'].map(dv_dict)

    # Iterate over a snapshot of the column names: the loop adds columns, and
    # ``DataFrame.iteritems`` no longer exists in pandas >= 2.0.
    for col_name in list(data.columns):
        if col_name not in usecols:
            continue
        t, cat = usecols[col_name]
        values = data[col_name]
        q75, q50, q30, q15 = (values.quantile(q) for q in (0.75, 0.50, 0.30, 0.15))

        score_col = col_name + '分數(shù)'
        # np.select takes the first matching condition, so each ``>=`` test
        # implicitly carries the "below the previous threshold" bound.
        data[score_col] = np.select(
            [values >= q75, values >= q50, values >= q30, values >= q15, values < q15],
            [t * -4 + 5, t * -2 + 4, 3, t * 2 + 2, t * 4 + 1],
            default=np.nan,
        )

        if cat in data.columns:
            data[cat] = pd.to_numeric(data[cat], errors='coerce') + data[score_col]
        else:
            data[cat] = data[score_col]
    return data
def get_label(data):
    """Attach descriptive label columns and a combined display label.

    Reads the module-level ``developer``, ``property`` and ``fapai`` tables.
    ``data`` is modified in place and returned.  Label columns hold a label
    text where the rule applies and NaN elsewhere; the final display column
    joins, per row, all non-missing values of the columns whose name
    contains the label marker, in column order.

    Differences from the original implementation:

    * an auction count of exactly 10 now receives the top auction label
      (the original tested ``> 10`` after ``< 10`` and left 10 unlabelled);
    * the "inactive listing" label is written to the activity label column;
      the original wrote it to an unrelated parking-label column;
    * auction counts are computed with one de-duplication and a lookup
      instead of filtering the auction table once per community.
    """
    developers = developer['公司名稱'].values
    propertys = property['公司名稱'].values

    # --- judicial auctions: distinct titles per community ID ---------------
    unique_auctions = fapai.drop_duplicates(subset=['匹配ID', 'title'])
    auction_counts = unique_auctions['匹配ID'].value_counts()
    data['法拍數(shù)量'] = data['樓盤ID'].map(auction_counts).fillna(0).astype(int)
    data.loc[(data['法拍數(shù)量'] > 0) & (data['法拍數(shù)量'] < 5), '法拍標簽'] = '含法拍'
    data.loc[(data['法拍數(shù)量'] >= 5) & (data['法拍數(shù)量'] < 10), '法拍標簽'] = '法拍數(shù)量多'
    data.loc[data['法拍數(shù)量'] >= 10, '法拍標簽'] = '法拍數(shù)量極多'

    # --- top / bottom 15 % labels ------------------------------------------
    for col, label_col, high_label, low_label in (
        ('容積率', '容積率標簽', '容積率高', '容積率低'),
        ('綠化率', '綠化率標簽', '綠化率高', '綠化率低'),
        ('總戶數(shù)', '小區(qū)規(guī)模標簽', '大型社區(qū)', '小型社區(qū)'),
    ):
        data[col] = pd.to_numeric(data[col], errors='coerce')
        data.loc[data[col] >= data[col].quantile(0.85), label_col] = high_label
        data.loc[data[col] <= data[col].quantile(0.15), label_col] = low_label
    # (A parking-ratio label existed here in the original but was disabled.)

    # --- building age, by build year ---------------------------------------
    data['建成年份'] = pd.to_numeric(data['建成年份'], errors='coerce')
    build_year = data['建成年份']
    data.loc[build_year >= 2015, '樓齡標簽'] = '次新房'
    data.loc[(build_year >= 2010) & (build_year < 2015), '樓齡標簽'] = '6-10年樓齡'
    data.loc[(build_year >= 2005) & (build_year < 2010), '樓齡標簽'] = '10-15年樓齡'
    data.loc[(build_year >= 2000) & (build_year < 2005), '樓齡標簽'] = '15-20年樓齡'
    data.loc[build_year < 2000, '樓齡標簽'] = '老舊小區(qū)'

    # --- top-100 developer / property manager ------------------------------
    data.loc[data['開發(fā)商'].isin(developers), '百強開發(fā)商標簽'] = '百強開發(fā)商'
    data.loc[data['物業(yè)公司'].isin(propertys), '百強物業(yè)標簽'] = '百強物業(yè)'

    # --- listing activity ---------------------------------------------------
    listings = data['掛牌數(shù)量']
    data.loc[listings >= listings.quantile(0.85), '活躍度標簽'] = '掛盤活躍'
    data.loc[listings <= listings.quantile(0.15), '活躍度標簽'] = '掛盤不活躍'

    # --- proximity: nearest-facility distance below 1000 --------------------
    for col, label_col, label in (
        ('購物中心', '購物中心標簽', '近購物中心'),
        ('三級甲等醫(yī)院', '三級甲等醫(yī)院標簽', '近三甲醫(yī)院'),
        ('政府機關(guān)', '政府機關(guān)標簽', '近政府機關(guān)'),
        ('火車站', '火車站標簽', '近火車站'),
        ('景區(qū)', '景區(qū)標簽', '近景區(qū)'),
        ('飛機場', '機場標簽', '距離機場過近'),
    ):
        data.loc[data[col] < 1000, label_col] = label
    # (Park and metro-station labels existed in the original but were disabled.)

    # --- nearby-count based labels -----------------------------------------
    data.loc[data['喪葬設(shè)施'] > 5, '喪葬設(shè)施標簽'] = '距離喪葬設(shè)施過近'
    data.loc[data['工廠'] > 3, '工廠標簽'] = '距離工廠過近'
    data.loc[data['商務(wù)寫字樓'] >= data['商務(wù)寫字樓'].quantile(0.85), '商務(wù)區(qū)標簽'] = '商務(wù)區(qū)'

    # --- combined display label --------------------------------------------
    show_cols = data.columns[data.columns.str.contains('標簽')]
    data['展示標簽'] = [
        ','.join(label for label in row if not pd.isna(label))
        for row in data[show_cols].values
    ]
    return data
def _neg_mape(estimator, X, y):
    """Scorer for GridSearchCV: negated ``mape`` so that larger is better."""
    return -mape(estimator, X, y)


def train(data):
    """Fit an XGBoost regressor on community price and score every row.

    The feature columns listed in ``use_cols`` (all but the last entry) are
    used to predict the last entry, the community price.  Hyper-parameters
    are chosen by 5-fold grid search on the rows that have a price; the
    refitted best model then predicts for every row and the prediction is
    stored in a new column of ``data``, which is returned.

    The grid search minimises MAPE.  scikit-learn treats scorer callables as
    "higher is better", so the error is negated before being handed to
    ``GridSearchCV``; passing ``mape`` directly (as the original did) selects
    the parameter set with the largest error.
    """
    use_cols = ['容積率',
                '綠化率',
                '占地面積',
                '建成年份',
                '總樓層',
                '城市中心',
                '抗跌率',
                '購物中心',
                '三級甲等醫(yī)院',
                '高等院校',
                '商務(wù)寫字樓',
                '公交車站相關(guān)',
                '掛牌數(shù)量',
                '路口名',
                '商場',
                '工廠',
                '政府機關(guān)',
                '喪葬設(shè)施',
                '體育館',
                '公園',
                '地鐵站',
                '小學',
                '幼兒園',
                '火車站',
                '小區(qū)均價']
    train_data = data[use_cols]

    # Only rows with a known price can be used for fitting.
    has_price = train_data['小區(qū)均價'].notna()
    X = train_data[has_price].iloc[:, :-1]
    y = train_data[has_price].iloc[:, -1]

    model = XGBRegressor(n_jobs=-1)
    param_grid = {'max_depth': np.arange(2, 8, 1),
                  'gamma': np.arange(0.5, 0.8, 0.1),
                  'colsample_bytree': np.arange(0.5, 0.8, 0.1)}
    gs = GridSearchCV(model, param_grid=param_grid, n_jobs=-1, verbose=1, cv=5,
                      scoring=_neg_mape)
    gs.fit(X, y)
    print(gs.best_params_)
    # best_score_ is the negated MAPE; print the MAPE itself.
    print(-gs.best_score_)

    # Predict for every row, including those without a known price.
    data['綜合評分'] = gs.predict(train_data.iloc[:, :-1])
    return data
def change_poi(poi_data):
    """Split the ``location`` column into float ``longitude``/``latitude``.

    ``location`` holds comma-separated text; the first part becomes
    ``longitude`` and the second ``latitude``.  A missing ``location`` (or a
    missing second part) yields NaN instead of raising.  ``poi_data`` is
    modified in place and returned.

    Raises
    ------
    ValueError
        If a part cannot be parsed as a number.
    """
    parts = poi_data['location'].str.split(',')
    # ``.str[i]`` picks the i-th element of each split list and gives NaN
    # where the list is missing or too short.
    poi_data['longitude'] = pd.to_numeric(parts.str[0]).astype(float)
    poi_data['latitude'] = pd.to_numeric(parts.str[1]).astype(float)
    return poi_data
def get_level(result):
    """Grade communities A-E and rescale the score columns to 5-10.

    Two grade columns are produced from quintiles (top 20 % is 'A', bottom
    20 % is 'E'): one from the overall score and one from the community
    price.  A missing price grade is replaced by the overall grade.
    Afterwards each category column and the overall score is replaced by
    evenly spaced values from 5 to 10 assigned in ascending order of the
    column, which also leaves ``result`` sorted by the last column processed.
    ``result`` is modified in place and returned.
    """

    def assign_grades(source_col, grade_col):
        # Quintile cut points of the source column.
        scores = result[source_col]
        q80, q60, q40, q20 = [scores.quantile(q) for q in (0.8, 0.6, 0.4, 0.2)]
        result.loc[scores >= q80, grade_col] = 'A'
        result.loc[(scores < q80) & (scores >= q60), grade_col] = 'B'
        result.loc[(scores < q60) & (scores >= q40), grade_col] = 'C'
        result.loc[(scores < q40) & (scores >= q20), grade_col] = 'D'
        result.loc[scores < q20, grade_col] = 'E'

    assign_grades('綜合評分', '綜合評級')
    # Community price -> community price grade.
    assign_grades('小區(qū)均價', '小區(qū)價格評級')

    # Fill missing price grades from the overall grade.
    result['小區(qū)價格評級'] = result['小區(qū)價格評級'].fillna(result['綜合評級'])

    # Rank-based rescaling of every score column.
    row_count = len(result)
    score_columns = ['樓盤品質(zhì)', '宜居程度', '區(qū)位狀況',
                     '生活配套', '活躍程度', '不利因素', '綜合評分']
    for col in score_columns:
        result.sort_values(col, inplace=True)
        result[col] = np.linspace(5, 10, row_count)
    return result
def get_risk(all_community_data):
    """Add the risk-strategy column derived from the two grade columns.

    A fixed 5x5 table maps every (price grade, overall grade) pair, both in
    'A'..'E', to a strategy text.  ``all_community_data`` is modified in
    place and returned.  A pair that is not in the table raises
    ``IndexError``.
    """
    grades = ['A', 'B', 'C', 'D', 'E']
    # One row of five strategies per price grade, ordered by overall grade.
    strategies = [
        '正常', '關(guān)注', '謹慎', '謹慎', '謹慎',
        '正常', '正常', '關(guān)注', '謹慎', '謹慎',
        '正常', '正常', '正常', '關(guān)注', '謹慎',
        '正常', '正常', '關(guān)注', '謹慎', '謹慎',
        '正常', '關(guān)注', '謹慎', '謹慎', '謹慎',
    ]
    risk_data = pd.DataFrame({
        '小區(qū)價格評級': [price_grade for price_grade in grades for _ in grades],
        '綜合評級': grades * 5,
        '策略': strategies,
    })

    def lookup(price, score):
        hit = (risk_data['小區(qū)價格評級'] == price) & (risk_data['綜合評級'] == score)
        return risk_data[hit]['策略'].values[0]

    grade_pairs = all_community_data[['小區(qū)價格評級', '綜合評級']].values
    all_community_data['風險策略'] = [lookup(price, score) for price, score in grade_pairs]
    return all_community_data
def high_quality_community(data):
    """Flag high-quality communities in a new column and return ``data``.

    A row is flagged when it passes every filter below, applied in order:
    overall grade in A-C, price grade in A-D, an accepted risk strategy,
    listing count above the 1 % quantile of the positive listing counts of
    the rows that passed the grade/strategy filters, the accepted housing
    type, build year after 2000, absolute price-change rate within
    [0, 0.05], and no judicial auctions.  Unflagged rows keep NaN.
    """
    shortlist = data.copy()
    shortlist = shortlist[shortlist['綜合評級'].isin(['C', 'B', 'A'])]
    shortlist = shortlist[shortlist['小區(qū)價格評級'].isin(['A', 'B', 'C', 'D'])]
    shortlist = shortlist[shortlist['風險策略'].isin(['正常', '關(guān)注'])]

    # Listing-count threshold, taken from the rows that are still candidates.
    listed = shortlist[shortlist['掛牌數(shù)量'] > 0]
    stop_n = listed['掛牌數(shù)量'].quantile(0.01)

    up, down = 0.05, 0
    remaining_filters = (
        lambda d: d['掛牌數(shù)量'] > stop_n,
        lambda d: d['房屋類型(小類)'] == '普通住宅',
        lambda d: d['建成年份'] > 2000,
        lambda d: (np.abs(d['抗跌率']) >= down) & (np.abs(d['抗跌率']) <= up),
        lambda d: d['法拍數(shù)量'] == 0,
    )
    # Apply one filter at a time so each sees only the surviving rows.
    for keep_rows in remaining_filters:
        shortlist = shortlist[keep_rows(shortlist)]

    data.loc[shortlist.index, '優(yōu)質(zhì)小區(qū)'] = '是'
    return data
def main():
    """Run the whole pipeline for ``city`` and write the two Excel reports."""
    print('城市: ', city)
    run_started = time.time()

    # POI data with parsed coordinates.
    poi_data = change_poi(poi)

    # Community list; rows without a longitude are dropped in place.
    communities = community
    communities.dropna(subset=['高德經(jīng)度'], inplace=True)

    # POI features: nearest distances, then nearby counts.
    stage_started = time.time()
    communities = get_turth_distance(poi_data, communities)
    communities = get_turth_number(poi_data, communities)
    print('poi數(shù)據(jù)運行時間:', time.time() - stage_started)

    # Case (listing / transaction) features.
    stage_started = time.time()
    case_value = CaseValue(city)
    communities = pd.merge(communities, case_value.result, on='樓盤ID', how='left')
    print('案例數(shù)據(jù)運行時間:', time.time() - stage_started)

    # Labels.
    stage_started = time.time()
    communities = get_label(communities)
    print('出標簽時間:', time.time() - stage_started)

    # Fill gaps in the community attributes.  The elapsed time since the
    # start of the run is reported here, before scoring and training.
    communities = community_data(communities)
    print('全部運行時間:', time.time() - run_started)

    # Raw values -> scores, model training, grading, risk strategy and
    # high-quality flag.
    communities = value2score(communities)
    communities = train(communities)
    result = get_level(communities)
    result = get_risk(result)
    result = high_quality_community(result)

    # Full profile.
    result.to_excel('小區(qū)畫像_%s.xlsx' % city, index=False)

    # Rating summary restricted to the report columns.
    result_cols = [
        '省份', '城市', '城市代號', '行政區(qū)', '行政區(qū)代號', '樓盤名稱', '樓盤ID', '地址',
        '小區(qū)均價', '小區(qū)價格評級', '綜合評級', '綜合評分', '風險策略', '展示標簽', '樓盤品質(zhì)',
        '宜居程度', '區(qū)位狀況', '生活配套', '活躍程度', '不利因素', '優(yōu)質(zhì)小區(qū)']
    result = result[result_cols]
    result.to_excel('小區(qū)評級結(jié)果表_%s.xlsx' % city, index=False)


if __name__ == '__main__':
    main()
# 2020-01-17 - linzilu248
# Last edited: (no date recorded)
# © Copyright belongs to the author; for reprints or content cooperation, please contact the author.
- 文/潘曉璐 我一進店門,熙熙樓的掌柜王于貴愁眉苦臉地迎上來粗梭,“玉大人争便,你說我怎么就攤上這事《弦剑” “怎么了滞乙?”我有些...
- 文/不壞的土叔 我叫張陵,是天一觀的道長孩锡。 經(jīng)常有香客問我酷宵,道長,這世上最難降的妖魔是什么躬窜? 我笑而不...
- 正文 為了忘掉前任浇垦,我火速辦了婚禮,結(jié)果婚禮上荣挨,老公的妹妹穿的比我還像新娘男韧。我一直安慰自己,他們只是感情好默垄,可當我...
- 文/花漫 我一把揭開白布此虑。 她就那樣靜靜地躺著,像睡著了一般口锭。 火紅的嫁衣襯著肌膚如雪朦前。 梳的紋絲不亂的頭發(fā)上介杆,一...
- 文/蒼蘭香墨 我猛地睜開眼晶渠,長吁一口氣:“原來是場噩夢啊……” “哼凰荚!你這毒婦竟也來了?” 一聲冷哼從身側(cè)響起褒脯,我...
- 正文 年R本政府宣布,位于F島的核電站硫痰,受9級特大地震影響衩婚,放射性物質(zhì)發(fā)生泄漏。R本人自食惡果不足惜效斑,卻給世界環(huán)境...
- 文/蒙蒙 一非春、第九天 我趴在偏房一處隱蔽的房頂上張望。 院中可真熱鬧缓屠,春花似錦奇昙、人聲如沸。這莊子的主人今日做“春日...
- 文/蒼蘭香墨 我抬頭看了看天上的太陽。三九已至滨溉,卻和暖如春什湘,著一層夾襖步出監(jiān)牢的瞬間长赞,已是汗流浹背。 一陣腳步聲響...