Python實戰計劃：爬取租房信息

Date:2016-9-21
update:2016-9-30
By:Black Crow

前言：

終于進入到網絡頁面的抓取了。前面一節課靜態頁面的作業做了之后總是有報錯，所以一直沒有單獨寫總結。聽課的時候就感覺到內容十分的吸引人，爬取的過程也是特別有意思，后面一節課關于select的條件上是有做優化的，比前一節課更高效。PPT里的地址已失效，所以隨便設定條件搜的短租房信息。

作業(yè)效果：

看著信息滾動的感覺其實挺爽的

房租信息.gif

20160921爬取的excel表格：鏈接: http://pan.baidu.com/s/1nvEVDvN 密碼: j4vt
20160922update表格：鏈接: http://pan.baidu.com/s/1c198fN6 密碼: kq4a
20160922update圖片：

各區女房東占多數.png

東城均價最高，通州均價最低.png

女房東的房子均價要高.png

我的代碼：

20160921代碼

from bs4 import BeautifulSoup
import requests
import time
def gender_change(gender_lorder):
    """Translate a landlord gender-icon CSS class into a gender label.

    Args:
        gender_lorder: CSS class name taken from the gender icon element.

    Returns:
        'girl' or 'boy' for the recognised class names, otherwise
        'unknown gender!'.
    """
    # The original test `x == 'member_girl_ico' or 'member_ico1'` was always
    # truthy (a non-empty string literal is true), so every input mapped to
    # 'girl'. Membership tests express the intended "either of these names".
    # NOTE(review): the alternative class names 'member_ico1'/'member_ico'
    # are carried over from the original condition; confirm against the page.
    if gender_lorder in ('member_girl_ico', 'member_ico1'):
        return 'girl'
    if gender_lorder in ('member_boy_ico', 'member_ico'):
        return 'boy'
    return 'unknown gender!'
def info(url):
    """Download one detail page and print a record for each listing on it.

    The page is fetched with ``requests.get`` and parsed with BeautifulSoup
    using the ``lxml`` parser. Seven CSS selections are made and combined
    position by position; each combination is printed as a dict.

    Args:
        url: Address of the detail page to download.

    Returns:
        None. Records are written to stdout via ``print``.
    """
    info_data = requests.get(url)
    info_soup = BeautifulSoup(info_data.text, 'lxml')

    titles = info_soup.select('div.con_l > div.pho_info > h4 > em')
    addresses = info_soup.select('div.pho_info > p')
    images_house = info_soup.select('img[id="curBigImage"]')
    days_fee = info_soup.select('div.day_l > span')
    urls_lorder = info_soup.select('div.member_pic > a > img')
    names_lorder = info_soup.select('div.w_240 > h6 > a')
    genders_lorder = info_soup.select('div.w_240 > h6 > span')

    # zip() stops at the shortest list, so a page on which any one of the
    # selectors matches nothing produces no record at all.
    for (title, address, image_house, day_fee,
         url_lorder, name_lorder, gender_lorder) in zip(
            titles, addresses, images_house, days_fee,
            urls_lorder, names_lorder, genders_lorder):
        data = {
            # get_text()'s first argument is the separator placed between
            # text fragments; the original passed 'em' there, which would
            # inject the letters "em" into multi-fragment titles.
            'title': title.get_text(),
            'address': address.get('title'),
            'image_house': image_house.get('src'),
            'url_lorder': url_lorder.get('src'),
            'name_lorder': name_lorder.get_text(),
            # 'class' is returned as a list of class names; the first one
            # is the value handed to gender_change().
            'gender_lorder': gender_change(str(gender_lorder.get('class')[0])),
            'day_fee': day_fee.get_text(),
        }
        print(data)
        # Short pause after each record.
        # NOTE(review): the original indentation was lost, so whether this
        # sleep sat inside or after the loop is reconstructed - confirm.
        time.sleep(0.01)
# Search-result page URLs: the page number is substituted for `{}` and
# runs over 0..14.
house_urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/?startDate=2016-09-21&endDate=2016-10-01'.format(str(i)) for i in range(0,15,1)]

# Walk every search-result page, collect the detail-page links on it and
# hand each link to info(). Indentation restored: the original nesting had
# been stripped, leaving the loops unparseable.
for house_url in house_urls:
    wb_data = requests.get(house_url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    detail_urls = soup.select('a[class="resule_img_a"]')
    for detail_url in detail_urls:
        house_data = detail_url.get('href')
        info(house_data)
print('Done')


##### 20160922update代碼：修正了性別判斷
>```
from bs4 import BeautifulSoup
import requests
import time
def gender_change(gender_lorder):
    """Return the gender label for a landlord gender-icon CSS class.

    'member_girl_ico' maps to 'girl' and 'member_boy_ico' maps to 'boy';
    every other value yields 'unknown gender!'.
    """
    known_icons = (
        ('member_girl_ico', 'girl'),
        ('member_boy_ico', 'boy'),
    )
    for css_class, label in known_icons:
        if gender_lorder == css_class:
            return label
    return 'unknown gender!'
def info(url):
    """Download one detail page and print a record for each listing on it.

    The page is fetched with ``requests.get`` and parsed with BeautifulSoup
    using the ``lxml`` parser. Seven CSS selections are made and combined
    position by position; each combination is printed as a dict.

    Args:
        url: Address of the detail page to download.

    Returns:
        None. Records are written to stdout via ``print``.
    """
    info_data = requests.get(url)
    info_soup = BeautifulSoup(info_data.text, 'lxml')

    titles = info_soup.select('div.con_l > div.pho_info > h4 > em')
    addresses = info_soup.select('div.pho_info > p')
    images_house = info_soup.select('img[id="curBigImage"]')
    days_fee = info_soup.select('div.day_l > span')
    urls_lorder = info_soup.select('div.member_pic > a > img')
    names_lorder = info_soup.select('div.w_240 > h6 > a')
    genders_lorder = info_soup.select('div.w_240 > h6 > span')

    # zip() stops at the shortest list, so a page on which any one of the
    # selectors matches nothing produces no record at all.
    for (title, address, image_house, day_fee,
         url_lorder, name_lorder, gender_lorder) in zip(
            titles, addresses, images_house, days_fee,
            urls_lorder, names_lorder, genders_lorder):
        data = {
            # get_text()'s first argument is the separator placed between
            # text fragments; the original passed 'em' there, which would
            # inject the letters "em" into multi-fragment titles.
            'title': title.get_text(),
            'address': address.get('title'),
            'image_house': image_house.get('src'),
            'url_lorder': url_lorder.get('src'),
            'name_lorder': name_lorder.get_text(),
            # 'class' is returned as a list of class names; the first one
            # is the value handed to gender_change().
            'gender_lorder': gender_change(gender_lorder.get('class')[0]),
            'day_fee': day_fee.get_text(),
        }
        print(data)
# Search-result page URLs for page numbers 0 through 14.
house_urls = [
    'http://bj.xiaozhu.com/search-duanzufang-p{}-0/?startDate=2016-09-21&endDate=2016-10-01'.format(page)
    for page in range(15)
]

# For every search-result page, pull out the detail-page links and pass
# each one to info().
for house_url in house_urls:
    wb_data = requests.get(house_url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    detail_urls = soup.select('a[class="resule_img_a"]')
    for detail_url in detail_urls:
        house_data = detail_url.get('href')
        info(house_data)

print('Done')

20160924update:性別表述修改為male和female;去除標題中的換行符，避免影響數據處理；增加寫入本地文件；增加計數項，避免爬取過程中無聊。

from bs4 import BeautifulSoup
import requests
import time
def gender_change(gender_lorder):
    """Translate a landlord gender-icon CSS class into a gender label.

    Args:
        gender_lorder: CSS class name taken from the gender icon element.

    Returns:
        'female' for 'member_girl_ico', 'male' for 'member_boy_ico',
        and 'unknown gender' for anything else.
    """
    # Indentation restored: the original body had lost its nesting and
    # could not be parsed.
    if gender_lorder == 'member_girl_ico':
        return 'female'
    if gender_lorder == 'member_boy_ico':
        return 'male'
    return 'unknown gender'
def counter(last=[0]):
    """Increment a running count and return the new value.

    Args:
        last: One-element list holding the current count. The mutable
            default is intentional here: the same list object is reused
            across calls, so calling ``counter()`` with no argument keeps
            a running total for the lifetime of the process.

    Returns:
        The count after incrementing, i.e. how many times this counter
        has been advanced.
    """
    # Update the stored value in place; the original went through a local
    # named `next`, which shadowed the builtin of the same name.
    last[0] += 1
    return last[0]
def info(url):
    """Download one detail page and append each listing on it to a text file.

    The page is fetched with ``requests.get`` and parsed with BeautifulSoup
    using the ``lxml`` parser. Seven CSS selections are made and combined
    position by position; each combination becomes one ';'-separated line
    appended to the output file, after which the running record count is
    printed.

    Args:
        url: Address of the detail page to download.

    Returns:
        None.
    """
    info_data = requests.get(url)
    info_soup = BeautifulSoup(info_data.text, 'lxml')

    titles = info_soup.select('h4 em')
    addresses = info_soup.select('div.pho_info > p')
    images_house = info_soup.select('img[id="curBigImage"]')
    days_fee = info_soup.select('div.day_l > span')
    urls_lorder = info_soup.select('div.member_pic > a > img')
    names_lorder = info_soup.select('div.w_240 > h6 > a')
    genders_lorder = info_soup.select('div.w_240 > h6 > span')

    # The original `for ... in zip(...)` header was split over three lines
    # with no continuation, which is a syntax error; parentheses keep the
    # wrapped form legal. zip() stops at the shortest list, so a page on
    # which any selector matches nothing produces no record.
    for (title, address, image_house, day_fee,
         url_lorder, name_lorder, gender_lorder) in zip(
            titles, addresses, images_house, days_fee,
            urls_lorder, names_lorder, genders_lorder):
        data = {
            'title': title.get_text(),
            'address': address.get('title'),
            'image_house': image_house.get('src'),
            'url_lorder': url_lorder.get('src'),
            'name_lorder': name_lorder.get_text(),
            # 'class' is returned as a list of class names; the first one
            # is the value handed to gender_change().
            'gender_lorder': gender_change(str(gender_lorder.get('class')[0])),
            'day_fee': day_fee.get_text(),
        }

        # Field order of one output line. Titles contain line breaks, so
        # they are stripped to keep every record on a single line.
        fields = (
            data['title'].replace("\n", ''),
            data['day_fee'],
            data['address'],
            data['image_house'],
            data['name_lorder'],
            data['gender_lorder'],
            data['url_lorder'],
        )
        # The original built this string with lines starting with `+` and
        # no continuation, which does not parse; join() produces the same
        # ';'-separated text.
        file_content = ';'.join(fields) + '\n'

        # Adjust the path to the local environment. Mode 'a' appends;
        # mode 'w' would truncate the file on every record.
        with open('F://python/2/xiaozhu_data.txt', 'a', encoding='utf-8') as out_file:
            out_file.write(file_content)

        # Progress indicator: print how many records have been written.
        print(counter())
        # Short pause after each record.
        # NOTE(review): the original indentation was lost, so whether the
        # counter print and sleep sat inside or after the loop is
        # reconstructed - confirm.
        time.sleep(0.01)
# Search-result page URLs: the two adjacent string literals are joined
# before .format() runs, and the page number runs over 0..14.
house_urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/?startDate=2016-09-21'
              '&endDate=2016-10-01'.format(str(i)) for i in range(0,15,1)]

# Walk every search-result page, collect the detail-page links on it and
# hand each link to info(). Indentation restored: the original nesting had
# been stripped, leaving the loops unparseable.
for house_url in house_urls:
    wb_data = requests.get(house_url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    detail_urls = soup.select('a[class="resule_img_a"]')
    for detail_url in detail_urls:
        house_data = detail_url.get('href')
        info(house_data)
print('Done')

#### 總結：
>1. 該網站也設置了反爬措施，房屋圖片及房東圖片都采用了障眼法（假src，點擊鏈接后圖片下載了但是打不開），但是目前技術有限，繞不過去，只能是暫時擱置了。(update20160930圖片的地址是真實的，但是因為我默認瀏覽器為chrome，打開鏈接就直接下載了圖片，圖片無法打開，在該鏈接復制進IE瀏覽器后，發現原來可以顯示。霧~~~)
![1.png](http://upload-images.jianshu.io/upload_images/1059649-5555e1182aab31d6.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
2. 本來打算將爬取的數據寫入文檔，但實驗了多次后發現dict的轉換寫入方法還沒掌握，這個后續打算問問老師怎么處理比較妥當；此次作業的表格是采用傻瓜式處理的，復制粘貼到excel，然后用excel分列處理的。大致看了下短租房日租金以128-499區間的房屋最多，地址沒細作研究，但是覺得可以在excel里用地圖展現一下。（dict里的內容打印存儲到本地的坑已經填上，20160924update）
3. 代碼寫的時候是先寫的單個頁面的解析，后來寫的是房屋鏈接的采集，兩段代碼合并時稍微做了調整。
4. 速度有些慢，不知道是代碼原因還是本身數據爬取過程就比較慢的原因。sleep的時間還是設定了，比較短，以防萬一。
5. 性別一項抓取的數據都是girl,估計還是有問題，還沒有一個個細看是不是真的如此，但直覺是女性確實比較多。（此項已經修正，20160924update）

最后編輯于：2017.12.04 03:52:30

©著作權歸作者所有，轉載或內容合作請聯系作者

人面猴
序言：七十年代末各拷，一起剝皮案震驚了整個濱河市刁绒，隨后出現(xiàn)的幾起案子，更是在濱河造成了極大的恐慌烤黍，老刑警劉巖知市，帶你破解...
沈念sama閱讀 206,214評論 6贊 481
死咒
序言：濱河連續(xù)發(fā)生了三起死亡事件，死亡現(xiàn)場離奇詭異蚊荣，居然都是意外死亡初狰，警方通過查閱死者的電腦和手機，發(fā)現(xiàn)死者居然都...
沈念sama閱讀 88,307評論 2贊 382
救了他兩次的神仙讓他今天三更去死
文/潘曉璐我一進店門互例，熙熙樓的掌柜王于貴愁眉苦臉地迎上來奢入，“玉大人，你說我怎么就攤上這事⌒裙猓” “怎么了关顷？”我有些...
開封第一講書人閱讀 152,543評論 0贊 341
道士緝兇錄：失蹤的賣姜人
文/不壞的土叔我叫張陵，是天一觀的道長武福。經(jīng)常有香客問我议双，道長，這世上最難降的妖魔是什么捉片？我笑而不...
開封第一講書人閱讀 55,221評論 1贊 279
?港島之戀（遺憾婚禮）
正文為了忘掉前任平痰，我火速辦了婚禮，結(jié)果婚禮上伍纫，老公的妹妹穿的比我還像新娘宗雇。我一直安慰自己，他們只是感情好莹规，可當我...
茶點故事閱讀 64,224評論 5贊 371
惡毒庶女頂嫁案：這布局不是一般人想出來的
文/花漫我一把揭開白布赔蒲。她就那樣靜靜地躺著，像睡著了一般良漱。火紅的嫁衣襯著肌膚如雪舞虱。梳的紋絲不亂的頭發(fā)上，一...
開封第一講書人閱讀 49,007評論 1贊 284
城市分裂傳說
那天母市，我揣著相機與錄音矾兜，去河邊找鬼。笑死窒篱，一個胖子當著我的面吹牛焕刮，可吹牛的內(nèi)容都是我干的。我是一名探鬼主播墙杯，決...
沈念sama閱讀 38,313評論 3贊 399
雙鴛鴦連環(huán)套：你想象不到人心有多黑
文/蒼蘭香墨我猛地睜開眼配并，長吁一口氣：“原來是場噩夢啊……” “哼！你這毒婦竟也來了高镐？” 一聲冷哼從身側(cè)響起溉旋，我...
開封第一講書人閱讀 36,956評論 0贊 259
萬榮殺人案實錄
序言：老撾萬榮一對情侶失蹤，失蹤者是張志新（化名）和其女友劉穎嫉髓，沒想到半個月后观腊，有當?shù)厝嗽跇淞掷锇l(fā)現(xiàn)了一具尸體，經(jīng)...
沈念sama閱讀 43,441評論 1贊 300
?護林員之死
正文獨居荒郊野嶺守林人離奇死亡算行，尸身上長有42處帶血的膿包…… 初始之章·張勛以下內(nèi)容為張勛視角年9月15日...
茶點故事閱讀 35,925評論 2贊 323
?白月光啟示錄
正文我和宋清朗相戀三年梧油，在試婚紗的時候發(fā)現(xiàn)自己被綠了。大學(xué)時的朋友給我發(fā)了我未婚夫和他白月光在一起吃飯的照片州邢。...
茶點故事閱讀 38,018評論 1贊 333
活死人
序言：一個原本活蹦亂跳的男人離奇死亡儡陨，死狀恐怖，靈堂內(nèi)的尸體忽然破棺而出，到底是詐尸還是另有隱情骗村，我是刑警寧澤嫌褪，帶...
沈念sama閱讀 33,685評論 4贊 322
?日本核電站爆炸內(nèi)幕
正文年R本政府宣布，位于F島的核電站胚股，受9級特大地震影響笼痛，放射性物質(zhì)發(fā)生泄漏。R本人自食惡果不足惜琅拌，卻給世界環(huán)境...
茶點故事閱讀 39,234評論 3贊 307
男人毒藥：我在死后第九天來索命
文/蒙蒙一缨伊、第九天我趴在偏房一處隱蔽的房頂上張望。院中可真熱鬧财忽，春花似錦倘核、人聲如沸。這莊子的主人今日做“春日...
開封第一講書人閱讀 30,240評論 0贊 19
一樁弒父案，背后竟有這般陰謀
文/蒼蘭香墨我抬頭看了看天上的太陽活尊。三九已至隶校，卻和暖如春，著一層夾襖步出監(jiān)牢的瞬間蛹锰，已是汗流浹背深胳。一陣腳步聲響...
開封第一講書人閱讀 31,464評論 1贊 261
情欲美人皮
我被黑心中介騙來泰國打工，沒想到剛下飛機就差點兒被人妖公主榨干…… 1. 我叫王不留铜犬，地道東北人舞终。一個月前我還...
沈念sama閱讀 45,467評論 2贊 352
代替公主和親
正文我出身青樓，卻偏偏與公主長得像癣猾，于是被迫代替她去往敵國和親敛劝。傳聞我的和親對象是個殘疾皇子，可洞房花燭夜當晚...
茶點故事閱讀 42,762評論 2贊 345

Python實戰計劃：爬取租房信息

前言：

作業(yè)效果：

我的代碼：

20160921代碼

20160924update:性別表述修改為male和female;去除標題中的換行符，避免影響數據處理；增加寫入本地文件；增加計數項，避免爬取過程中無聊。

推薦閱讀更多精彩內(nèi)容