1. 引言
爬取租房信息,具體要求如下
Paste_Image.png
2. 分析
- 網頁地址: http://bj.xiaozhu.com/search-duanzufang-p2-0/, 頁數增加時 `p` 後的數字隨之增加
- 每個導航頁面有多個詳情頁的 link
- 男女房東性別判斷, 通過有無 `member_girl_ico` 來區分
- 從詳情頁中抓取的第一張房屋圖片發現打開無效, 所以直接從導航頁中抓取第一張圖片
3. 開工
# vim spider_xiaozhu.py //新建文件
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
# Scrape the rental details from a single detail page.
def get_data_member(sub_url, preview_img):
    """Scrape one rental detail page and return its listing record.

    Args:
        sub_url: URL of the detail page to download.
        preview_img: Image URL taken from the listing page; it is stored
            unchanged under the ``house_img`` key.

    Returns:
        A dict with the keys ``info_web``, ``house_title``, ``house_addr``,
        ``house_img``, ``house_price``, ``landlord_jpg``, ``landlord_name``
        and ``landlord_gender``, or ``None`` when the page lacks any of the
        expected elements.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            times out.
    """
    # Bound the wait so one unresponsive page cannot stall the whole crawl.
    sub_data = requests.get(sub_url, timeout=10)
    sub_soup = BeautifulSoup(sub_data.text, 'html5lib')
    house_titles = sub_soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
    house_addrs = sub_soup.select("body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p")
    house_prices = sub_soup.select('body > div > div > #floatRightBox > #pricePart > div.day_l > span')
    landlord_jpgs = sub_soup.select('body > div > div > #floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
    landlord_names = sub_soup.select('body > div > div > #floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
    landlord_genders = sub_soup.select('body > div > div > #floatRightBox > div.js_box.clearfix > div.w_240 > h6 > span')

    # Stays None when zip() yields nothing, i.e. at least one selector
    # matched no element on this page.
    data = None
    for house_title, house_addr, house_price, landlord_jpg, landlord_name, landlord_gender in zip(
            house_titles, house_addrs, house_prices, landlord_jpgs, landlord_names, landlord_genders):
        # The class attribute may be absent; treat that as "no girl icon".
        gender_classes = landlord_gender.get('class') or []
        data = {
            'info_web': sub_url,
            'house_title': house_title.get_text(),
            'house_addr': house_addr.get('title'),
            'house_img': preview_img,
            'house_price': house_price.get_text(),
            'landlord_jpg': landlord_jpg.get('src'),
            'landlord_name': landlord_name.get_text(),
            # The member_girl_ico class marks a female landlord ('MM');
            # anything else is reported as 'FM'.
            'landlord_gender': 'MM' if 'member_girl_ico' in gender_classes else 'FM',
        }

    if data is None:
        return None
    # Print the record so progress is visible while the crawl runs.
    print(data)
    return data
# Disabled alternative, kept inside a bare string literal so it never runs:
# it would collect the detail-page links of every listing page up front.
# NOTE(review): the parameter is named `url`, but the loop iterates the
# module-level `urls`; reconcile the two before re-enabling this function.
'''
# 定義獲取全部詳情頁鏈接函數(shù),也就是先取得全部的詳情頁鏈接
def get_url_list(url):
url_all = []
for ur in urls:
wb_data = requests.get(ur)
soup = BeautifulSoup(wb_data.text, 'html5lib')
url_total = soup.find_all('a', class_="resule_img_a")
for u in url_total:
sub_url = u.get('href')
url_all.append(sub_url)
return url_all
'''
# Scrape every detail page linked from one listing (navigation) page.
def get_data_list(url):
    """Scrape all listings linked from one search-result page.

    Args:
        url: URL of the search-result (navigation) page.

    Returns:
        A list holding the record returned by ``get_data_member`` for each
        listing anchor, in page order. Anchors without an ``href`` and
        detail pages that yield no record are skipped.

    Raises:
        requests.exceptions.RequestException: If a download fails or times
            out.
    """
    # Bound the wait so one unresponsive page cannot stall the whole crawl.
    wb_data = requests.get(url, timeout=10)
    soup = BeautifulSoup(wb_data.text, 'html5lib')

    d_list = []
    for u in soup.select('#page_list > ul > li > a'):
        link = u.get('href')
        if not link:
            continue
        # The preview image comes from the listing page and is handed to
        # get_data_member; find() returns None when the anchor has no <img>.
        img = u.find('img')
        image = img.get('lazy_src') if img is not None else None
        datas = get_data_member(link, image)
        # Keep only real records so callers can count and iterate safely.
        if datas:
            d_list.append(datas)
    return d_list
# Search-result pages to crawl. The upper bound of range() controls how many
# pages are fetched; nothing checks that a page actually exists, so keep the
# bound modest.
urls = list(map("http://bj.xiaozhu.com/search-duanzufang-p{}-0/".format, range(1, 14)))
# Disabled alternative: gather every detail link up front with
# get_url_list(urls) instead of scraping page by page.

# Accumulates the records scraped from all detail pages.
data_list = []
# Walk the listing pages one at a time.
for url_member in urls:
    # get_data_list returns a list, so its items are added wholesale.
    data_list.extend(get_data_list(url_member))
# Report how many records were collected.
print(len(data_list))
運行結果如下:
# python3 spider_xiaozhu.py // 運行文件, 部分結果如下
{'house_title': '望京華彩十四號(hào)線精美豪華大一居', 'house_img': 'http://image.xiaozhustatic1.com/12/6,0,39,2965,1800,1200,f17d1a3e.jpg', 'landlord_gender': 'FM', 'landlord_name': '想要', 'house_price': '395', 'landlord_jpg': 'http://image.xiaozhustatic1.com/21/5,0,44,1477,329,329,ea609ac8.jpg', 'house_addr': '北京市朝陽區(qū)望京利澤西園', 'info_web': 'http://bj.xiaozhu.com/fangzi/3213812130.html'}
{'house_title': '積水潭地鐵近后海北師大西直門溫馨一居整租', 'house_img': 'http://image.xiaozhustatic1.com/12/6,0,64,3967,1800,1200,fb028f16.jpg', 'landlord_gender': 'MM', 'landlord_name': '天天Tinny', 'house_price': '298', 'landlord_jpg': 'http://image.xiaozhustatic1.com/21/5,0,36,1268,375,376,560d399f.jpg', 'house_addr': '北京市西城區(qū)志強(qiáng)南園9號(hào)樓', 'info_web': 'http://bj.xiaozhu.com/fangzi/2704933663.html'}
{'house_title': '國貿(mào)CBD 百子灣 1號(hào)線四惠地鐵站8分鐘', 'house_img': 'http://image.xiaozhustatic1.com/12/6,0,18,2496,1800,1200,4bc1de28.jpg', 'landlord_gender': 'MM', 'landlord_name': '不勤', 'house_price': '425', 'landlord_jpg': 'http://image.xiaozhustatic1.com/21/5,0,10,2403,363,364,d729c792.jpg', 'house_addr': '北京市朝陽區(qū)朝陽路八里莊十里堡', 'info_web': 'http://bj.xiaozhu.com/fangzi/2952536363.html'}
{'house_title': '獨立衛浴鄰798、望京、酒仙橋更多優惠房源。', 'house_img': 'http://image.xiaozhustatic1.com/12/2,0,71,458,1800,1200,a9c5ea82.jpg', 'landlord_gender': 'FM', 'landlord_name': '暖陽洋Sunny', 'house_price': '268', 'landlord_jpg': 'http://image.xiaozhustatic1.com/21/2,0,86,206,375,375,d46c51ef.jpg', 'house_addr': '北京市朝陽區彩虹路', 'info_web': 'http://bj.xiaozhu.com/fangzi/860516339.html'}
{'house_title': '望京商圈,毗鄰地鐵5分鐘,漫威主題大兩居', 'house_img': 'http://image.xiaozhustatic1.com/12/6,0,66,803,1800,1200,38a4c686.jpg', 'landlord_gender': 'FM', 'landlord_name': '想要', 'house_price': '395', 'landlord_jpg': 'http://image.xiaozhustatic1.com/21/5,0,44,1477,329,329,ea609ac8.jpg', 'house_addr': '北京市朝陽區廣順北大街利澤西園', 'info_web': 'http://bj.xiaozhu.com/fangzi/2896441162.html'}
{'house_title': '宋家莊0距離地鐵5、10號線 ,拎包入住', 'house_img': 'http://image.xiaozhustatic1.com/12/5,0,94,2458,1800,1200,5648e989.jpg', 'landlord_gender': 'MM', 'landlord_name': 'sara房', 'house_price': '308', 'landlord_jpg': 'http://image.xiaozhustatic1.com/21/5,0,91,1393,375,376,e2190513.jpg', 'house_addr': '北京市豐臺區宋家莊萬科紅', 'info_web': 'http://bj.xiaozhu.com/fangzi/2652262063.html'}
{'house_title': '天壇前門地鐵5,10號(hào)線大型交通樞紐暖心公寓', 'house_img': 'http://image.xiaozhustatic1.com/12/3,0,44,3940,1800,1200,2ff6a063.jpg', 'landlord_gender': 'MM', 'landlord_name': '暖心大姐', 'house_price': '358', 'landlord_jpg': 'http://image.xiaozhustatic1.com/21/6,0,42,4085,260,260,4faad403.jpg', 'house_addr': '北京市豐臺(tái)區(qū)萬科紅', 'info_web': 'http://bj.xiaozhu.com/fangzi/1603871035.html'}
{'house_title': '國貿(mào)雙井10號(hào)線蘋果酒店式公寓', 'house_img': 'http://image.xiaozhustatic1.com/12/6,0,25,3184,1800,1200,4b993d38.jpg', 'landlord_gender': 'MM', 'landlord_name': '陽光艷艷', 'house_price': '398', 'landlord_jpg': 'http://image.xiaozhustatic1.com/21/5,0,59,2841,363,363,8b6cf3d7.jpg', 'house_addr': '北京市朝陽區(qū)蘋果社區(qū)北區(qū)', 'info_web': 'http://bj.xiaozhu.com/fangzi/2803985763.html'}
{'house_title': '北三環(huán)健德門10號(hào)線陽光充沛整租公寓', 'house_img': 'http://image.xiaozhustatic1.com/12/6,0,68,2767,1800,1200,a419ba8c.jpg', 'landlord_gender': 'FM', 'landlord_name': '一塊萌萌的五花肉', 'house_price': '338', 'landlord_jpg': 'http://image.xiaozhustatic1.com/21/6,0,13,3403,375,376,da442c27.jpg', 'house_addr': '北京市朝陽區(qū)祈家豁子8號(hào) 健翔大廈(健翔公寓)', 'info_web': 'http://bj.xiaozhu.com/fangzi/3063899729.html'}
{'house_title': '近鄰北京西站 南站 307醫(yī)院舒適大兩居室', 'house_img': 'http://image.xiaozhustatic1.com/12/2,0,27,3758,1800,1200,32cec838.jpg', 'landlord_gender': 'FM', 'landlord_name': '小房東東', 'house_price': '428', 'landlord_jpg': 'http://image.xiaozhustatic1.com/21/3,0,32,1155,260,260,7cd22180.jpg', 'house_addr': '北京市豐臺(tái)區(qū)豐臺(tái)泥洼路', 'info_web': 'http://bj.xiaozhu.com/fangzi/1185941235.html'}
{'house_title': '南鑼鼓巷、故宮、簋街、雍和宮清新雙人房', 'house_img': 'http://image.xiaozhustatic1.com/12/1,0,77,2314,825,550,8cfcf835.jpg', 'landlord_gender': 'MM', 'landlord_name': '小肖肖', 'house_price': '218', 'landlord_jpg': 'http://image.xiaozhustatic1.com/21/5,0,55,1517,260,260,ea96ce11.jpg', 'house_addr': '北京市東城區南鑼鼓巷', 'info_web': 'http://bj.xiaozhu.com/fangzi/357151300.html'}
{'house_title': '東三環(huán)潘家園十里河地鐵10-14號(hào)線臨近國貿(mào)', 'house_img': 'http://image.xiaozhustatic1.com/12/3,0,99,2623,1800,1200,a7ad6184.jpg', 'landlord_gender': 'MM', 'landlord_name': '蘭絮的家', 'house_price': '328', 'landlord_jpg': 'http://image.xiaozhustatic1.com/21/4,0,91,8102,329,329,2583e316.jpg', 'house_addr': '北京市朝陽區(qū)十里河橋西宏善家園', 'info_web': 'http://bj.xiaozhu.com/fangzi/1370027235.html'}
{'house_title': '地鐵6號(hào)線常營單間溫馨公寓愛豬我家', 'house_img': 'http://image.xiaozhustatic1.com/12/4,0,13,6807,1800,1200,4f508533.jpg', 'landlord_gender': 'MM', 'landlord_name': '愛豬我家萍姐', 'house_price': '197', 'landlord_jpg': 'http://image.xiaozhustatic1.com/21/3,0,95,5809,359,359,9c0c2802.jpg', 'house_addr': '北京市朝陽區(qū)常營富力陽光美園', 'info_web': 'http://bj.xiaozhu.com/fangzi/1831963835.html'}
{'house_title': '國貿(mào)CBD四惠十里堡華堂商場(chǎng) 現(xiàn)代田園四人房', 'house_img': 'http://image.xiaozhustatic1.com/12/5,0,64,2493,1800,1200,49dbbf30.jpg', 'landlord_gender': 'MM', 'landlord_name': '不勤', 'house_price': '556', 'landlord_jpg': 'http://image.xiaozhustatic1.com/21/5,0,10,2403,363,364,d729c792.jpg', 'house_addr': '北京市朝陽區(qū)八里莊十里堡', 'info_web': 'http://bj.xiaozhu.com/fangzi/2627680763.html'}
{'house_title': '地鐵5號(hào)線10號(hào)線宋家莊大型交通樞紐時(shí)尚公寓', 'house_img': 'http://image.xiaozhustatic1.com/12/3,0,85,5997,1800,1200,56cd3eea.jpg', 'landlord_gender': 'FM', 'landlord_name': 'simoe陳', 'house_price': '358', 'landlord_jpg': 'http://image.xiaozhustatic1.com/21/4,0,34,9060,260,260,463e7ce3.jpg', 'house_addr': '北京市豐臺(tái)區(qū)萬科紅公寓', 'info_web': 'http://bj.xiaozhu.com/fangzi/1758721035.html'}
{'house_title': '雙井、九龍山地鐵附近首城國際三室一廳整租', 'house_img': 'http://image.xiaozhustatic1.com/12/5,0,24,1688,1800,1200,f90815a2.jpg', 'landlord_gender': 'MM', 'landlord_name': 'uujaa', 'house_price': '688', 'landlord_jpg': 'http://image.xiaozhustatic1.com/21/4,0,97,7907,320,320,708221d7.jpg', 'house_addr': '北京市朝陽區廣渠路36號首城國際', 'info_web': 'http://bj.xiaozhu.com/fangzi/2538072663.html'}
{'house_title': '5號(hào)線/13號(hào)線立水橋地鐵站藍(lán)色小清新臥室', 'house_img': 'http://image.xiaozhustatic1.com/12/2,0,91,4059,1800,1200,17dfcdb2.jpg', 'landlord_gender': 'MM', 'landlord_name': 'evergday', 'house_price': '138', 'landlord_jpg': 'http://image.xiaozhustatic1.com/21/2,0,19,3529,260,260,20a5e73d.jpg', 'house_addr': '北京市昌平區(qū)陳家營西路2號(hào)院', 'info_web': 'http://bj.xiaozhu.com/fangzi/1195183455.html'}
{'house_title': '地鐵5號(hào)13號(hào)立水橋南 浪漫溫馨大床房', 'house_img': 'http://image.xiaozhustatic1.com/12/3,0,20,5777,1800,1200,2ca35fc1.jpg', 'landlord_gender': 'FM', 'landlord_name': '安安同學(xué)', 'house_price': '138', 'landlord_jpg': 'http://image.xiaozhustatic1.com/21/5,0,96,1679,414,414,dde31210.jpg', 'house_addr': '北京市朝陽區(qū)北苑北苑家園錦芳路', 'info_web': 'http://bj.xiaozhu.com/fangzi/1779817335.html'}
{'house_title': '緊鄰10號(hào)線,三站奔中關(guān),屬于你的單人間', 'house_img': 'http://image.xiaozhustatic1.com/12/1,0,84,3843,825,550,55601bf5.jpg', 'landlord_gender': 'MM', 'landlord_name': 'Sharon_bj', 'house_price': '198', 'landlord_jpg': 'http://image.xiaozhustatic1.com/21/1,0,20,4699,260,260,a0108f46.jpg', 'house_addr': '北京市海淀區(qū)藍(lán)靛廠中路', 'info_web': 'http://bj.xiaozhu.com/fangzi/508913100.html'}
4. 總結
- `format` 方法使用起來極為方便
- 同一個需求數據可以從不同地方獲取數據
- 可以獲取詳情頁鏈接后立馬就抓取詳情頁信息, 也可以獲取全部詳情頁鏈接后再抓取信息
- `if else` 在不確定數據是否存在時可以很方便的使用