第一部分 爬蟲(chóng)
- 數(shù)據(jù)來(lái)源:房天下
- 網(wǎng)頁(yè)結(jié)構(gòu)分析
- 通過(guò)抓包分析網(wǎng)頁(yè)信息否过,房源信息就是包含在當(dāng)前HTML文件中汽摹。
- 目標(biāo)URL規(guī)律:
第一頁(yè):https://lz.esf.fang.com/house/i31/
第二頁(yè):https://lz.esf.fang.com/house/i32/
第三頁(yè):https://lz.esf.fang.com/house/i33/
......
第十頁(yè):https://lz.esf.fang.com/house/i310/
從中可以看出李丰,變化的只是最后面的一部分,那么實(shí)現(xiàn)多頁(yè)爬取時(shí)構(gòu)造新的URL就比較容易逼泣。(拼接頁(yè)數(shù)就可以)
-
爬取內(nèi)容分析
為了得到更多的有用信息，需要進(jìn)行詳情頁(yè)的跳轉(zhuǎn)，也就是說(shuō)，首先獲取詳情頁(yè)鏈接，然后再請(qǐng)求獲取房源信息。
info1.png
info2.png 難點(diǎn)
- 重定向
當(dāng)興高采烈的拿著地址去訪問(wèn)的時(shí)候氏仗,返回信息如下吉捶,臉黑了,說(shuō)明發(fā)生了重定向問(wèn)題皆尔。在請(qǐng)求該地址后呐舔,會(huì)出現(xiàn)短暫的“跳轉(zhuǎn)”字眼。
跳轉(zhuǎn).png
那么我們就在這個(gè)網(wǎng)頁(yè)信息里查找下一個(gè)請(qǐng)求地址慷蠕,如下圖可以看到點(diǎn)擊跳轉(zhuǎn)前有我們想要的信息珊拼,這是目標(biāo)網(wǎng)頁(yè)請(qǐng)求地址。
點(diǎn)擊跳轉(zhuǎn).png
在進(jìn)行詳情頁(yè)跳轉(zhuǎn)的時(shí)候也存在這個(gè)問(wèn)題流炕,分析思路是一樣的澎现。
- 驗(yàn)證碼
selenium + 云打碼平臺(tái)解決或者人工輸入
-
爬取思路
過(guò)程.png 代碼實(shí)現(xiàn)
- 詳情頁(yè)地址
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
# 自定義的UA庫(kù)
from UA import ua
import random
from lxml import etree
import time
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from selenium import webdriver
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
headers = {'User-Agent': ''}
def get_detail_url(url):
    """Collect the detail-page URLs from one listing page into the global
    ``house`` list.

    The listing URL first returns an interstitial "redirect" page; the real
    listing address is read from the ``btn-redir`` anchor and requested again.
    On any failure a captcha wall is assumed: it is solved manually via
    ``process_captcha()`` and the same page is retried.

    :param url: listing-page URL, e.g. ``https://lz.esf.fang.com/house/i31/``
    """
    headers['User-Agent'] = random.choice(ua)
    try:
        r = requests.get(url, headers=headers, verify=False)
        html = etree.HTML(r.text)
        # After the interstitial hop, this anchor holds the real page address.
        real_url = html.xpath('//a[@class="btn-redir"]/@href')[0]
        r = requests.get(real_url, headers=headers, verify=False)
        html = etree.HTML(r.text)
        hrefs = html.xpath('//div[@class="shop_list shop_list_4"]/dl/dt/a/@href')
        channels = html.xpath('//div[@class="shop_list shop_list_4"]/dl/dt/a/@data_channel')
        next_urls = ['https://lz.esf.fang.com' + href + '?channel=' + channel
                     for href, channel in zip(hrefs, channels)]
        house.extend(next_urls)
    except Exception:
        # Narrowed from a bare ``except:`` so Ctrl-C still stops the run.
        # Any parse/network failure is treated as the captcha page: solve it,
        # then retry the same listing page.
        process_captcha()
        get_detail_url(url)
def process_captcha():
    """Open a fixed listing page in Firefox so the operator can type the
    captcha by hand, submit it, then close the browser.

    The exact URL does not matter: once the site raises its captcha wall,
    any page will show the form.
    """
    captcha_url = 'https://lz.esf.fang.com/chushou/3_416752691.htm?channel=2,2'
    browser = webdriver.Firefox()
    browser.get(captcha_url)
    # Give the operator twelve seconds to enter the captcha manually.
    time.sleep(12)
    browser.find_element_by_name('submit').click()
    browser.close()
if __name__ == '__main__':
    '''
    The site only seems to expose about 100 result pages; for wider coverage,
    crawl district by district (or even finer subdivisions).
    '''
    house = []
    # Pages run 1..100, so the upper bound must be 101 — the original
    # ``range(1, 100)`` stopped one page short of the stated 100.
    for i in range(1, 101):
        print('--------------------------------')
        print(f'開(kāi)始爬取第{i}頁(yè)')
        url = f'https://lz.esf.fang.com/house/i3{i}/'
        get_detail_url(url)
    print('爬取結(jié)束！')
    # ``with`` guarantees the file is closed even if a write fails.
    with open('urls.txt', 'a+', encoding='utf8') as f:
        for detail_url in house:
            f.write(detail_url + '\n')
- 房屋信息
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
from UA import ua
import random
from lxml import etree
import time
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import csv
from selenium import webdriver
from PIL import Image
# 云打碼平臺(tái)API
from vcode import *
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
headers = {'User-Agent': ''}
def get_info(url):
    """Fetch one listing's detail page and append its attributes to house.csv.

    Handles the site's interstitial redirect and its captcha wall: on any
    failure the captcha is solved via the coding platform and the URL retried
    at most once; listings that turn out to be removed are recorded in the
    global ``delete`` list.

    :param url: detail-page URL collected by the first script
    """
    headers['User-Agent'] = random.choice(ua)
    try:
        r = requests.get(url, headers=headers, verify=False, timeout=60)
        html = etree.HTML(r.text)
        # The first response is an interstitial page; this anchor holds the
        # real detail-page address.
        detail_url = html.xpath('//a[@class="btn-redir"]/@href')[0]
        r = requests.get(detail_url, headers=headers, verify=False, timeout=60)
        html = etree.HTML(r.text)
        total_price = html.xpath('//div[@class="tab-cont-right"]/div[1]/div[1]/div[1]/i/text()')[0]
        style = html.xpath('//div[@class="tab-cont-right"]/div[@class="tr-line clearfix"][1]/div[1]/div[1]/text()')[
            0].replace('\n', '').strip()
        area = html.xpath('//div[@class="tab-cont-right"]/div[@class="tr-line clearfix"][1]/div[2]/div[1]/text()')[0]
        unit_price = \
            html.xpath('//div[@class="tab-cont-right"]/div[@class="tr-line clearfix"][1]/div[3]/div[1]/text()')[0]
        direction = html.xpath('//div[@class="tab-cont-right"]/div[@class="tr-line clearfix"][2]/div[1]/div[1]/text()')[
            0]
        floor = html.xpath('//div[@class="tab-cont-right"]/div[@class="tr-line clearfix"][2]/div[2]/div[1]/text()')[0]
        decoration = \
            html.xpath('//div[@class="tab-cont-right"]/div[@class="tr-line clearfix"][2]/div[3]/div[1]/text()')[0]
        local = html.xpath('//div[@class="tab-cont-right"]/div[@class="tr-line"]/div[2]/div[2]/a[1]/text()')[0].replace(
            '\n', '').strip()
        # A non-empty node list means a school is advertised; store as 0/1.
        school = html.xpath('//div[@class="tab-cont-right"]/div[@class="tr-line"]/div[3]')
        if len(school):
            school = 1
        else:
            school = 0
        data = {'總價(jià)': total_price,
                '戶型': style,
                '建筑面積': area,
                '單價(jià)': unit_price,
                '朝向': direction,
                '樓層': floor,
                '裝修': decoration,
                '區(qū)域': local,
                '學(xué)校': school}
        content = {'建筑年代': '',
                   '有無(wú)電梯': '',
                   '產(chǎn)權(quán)性質(zhì)': '',
                   '住宅類別': '',
                   '建筑結(jié)構(gòu)': '',
                   '建筑類別': ''}
        # ``info`` alternates label/value pairs; the trailing two entries are
        # not part of the table, hence the ``- 2``.
        info = html.xpath('//div[@class="content-item fydes-item"]/div[2]//span/text()')
        for i in range(int((len(info) - 2) / 2)):
            content[info[2 * i]] = info[2 * i + 1]
        to_csv(data, content)
    except Exception:
        # Narrowed from a bare ``except:``; any failure is assumed to be the
        # captcha wall or a removed listing.
        title = process_captcha(url)
        # If the browser lands back on the search page, the listing is gone:
        # record it and move on instead of retrying.
        if title == '蘭州二手房-房天下':
            delete.append(url)
        else:
            # Retry each URL at most once so a persistent failure cannot loop
            # forever (and keep burning coding-platform credit).
            if url in flag:
                return
            flag.append(url)
            get_info(url)
def to_csv(data, content):
    """Append one house record to ``house.csv`` as a single CSV row.

    :param data: dict of attributes scraped from the summary panel
    :param content: dict of attributes scraped from the details table
    """
    row = [
        data['戶型'], data['建筑面積'], data['朝向'], data['樓層'], data['裝修'],
        content['建筑年代'], content['有無(wú)電梯'], content['產(chǎn)權(quán)性質(zhì)'], content['住宅類別'],
        content['建筑結(jié)構(gòu)'], content['建筑類別'], data['區(qū)域'],
        data['學(xué)校'], data['總價(jià)'], data['單價(jià)'],
    ]
    with open('house.csv', 'a+', encoding='utf-8', newline='') as f:
        csv.writer(f).writerow(row)
def process_captcha(url):
    """Solve the captcha shown for ``url`` via the Chaojiying coding platform
    and return the title of the page reached afterwards.

    The captcha image is cut out of a full-page screenshot at fixed pixel
    coordinates, posted to the platform, and the answer is typed into the
    form before re-requesting the original URL.

    :param url: the URL whose request triggered the captcha wall
    :return: the browser page title after the captcha was submitted
    """
    driver = webdriver.Firefox()
    driver.get(url)
    print(url)
    driver.save_screenshot('code.png')
    # Fixed crop box around the captcha inside the screenshot
    # (left, top, right, bottom) — tied to the browser window size.
    left = 700
    top = 340
    right = 900
    bottom = 405
    im = Image.open('code.png')
    im = im.crop((left, top, right, bottom))
    im.save('captcha.png')
    # Platform client: supply your own account and password; '902223' is the
    # software id.  NOTE: the original line separated the arguments with
    # full-width commas (，), which is a SyntaxError — fixed to ASCII commas.
    cjy = Chaojiying_Client(你的賬號(hào), 你的密碼, '902223')
    # ``with`` closes the image file handle (the original leaked it).
    with open('captcha.png', 'rb') as img_file:
        im = img_file.read()
    code = cjy.PostPic(im, 1004).get('pic_str')
    driver.find_element_by_id('code').send_keys(code)
    time.sleep(1)
    driver.find_element_by_name('submit').click()
    time.sleep(2)
    driver.get(url)
    title = driver.title
    driver.close()
    return title
if __name__ == '__main__':
    # Load the detail-page URLs collected by the first script, de-duplicated.
    # ``with`` replaces the original open()/close() pair so the handle is
    # released even if reading fails.
    with open('urls.txt') as f:
        house = list({line.rstrip() for line in f})
    delete = []  # listings that turned out to be removed
    flag = []    # URLs already retried once after a captcha
    # Hard-coded resume index: crawling restarts where the previous run died.
    for i in range(2852, len(house)):
        print(f'開(kāi)始爬取第{i+1}條信息')
        get_info(house[i])
    print('爬取結(jié)束!')
-
結(jié)果展示
結(jié)果.png
共計(jì)8000多條數(shù)據(jù)，和下圖對(duì)應(yīng)。
總體情況.png - 數(shù)據(jù)詳情見(jiàn):https://www.kesci.com/home/dataset/5f073e5ac94d2e002d03522d/files
二、數(shù)據(jù)分析
項(xiàng)目詳情見(jiàn):https://www.kesci.com/home/project/5f098536192ac2002c87c5aa