本文是利用Python對(duì)鏈家網(wǎng)長(zhǎng)沙房源信息進(jìn)行數(shù)據(jù)采集、提取并進(jìn)行分析的練習(xí)。
先總結(jié)下:
- user-agent : 采用萬(wàn)行的user列表,每次隨機(jī)使用,偽造瀏覽器以及屏幕和系統(tǒng)等信息;
- cookie :帶真實(shí)cookie;
- 隨機(jī)休息時(shí)間;
- 使用logging庫(kù)和相關(guān)函數(shù)提醒進(jìn)度和錯(cuò)誤,方便查找錯(cuò)誤;
- 嘗試采用peewee庫(kù)和數(shù)據(jù)庫(kù)函數(shù)將數(shù)據(jù)導(dǎo)入mysql中。
數(shù)據(jù)采集
經(jīng)過對(duì)目標(biāo)網(wǎng)站的網(wǎng)頁(yè)進(jìn)行分析可以知道,目標(biāo)數(shù)據(jù)都存在源代碼中。
瀏覽器偽裝并爬取網(wǎng)頁(yè)
import requests
import random
from datetime import datetime
from bs4 import BeautifulSoup
import logging
import time
from peewee import *
hds = [{'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
{'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
{'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
{'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'},
{'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'},
{'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
{'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
{'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0'},
{'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
{'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
{'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
{'User-Agent': 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},
{'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'}]
hd = {
'Host': 'cs.lianjia.com',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Referer': 'https://cs.lianjia.com/zufang/yuhua/',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cookie': 'lianjia_uuid=96b42f77-56b8-490d-baa9-beaf6865a9c4; _smt_uid=5c724732.b80d85e; UM_distinctid=1691e661dd36c9-08249c43205ce8-3d644701-144000-1691e661dd44ff; _jzqc=1; _ga=GA1.2.186869930.1550993207; _gid=GA1.2.1359688634.1550993207; select_city=430100; _jzqx=1.1550993203.1551081745.3.jzqsr=cn%2Ebing%2Ecom|jzqct=/.jzqsr=cs%2Elianjia%2Ecom|jzqct=/xiaoqu/wangcheng/; _jzqckmp=1; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1550993227,1550993401,1551082378,1551095350; _jzqa=1.3371712841637333500.1550993203.1551094837.1551107638.6; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1551108287; lianjia_ssid=ac48bf96-5661-bd1a-2293-92ba40145fa0; CNZZDATA1273627291=1275184754-1551151174-https%253A%252F%252Fcs.lianjia.com%252F%7C1551151174'
}
logging.basicConfig(
format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
#設(shè)定日志提醒格式和等級(jí)
def get_source_code(url):
try:
result = requests.get(
url, headers=hds[random.randint(0, len(hds) - 1)])
source_code = result.content
except Exception as e:
print(e)
return
return source_code
#隨機(jī)使用user_agent
提取網(wǎng)頁(yè)信息
經(jīng)過分析網(wǎng)頁(yè)邏輯,發(fā)現(xiàn)通過分地區(qū)然后分小區(qū)獲取房源比較全面
該部分函數(shù)邏輯較復(fù)雜 先進(jìn)行簡(jiǎn)單介紹:
行政區(qū)列表:regionlist;
小區(qū)列表:通過get_community函數(shù)可以得到;
小區(qū)信息:getCommunity_byRegionlist(regionlist):該函數(shù)根據(jù)行政區(qū)返回小區(qū)信息;
日志控制:check_blocked函數(shù)用于及時(shí)提醒因?yàn)榕廊∷俣冗^快導(dǎo)致需要輸入驗(yàn)證碼的情況;log_progress:實(shí)時(shí)提供爬蟲進(jìn)度信息。
獲取每個(gè)地區(qū)小區(qū)頁(yè)面數(shù)量
regionlist = ['yuhua','yuelu','tianxin','kaifu','furong','wangcheng']
# 地區(qū)列表
def get_total_pages(url):
source_code = get_source_code(url)
soup = BeautifulSoup(source_code, 'lxml')
total_pages = 0
try:
page_info = soup.find('div', {'class': 'page-box house-lst-page-box'})
except AttributeError as e:
page_info = None
if page_info == None:
logging.error('can not find pages')
return
page_info_str = page_info.get('page-data').split(',')[0]
total_pages = int(page_info_str.split(':')[1])
return total_pages
獲取每個(gè)地區(qū)的小區(qū)信息
def get_community(region):
baseUrl = 'https://cs.lianjia.com/xiaoqu/'
url = baseUrl + region+'/'
source_code = get_source_code(url)
soup = BeautifulSoup(source_code, 'lxml')
if check_blocked(soup):
return
total_pages = get_total_pages(url)
for page in range(total_pages):
if page > 0:
url_page = baseUrl + region + "/pg%d/" % page
source_code = get_source_code(url_page)
soup = BeautifulSoup(source_code, 'lxml')
nameList = soup.find_all("li", {"class": "clear"})
i = 0
log_progress("GetCommunity",
region, page + 1, total_pages)
for name in nameList:
info_dict = {}
communitytitle = name.find("div", {"class": "title"})
title = communitytitle.get_text().strip('\n')
link = communitytitle.a.get('href')
info_dict.update({u'title': title})
info_dict.update({u'link': link})
district = name.find("a", {"class": "district"})
info_dict.update({u'district': district.get_text()})
bizcircle = name.find("a", {"class": "bizcircle"})
info_dict.update({u'bizcircle': bizcircle.get_text()})
tagList = name.find("div", {"class": "tagList"})
info_dict.update({u'tagList': tagList.get_text().strip('\n')})
onsale = name.find("a", {"class": "totalSellCount"})
info_dict.update(
{u'onsale': onsale.span.get_text().strip('\n')})
onrent = name.find("a", {"title": title + u"租房"})
info_dict.update(
{u'onrent': onrent.get_text().strip('\n').split(u'套')[0]})
info_dict.update({u'id': name.get('data-housecode')})
price = name.find("div", {"class": "totalPrice"})
info_dict.update({u'price': price.span.get_text().strip('\n')})
communityinfo = get_communityinfo_by_url(link)
for key, value in communityinfo.items():
info_dict.update({key: value})
#連接數(shù)據(jù)庫(kù)
Community.insert(info_dict).on_conflict('replace').execute()
time.sleep(random.randint(1,5)) #時(shí)間延遲
def get_communityinfo_by_url(url):
source_code = get_source_code(url)
soup = BeautifulSoup(source_code, 'lxml')
if check_blocked(soup):
return
communityinfos = soup.find_all("div", {"class": "xiaoquInfoItem"})
res = {}
for info in communityinfos:
key_type = {
"建筑年代": 'year',
"建筑類型": 'housetype',
"物業(yè)費(fèi)用": 'cost',
"物業(yè)公司": 'service',
"開發(fā)商": 'company',
"樓棟總數(shù)": 'building_num',
"房屋總數(shù)": 'house_num',
}
try:
key = info.find("span", {"xiaoquInfoLabel"})
value = info.find("span", {"xiaoquInfoContent"})
key_info = key_type[key.get_text().strip()]
value_info = value.get_text().strip()
res.update({key_info: value_info})
except:
continue
return res
def getCommunity_byRegionlist(regionlist):
logging.info("Get Community Infomation")
starttime = datetime.now()
for region in regionlist:
try:
get_community(region)
logging.info(region + "Done")
except Exception as e:
logging.error(e)
logging.error(region + "Fail")
pass
endtime = datetime.now()
logging.info("Run time: " + str(endtime - starttime))
日志控制
def check_blocked(soup):
if soup.title.string == '414 Request-URI Too Large':
logging.error('IP is blocked')
return True
return False
def log_progress(function, address, page, total):
logging.info("Progress: %s %s: current page %d total pages %d" %
(function, address, page, total))
數(shù)據(jù)庫(kù)設(shè)置
database = MySQLDatabase(
'house',
host='localhost',
port=3306,
user='root',
passwd='',
charset='utf8',
use_unicode=True,
)
class BaseModel(Model):
class Meta:
database = database
class Community(BaseModel):
id = BigIntegerField(primary_key=True)
title = CharField()
link = CharField(unique=True)
district = CharField()
bizcircle = CharField()
tagList = CharField(null=True)
onsale = CharField()
onrent = CharField(null=True)
year = CharField(null=True)
housetype = CharField(null=True)
cost = CharField(null=True)
service = CharField(null=True)
company = CharField(null=True)
building_num = CharField(null=True)
house_num = CharField(null=True)
price = CharField(null=True)
validdate = DateTimeField(default=datetime.now)
def database_init():
database.connect()
database.create_tables(Community, safe=True)
database.close()
#數(shù)據(jù)庫(kù)表設(shè)定