Design approach
1. From the host page, collect the second-hand market URL (marketurl) of every Ganji city
2. From each marketurl, collect the URL of every category board (blockurl)
3. Parse the page-number element at the foot of each board to work out how many list pages it has, yielding the listurls
ps: Because Ganji's list pages are set up very inconsistently (some boards show no pager control at all, others have only one page yet will serve any page number you ask for, and so on), this step was not implemented; the crawler only fetches the first page of each board, i.e. blockurl = listurl. A rough sketch of what page detection could look like follows this list.
4. Extract all detail-page URLs (detailurl) from each listurl
5. Parse each detail page, distinguish Zhuanzhuan items from Ganji items, and extract the item information
ps: Everything above is written into MongoDB collections
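Step 3 was skipped because of the inconsistent paging, but for reference, here is a minimal sketch of how the last page number might be read from a board's pager. It is not part of the crawler below: the guess_last_page name and the div.pageBox a selector are assumptions for illustration and have not been verified against the live site.

#coding=utf-8
'''Sketch only: guess how many list pages a board has from its pager element'''
import requests
from bs4 import BeautifulSoup

def guess_last_page(blockurl):
    '''Return the largest page number shown in the pager, or 1 when no pager is found.'''
    resp = requests.get(blockurl)
    soup = BeautifulSoup(resp.content, 'lxml')
    # 'div.pageBox a' is an assumed selector for the pager links at the foot of the list page
    pages = [int(a.string) for a in soup.select('div.pageBox a')
             if a.string and a.string.strip().isdigit()]
    return max(pages) if pages else 1

Even with something like this, the boards that serve arbitrary page numbers would still need a stop condition, for example stopping once a page comes back with no listings.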
Code
main.py
#coding=utf-8
'''Crawl item information from every category of the second-hand market in every Ganji city.
Because Ganji's list paging is inconsistent, only the first page of each board is crawled.
For every post the crawler collects: item title, posting time, type, price, trading location, condition, etc.
Intended to run with multiple processes eventually (not implemented yet; see the summary).
'''
from configparser import ConfigParser
from pymongo import MongoClient
from ganji_crawler import get_citys_market, get_block_urls, get_detail_page_urls, getinfo_from_detailpage

cf = ConfigParser()                        # config-file parser
cf.read("ganji.conf")                      # read the config file
db_host = cf.get('mongodb', 'db_host')
db_port = cf.getint('mongodb', 'db_port')
client = MongoClient(db_host, db_port)     # connect to MongoDB
ganji_market = client[cf.get('databases', 'db_market')]   # second-hand market database

# collection handles
citys_market = ganji_market[cf.get('collections', 'collection_citys')]                # city / market URLs
city_market_block_url = ganji_market[cf.get('collections', 'collection_block_url')]   # board URLs per city
market_detailpage_url = ganji_market[cf.get('collections', 'collection_detail_url')]  # detail-page URLs
market_goods_infos = ganji_market[cf.get('collections', 'collection_goods_info')]     # item information

# unique index on 'link' so duplicate URLs are rejected (DuplicateKeyError is caught in the crawler)
city_market_block_url.create_index('link', unique=True)
market_detailpage_url.create_index('link', unique=True)

############# execution #############
start_url = 'http://www.ganji.com/index.htm'   # Ganji home page listing all cities
get_citys_market(start_url, citys_market)      # store each city's second-hand market URL

for city in citys_market.find():
    get_block_urls(city['link'], city_market_block_url)             # store board URLs

for block in city_market_block_url.find({}, {'link': 1, '_id': 0}):
    get_detail_page_urls(block['link'], market_detailpage_url)      # store detail-page URLs

for idx, detail in enumerate(market_detailpage_url.find({}, {'link': 1, '_id': 0})):  # store item information
    getinfo_from_detailpage(detail['link'], market_goods_infos)
    if idx % 1000 == 0:
        print('{} records have been inserted!'.format(idx))
ganji_crawler.py
#coding=utf-8
'''Crawl Ganji second-hand market data'''
from bs4 import BeautifulSoup
from pymongo import MongoClient, errors
import requests, re, time
def get_citys_market(host, collection):
    '''Fetch every city's second-hand market URL from the Ganji home page and store it.'''
    resp = requests.get(host)
    soup = BeautifulSoup(resp.content, 'lxml')
    links = soup.select('div.all-city > dl > dd > a')  # all links in the city list
    for link in links:
        collection.insert_one({
            'link': link['href'] + 'wu/',
            'city': link.string
        })
def get_block_urls(city_market_url, collection):
    '''Fetch all board URLs from one city's second-hand market page.
    city_market_url: the second-hand market URL of one city
    collection: the collection the parsed board URLs are written to'''
    resp = requests.get(city_market_url)
    soup = BeautifulSoup(resp.content, 'lxml')
    try:
        div_navigate = soup.select('div.main')[0]
    except IndexError:
        return
    for a in div_navigate.select('a'):
        try:
            href = a['href']
            if href.startswith('/'):  # drop dirty data: the "all categories" area contains hrefs starting with '#'
                # assemble the board URL; Ganji's URL layout is odd here, so the trailing 'wu/' is stripped first
                collection.insert_one({'link': city_market_url[:-4] + href})
        except errors.DuplicateKeyError as e:
            print(e)
def get_detail_page_urls(blockurl, collection):
    '''Working out how many pages a Ganji board has is a real pain: there is no consistent rule at all,
    so for now only the detail-page URLs on the first page of each board are extracted.'''
    resp = myRequestGet(blockurl)
    if not resp:
        return
    soup = BeautifulSoup(resp.content, 'lxml')
    try:
        layoutlist = soup.select('dl.list-bigpic.clearfix')  # one dl tag per listing
    except IndexError:
        return
    time.sleep(1)
    for layout in layoutlist:
        links = layout.select('a')  # all links inside this listing tag
        for link in links:
            href = link['href']
            if href.startswith('http://m.zhuanzhuan.58.com'):  # Zhuanzhuan listing URLs
                # the mobile Zhuanzhuan page loads the item info with js, so rewrite the URL
                # into the desktop form that the css paths below can parse
                infoId = re.findall(r'infoId=(\d+)&', href)[0]
                href = 'http://zhuanzhuan.58.com/detail/{}z.shtml'.format(infoId)
                try:
                    collection.insert_one({'source': 'zhuanzhuan', 'link': href})  # store the rewritten Zhuanzhuan URL
                except errors.DuplicateKeyError as e:
                    print(e)
            elif href.endswith('.htm'):
                try:
                    collection.insert_one({'source': 'ganji', 'link': href})  # store the Ganji URL
                except errors.DuplicateKeyError as e:
                    print(e)
def getinfo_from_detailpage(detailurl, collection):
    '''Parse item information from a detail page.'''
    resp = myRequestGet(detailurl)
    if not resp:
        return
    soup = BeautifulSoup(resp.content, 'lxml')
    time.sleep(0.1)
    if resp.url.startswith('http://zhuanzhuan'):  # Zhuanzhuan item
        try:
            title = soup.select('h1.info_titile')[0].string
            price = ''.join(soup.select('span.price_now')[0].stripped_strings)
            area = soup.select('div.palce_li > span > i')[0].string
            desc = soup.select('div.baby_kuang.clearfix > p')[0].string
        except IndexError:
            return
        # insert into the collection
        collection.insert_one({
            'source': 'zhuanzhuan',
            'title': title,
            'price': price,
            'area': area,
            'desc': desc})
    else:  # Ganji item
        try:
            title = soup.select('h1.title-name')[0].string
            price = soup.select('i.f22.fc-orange.f-type')[0].string
            area = ''.join(soup.select('ul.det-infor > li:nth-of-type(3)')[0].stripped_strings)
            desc = soup.select('.second-sum-cont')[0].get_text().strip()
        except IndexError:
            return
        collection.insert_one({
            'source': 'ganji',
            'title': title,
            'price': price,
            'area': area,
            'desc': desc})
def myRequestGet(url):
    '''The crawler does get blocked occasionally, so requests.get is wrapped here:
    on any exception, sleep for 10 seconds and return None.'''
    try:
        resp = requests.get(url)
        return resp
    except requests.exceptions.RequestException as e:
        print('Requests Error -----------{}-----------wait 10 seconds'.format(str(e.__class__)))
        time.sleep(10)
        return None
    except Exception as e:
        print('Other Error -----------{}-----------wait 10 seconds'.format(str(e.__class__)))
        time.sleep(10)
        return None

# Check whether a page exists. The check needs the parsed page anyway, so the soup object
# is passed in instead of the URL. (Defined for later use; not called in the current pipeline.)
def exists(soup):
    if soup.title.string == '您訪問的網(wǎng)頁不存在':  # literally: "the page you visited does not exist"
        return False
    else:
        return True
if __name__ == '__main__':
    ############## initialisation ##############
    client = MongoClient('mongodb://localhost:27017')
    ganji_market = client['ganji_market']  # Ganji second-hand market database
    host = 'http://www.ganji.com/index.htm'
    # collection for the cities and their second-hand market URLs
    citys_market = ganji_market['citys_market']
    # collection for each city's board URLs, with a unique index on 'link' to avoid duplicates
    city_market_block_url = ganji_market['city_market_block_url']
    city_market_block_url.create_index('link', unique=True)
    # collection for detail-page URLs
    market_detailpage_url = ganji_market['market_detailpage_url']
    market_detailpage_url.create_index('link', unique=True)
    # collection for item information
    market_goods_infos = ganji_market['market_goods_infos']
    ############## execution ##############
    # earlier pipeline steps, already run and therefore commented out:
    # get_citys_market(host, citys_market)                                 # store city / market URLs
    # for city in citys_market.find():
    #     get_block_urls(city['link'], city_market_block_url)              # store board URLs
    # get_block_urls('http://xa.ganji.com/wu/', city_market_block_url)     # single-city test call
    # for block in city_market_block_url.find({}, {'link': 1, '_id': 0}):
    #     get_detail_page_urls(block['link'], market_detailpage_url)       # store detail-page URLs
    # get_detail_page_urls('http://xa.ganji.com/ershoubijibendiannao/', market_detailpage_url)  # single-board test call
    for idx, detail in enumerate(market_detailpage_url.find({}, {'link': 1, '_id': 0}).skip(12000)):  # store item information
        getinfo_from_detailpage(detail['link'], market_goods_infos)
        if idx % 1000 == 0:
            print('{} records have been inserted!'.format(idx))
    # single-page test calls:
    # getinfo_from_detailpage('http://zhuanzhuan.58.com/detail/755842657703362564z.shtml', market_goods_infos)
    # getinfo_from_detailpage('http://xa.ganji.com/ershoubijibendiannao/2266604261x.htm', market_goods_infos)
ganji.conf
[mongodb]
db_host = localhost
db_port = 27017
[databases]
db_market = ganji_market
[collections]
collection_citys = citys_market
collection_block_url = city_market_block_url
collection_detail_url = market_detailpage_url
collection_goods_info = market_goods_infos
Summary
1. Multiprocessing was not used because every function takes two arguments (one of them a collection object), so the functions were not reworked to fit map for now; the multiprocessing and threading modules still need more study. A sketch of one way to parallelise the detail-page step follows this list.
2. To make the program more robust, several try blocks were added; when the crawler gets blocked it pauses for 10 seconds and then carries on, which has worked well so far.
3. The scraping is written against fixed page templates, so the generality problem remains; whether Scrapy would solve it is an open question.
4. The configparser module was introduced to read settings from a config file; this can be reused in future projects.
5. Because the crawler got blocked, only about 200,000 detail-page URLs were captured, so only that many items could be parsed. The robustness improvements were added afterwards and the crawl has not been rerun.
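For point 1, one workable route that keeps the current two-argument functions is to give every worker its own MongoDB connection, since pymongo client objects should not be shared across processes. The sketch below is only a minimal illustration under that assumption: the crawl_one worker function is new, while the database and collection names match the ones used above.

#coding=utf-8
'''Sketch only: crawl the detail pages with a process pool'''
from multiprocessing import Pool
from pymongo import MongoClient
from ganji_crawler import getinfo_from_detailpage

def crawl_one(detail_url):
    # each task opens its own client because MongoClient instances should not be shared across processes
    client = MongoClient('localhost', 27017)
    try:
        getinfo_from_detailpage(detail_url, client['ganji_market']['market_goods_infos'])
    finally:
        client.close()

if __name__ == '__main__':
    client = MongoClient('localhost', 27017)
    urls = [d['link'] for d in
            client['ganji_market']['market_detailpage_url'].find({}, {'link': 1, '_id': 0})]
    with Pool(4) as pool:   # 4 worker processes; tune to the machine and to how tolerant the site is
        pool.map(crawl_one, urls)

Opening a client per task is the simplest thing that works but not the cheapest; a Pool initializer that creates one client per worker process would be the natural next refinement.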