Code
# pre.py
from bs4 import BeautifulSoup
import requests
#
# From the "all categories" page, find every channel entry point
#
def parse_list():
    weburl = 'http://bj.ganji.com/wu/'
    web_data = requests.get(weburl)
    # Decode from the raw bytes so that from_encoding takes effect (it is ignored for str input)
    soup = BeautifulSoup(web_data.content, 'lxml', from_encoding='utf-8')
    suburllist = soup.select('#wrapper > div.content > div > div > dl > dt > a')
    for suburl in suburllist:
        print('http://bj.ganji.com' + suburl.get('href'))
# The channel entry points found above (captured output of parse_list)
category_list = '''
http://bj.ganji.com/jiaju/
http://bj.ganji.com/rirongbaihuo/
http://bj.ganji.com/shouji/
http://bj.ganji.com/shoujihaoma/
http://bj.ganji.com/bangong/
http://bj.ganji.com/nongyongpin/
http://bj.ganji.com/jiadian/
http://bj.ganji.com/ershoubijibendiannao/
http://bj.ganji.com/ruanjiantushu/
http://bj.ganji.com/yingyouyunfu/
http://bj.ganji.com/diannao/
http://bj.ganji.com/xianzhilipin/
http://bj.ganji.com/fushixiaobaxuemao/
http://bj.ganji.com/meironghuazhuang/
http://bj.ganji.com/shuma/
http://bj.ganji.com/laonianyongpin/
http://bj.ganji.com/xuniwupin/
http://bj.ganji.com/qitawupin/
http://bj.ganji.com/ershoufree/
http://bj.ganji.com/wupinjiaohuan/
'''
if __name__ == '__main__':
    parse_list()
# splider1.py
from bs4 import BeautifulSoup
from multiprocessing import Pool
import requests
import time
import pymongo
import pre
client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']
t_urllist = ganji['t_urllist']
#
# Parse one page of list entries and store them in the database
#
def parse_list(url):
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    # A "Zhuanzhuan"-hosted list page that still has data
    if soup.find('table', 'tbimg'):
        titles = soup.select('#infolist > div.infocon > table > tbody > tr.zzinfo > td.t > a')
        for title in titles:
            t_urllist.insert_one({'title': title.get_text(), 'url': title.get('href'), 'type': 'zz', 'flag': False})
            # print('{} ==> {}'.format(title.get_text(), title.get('href')))
    # A Ganji-native list page that still has data
    elif soup.find('div', 'layoutlist') and soup.find('ul', 'pageLink clearfix'):
        titles = soup.select('#wrapper > div.leftBox > div.layoutlist > dl > dt > a')
        for title in titles:
            t_urllist.insert_one({'title': title.get('title'), 'url': title.get('href'), 'type': 'nm', 'flag': False})
            # print('{} ==> {}'.format(title.get('title'), title.get('href')))
    # No data on this page
    else:
        print('No more pages: ' + url)
#
# Walk through one channel's list pages one by one and store them
#
def process(channel):
    for i in range(1, 100):
        # The first page is a special case: appending 'o1' opens page 2, not page 1
        if i == 1:
            parse_list(channel)
        else:
            parse_list('{}o{}/'.format(channel, i))
        # time.sleep(2)
#
# Entry point: parse every channel's list pages with a pool of worker processes
#
if __name__ == '__main__':
    # process('http://bj.ganji.com/bangong/')
    pool = Pool()
    pool.map(process, pre.category_list.split())
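Once splider1.py has run, a quick way to verify what landed in the database is to count documents in the url collection. This is a minimal sketch, assuming the same local MongoDB as above; count_documents requires pymongo 3.7 or later.
# check_urls.py -- optional sanity check, not part of the spiders above
import pymongo

client = pymongo.MongoClient('localhost', 27017)
t_urllist = client['ganji']['t_urllist']
print(t_urllist.count_documents({}))               # total URLs collected
print(t_urllist.count_documents({'type': 'zz'}))   # Zhuanzhuan listings
print(t_urllist.count_documents({'flag': False}))  # not yet crawled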
- Fetch the URLs from the database and parse each detail page
# splider2.py
from bs4 import BeautifulSoup
from multiprocessing import Pool
import requests
import time
import pymongo
client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']
t_urllist = ganji['t_urllist']
t_detail = ganji['t_detail']
#
# Parse a detail page hosted on the "Zhuanzhuan" platform
#
def parse_zz_detail(url):
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    # A "sold out" badge means the item has been taken down
    if soup.find('span', 'soldout_btn'):
        print('Item taken down: ' + url)
    else:
        titles = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > h1')
        prices = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span > i')
        areas = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.palce_li > span > i')
        categories = soup.select('#nav > div')
        data = {
            'url': url,
            'title': titles[0].get_text().strip(),
            'price': prices[0].get_text().strip(),
            'area': areas[0].get_text().strip(),
            'category': list(categories[0].stripped_strings)[-1]
        }
        # print(data)
        t_detail.insert_one(data)
#
# Parse a detail page hosted on Ganji's own platform
#
def parse_nm_detail(url):
    web_data = requests.get(url)
    # Ganji-native pages return 404 once the item is gone
    if web_data.status_code == 404:
        print('Item taken down: ' + url)
    else:
        soup = BeautifulSoup(web_data.text, 'lxml')
        titles = soup.select('#wrapper > div.content.clearfix > div.leftBox > div.col-cont.title-box > h1')
        prices = soup.select('#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li > i.f22.fc-orange.f-type')
        areas = soup.select('#wrapper > div.content.clearfix > div.leftBox > div:nth-of-type(2) > div > ul > li:nth-of-type(3) > a')
        categories = soup.select('#wrapper > div.content.clearfix > div.leftBox > div:nth-of-type(2) > div > ul > li:nth-of-type(1) > span > a')
        data = {
            'url': url,
            'title': titles[0].get_text().strip(),
            'price': prices[0].get_text().strip(),
            'area': list(map(lambda x: x.text, areas)),
            'category': list(categories[0].stripped_strings)[-1]
        }
        # print(data)
        t_detail.insert_one(data)
#
# Common parsing entry point: dispatch on the record's type
#
def parse_detail(row):
    print(row)
    if row['type'] == 'zz':
        parse_zz_detail(row['url'])
    else:
        parse_nm_detail(row['url'])
    # Mark the record as processed (update_one replaces the deprecated update)
    t_urllist.update_one({'_id': row['_id']}, {'$set': {'flag': True}})
#
# Entry point: read unprocessed URLs from the database and crawl the detail pages with a pool of worker processes
#
if __name__ == '__main__':
    # parse_zz_detail('http://zhuanzhuan.ganji.com/detail/797106589634494469z.shtml?from=pc&source=ganji&cate=%E5%8C%97%E4%BA%AC%E8%B5%B6%E9%9B%86%7C%E5%8C%97%E4%BA%AC%E4%BA%8C%E6%89%8B%7C%E5%8C%97%E4%BA%AC%E4%BA%8C%E6%89%8B%E6%89%8B%E6%9C%BA&cateurl=bj|wu|shouji')
    # parse_nm_detail('http://bj.ganji.com/bangong/2413656831x.htm')
    rows = t_urllist.find({'flag': False})
    pool = Pool()
    pool.map(parse_detail, rows)
Summary
- Ganji's pagination treats the first page differently from the rest: appending "o1/" does not open page 1 (it actually lands on page 2), so the first page must be fetched via the bare channel URL.
- Ganji serves two kinds of list and item pages: those hosted on the "Zhuanzhuan" platform and those on Ganji's own platform. The two must be told apart both when recognizing list pages and when crawling detail pages.
- On Zhuanzhuan-based list pages, personal listings are distinguished from merchant listings by the CSS class on the <tr> tag (see the sketch below).
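To illustrate that last point, here is a minimal sketch of class-based row filtering. The 'zzinfo' class comes from the selector used in splider1.py; treating 'sale' as the merchant marker class is an assumption for illustration and should be checked against the live markup.
# classify_rows.py -- illustrative sketch only, not part of the spiders above
from bs4 import BeautifulSoup
import requests

def classify_rows(list_url):
    soup = BeautifulSoup(requests.get(list_url).text, 'lxml')
    personal, merchant = [], []
    for row in soup.select('table.tbimg tr.zzinfo'):
        link = row.select_one('td.t > a')
        if link is None:
            continue
        # 'sale' as the merchant marker class is an ASSUMPTION; verify it
        # against the real page before relying on this split.
        if 'sale' in (row.get('class') or []):
            merchant.append(link.get_text(strip=True))
        else:
            personal.append(link.get_text(strip=True))
    return personal, merchant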