Results
Detail-page links (screenshot)
Product information (screenshot)
My Code
Main script (main.py)
#-*- coding:utf-8 -*-
from multiprocessing import Pool
from channel_extact import channel_list
from ganji_url_info import get_url_link, url_links, get_goods_info, goodsinfo

# Resume support: work out which detail pages still need to be downloaded
download_Y = [item['url'] for item in goodsinfo.find()]  # detail pages already saved to the database
download_N = [item['url'] for item in url_links.find()]  # the full set of collected detail-page links
Y = set(download_Y)  # as a set
N = set(download_N)  # as a set
need_to_download = N - Y  # links not downloaded yet

def get_all_links(channel):
    # Each category is crawled for at most 100 list pages
    for page in range(1, 101):
        get_url_link(channel, page)

if __name__ == '__main__':
    # Crawl with a process pool; channel_list is a multi-line string,
    # so split() turns it into a list of category URLs first
    pool = Pool()
    pool.map(get_all_links, channel_list.split())
    pool.map(get_goods_info, need_to_download)
    pool.close()
    pool.join()
    # Alternative: crawl directly without a Pool
    # for url in need_to_download:
    #     get_goods_info(url)
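The two list comprehensions above load whole documents just to read one field. An equivalent, slightly leaner form of the same resume check, assuming the same two collections, is MongoDB's distinct():

# Minimal sketch: let MongoDB return only the url values, then diff the two sets
need_to_download = set(url_links.distinct('url')) - set(goodsinfo.distinct('url'))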
Fetching proxy IPs (My_proxies.py)
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
    'Connection': 'keep-alive',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
}

def get_proxies():
    url = 'http://www.xicidaili.com/nn'
    proxies_list = []  # collected proxy addresses
    wb_data = requests.get(url, headers=headers).text
    soup = BeautifulSoup(wb_data, 'lxml')
    # Keep all four selectors on the same rows (class "odd") so that zip()
    # pairs each IP with its own speed and connection-time bars
    ips = soup.select('tr.odd > td:nth-of-type(2)')                        # IP address
    ports = soup.select('tr.odd > td:nth-of-type(3)')                      # port
    speeds = soup.select('tr.odd > td:nth-of-type(7) > div > div')         # response speed
    connect_times = soup.select('tr.odd > td:nth-of-type(8) > div > div')  # connection speed
    # Combine the fields and keep only the fast proxies
    for ip, port, speed, connect_time in zip(ips, ports, speeds, connect_times):
        if speed.get('class')[1] == 'fast' and connect_time.get('class')[1] == 'fast':
            proxies_list.append('http://' + ip.text + ':' + port.text)
    print(proxies_list)
    return proxies_list

if __name__ == '__main__':
    get_proxies()
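With get_proxies() returning the list, the crawler below could draw a random proxy from a freshly scraped pool instead of the hard-coded proxy_list. A minimal sketch of that wiring (headers and error handling omitted):

import random
import requests
from My_proxies import get_proxies

proxy_pool = get_proxies()                     # refresh the proxy pool once at start-up
proxies = {'http': random.choice(proxy_pool)}  # requests expects a {scheme: proxy_url} mapping
resp = requests.get('http://bj.ganji.com/wu/', proxies=proxies, timeout=10)
print(resp.status_code)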
Getting the category links (channel_extact.py)
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import requests
url = 'http://bj.ganji.com/wu/'
url_host = 'http://bj.ganji.com'
def get_channel_link(url):
    # Print the link of every second-hand goods category on the index page
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    channel_links = soup.select('dl.fenlei > dt > a')
    #print(channel_links)
    for channel in channel_links:
        print(url_host + channel.get('href'))
channel_list ='''
http://bj.ganji.com/jiaju/
http://bj.ganji.com/rirongbaihuo/
http://bj.ganji.com/shouji/
http://bj.ganji.com/bangong/
http://bj.ganji.com/nongyongpin/
http://bj.ganji.com/jiadian/
http://bj.ganji.com/ershoubijibendiannao/
http://bj.ganji.com/ruanjiantushu/
http://bj.ganji.com/yingyouyunfu/
http://bj.ganji.com/diannao/
http://bj.ganji.com/xianzhilipin/
http://bj.ganji.com/fushixiaobaxuemao/
http://bj.ganji.com/meironghuazhuang/
http://bj.ganji.com/shuma/
http://bj.ganji.com/laonianyongpin/
http://bj.ganji.com/xuniwupin/
'''
# The following three categories use a page format different from the ones above
#http://bj.ganji.com/qitawupin/
#http://bj.ganji.com/ershoufree/
#http://bj.ganji.com/wupinjiaohuan/
#get_channel_link(url)
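Since channel_list is stored as one multi-line string, callers have to split it before iterating; a quick check of what main.py actually passes to the pool:

channels = channel_list.split()  # whitespace split -> list of category URLs
print(len(channels))             # 16 categories
print(channels[0])               # http://bj.ganji.com/jiaju/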
Getting each category's listing links and scraping the detail-page information (ganji_url_info.py)
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import requests
import pymongo
import time, random
import requests.exceptions
client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']
url_links = ganji['url_links_2']
url_links_zz = ganji['url_links_zz_2']
goodsinfo = ganji['goodsinfos']
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
'Connection': 'keep-alive',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
}
# http://www.xicidaili.com/wn
proxy_list =['http://121.193.143.249:80',
'http://42.159.251.84:41795',
'http://119.6.136.122:80',
'http://101.201.235.141:8000',
'http://118.180.15.152:8102',
'http://123.57.190.51:7777'
]
proxy = random.choice(proxy_list)  # pick a proxy at random
proxies = {'http': proxy}

# Scrape the listing pages of one category and store the detail-page links
def get_url_link(channel, page, who_sells='o'):
    # 'o' = individual sellers; list pages look like {channel}o{page}/
    url_link = '{}{}{}/'.format(channel, str(who_sells), str(page))
    try:
        wb_data = requests.get(url_link, headers=headers)
        time.sleep(1)
        soup = BeautifulSoup(wb_data.text, 'lxml')
    except (requests.exceptions.ProxyError, requests.exceptions.ConnectionError,
            requests.exceptions.ReadTimeout) as e:
        print('Request failed, skipping {}: {}'.format(url_link, e))
        return
    # A valid list page has a pagination bar; without one we are past the last page
    if not soup.select('ul.pageLink.clearfix'):
        return
    data_1 = soup.select('li.js-item > a')  # links to Ganji's own detail pages
    data_2 = soup.select('div.zz-til > a')  # links that jump to 58 Zhuanzhuan
    # Store each title and its link in url_links
    for data in data_1:
        if 'biz.click.ganji.com' not in data.get('href'):  # skip ad/tracking links
            url_links.insert_one({'title': data.get_text(strip=True), 'url': data.get('href')})
            print({'title': data.get_text(strip=True), 'url': data.get('href')})
    # Store Zhuanzhuan titles and links separately in url_links_zz
    for data in data_2:
        url_links_zz.insert_one({'title': data.get_text(strip=True), 'url': data.get('href')})
        # print({'title': data.get_text(strip=True), 'url': data.get('href').split('?')[0]})
# Scrape the detail-page information of one item
def get_goods_info(url):
    try:
        wb_data = requests.get(url, headers=headers, proxies=proxies).text
        soup = BeautifulSoup(wb_data, 'lxml')
        # Skip pages whose listing has been deleted or is otherwise invalid
        if soup.select('div.error'):
            print(url)
            print('This page is Not Found!')
        else:
            # title
            title = soup.select('h1.title-name')[0].get_text() if soup.select('h1.title-name') else None
            # publication time
            if soup.select('i.pr-5'):
                published = soup.select('i.pr-5')[0].get_text(strip=True)
            else:
                published = None
            goods_types = soup.select('div > ul.det-infor > li:nth-of-type(1) > span > a')
            goods_type = [i.get_text(strip=True) for i in goods_types]  # item type
            locations = soup.select('div > ul.det-infor > li:nth-of-type(3) > a')
            location = [i.get_text(strip=True) for i in locations]  # trading location
            price = soup.select('i.f22.fc-orange.f-type')[0].get_text() \
                if soup.select('i.f22.fc-orange.f-type') else None  # price
            # category, taken from the breadcrumb trail
            if len(soup.select('body > div > div > div.h-crumbs > div > a')) >= 3:
                classfy = soup.select('body > div > div > div.h-crumbs > div > a')[2].text
            else:
                classfy = None
            # condition ("新舊程度") is only present on some pages
            if soup.find(text='新舊程度:'):
                degree = soup.select('ul.second-det-infor.clearfix > li')[0].get_text().split()[-1]
            else:
                degree = None
            # Save to the database; if none of the key fields could be scraped,
            # leave the page out so it gets retried on the next run
            if title or published or price:
                goodsinfo.insert_one({'title': title,
                                      'published': published,
                                      'goods_type': goods_type,
                                      'location': location,
                                      'price': price,
                                      'degree': degree,
                                      'url': url,  # used later to work out which links are still missing
                                      'classfy': classfy
                                      })
                print({'title': title,
                       'published': published,
                       'goods_type': goods_type,
                       'location': location,
                       'price': price,
                       'degree': degree,
                       'url': url,
                       'classfy': classfy
                       })
    except (requests.exceptions.ProxyError, requests.exceptions.ConnectionError,
            requests.exceptions.ReadTimeout) as e:
        print('Request failed, skipping {}: {}'.format(url, e))

#url = 'http://bj.ganji.com/yingyouyunfu/2285918732x.htm'
#get_goods_info(url)
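One thing the script does not guard against is inserting the same link twice when a list page is re-crawled. A possible refinement (not part of the original code) is a unique index on the url field, created once with pymongo; the inserts then need to catch DuplicateKeyError:

from pymongo.errors import DuplicateKeyError

url_links.create_index('url', unique=True)  # reject repeated detail-page links

try:
    url_links.insert_one({'title': 'example title', 'url': 'http://bj.ganji.com/example.htm'})  # hypothetical document
except DuplicateKeyError:
    pass  # this link was already stored on an earlier run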
Summary:
- Problems kept coming up during development, and new features were added to the code to handle them and make the program as robust as possible. The main issues were:
    - Too many requests from the same IP got the crawler blocked by the site, so a small program was written to fetch proxy IPs from a proxy-listing site.
    - Failed requests, timeouts and similar errors kept breaking the run; wrapping them in try-except lets the crawler skip the offending page, keep going, and deal with the failed pages later (see the sketch after this list).
    - A check was added for pages that are already invalid (404) or whose content cannot be read, so that they are skipped and the crawl stays efficient.
    - Many of the collected detail-page links redirect to 58 Zhuanzhuan. Those links are stored in a separate MongoDB collection; because the pages have a different layout, a separate function would be needed to scrape their details.
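The "deal with the failed pages later" step falls out of the resume logic in main.py: every successful scrape writes its url into goodsinfo, so re-running the diff shrinks the to-do set. A minimal sketch, assuming the collections and functions defined above, that keeps re-running until no further progress is made:

from ganji_url_info import url_links, goodsinfo, get_goods_info

# Re-run the detail scraper until the set of missing links stops shrinking
# (links that always fail, e.g. deleted listings, will remain and can be inspected by hand)
while True:
    done = set(item['url'] for item in goodsinfo.find())
    todo = set(item['url'] for item in url_links.find()) - done
    if not todo:
        break
    for url in todo:
        get_goods_info(url)
    done_after = set(item['url'] for item in goodsinfo.find())
    if not (todo & done_after):  # nothing new was scraped in this pass, stop retrying
        break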