緊接著上回的文章,來書寫一個Callback
并演示一下爬蟲吧。
實例分析
以一個實際的例子為主,即展示爬取一本小說為例子。
通過xpath
的獲取,就可以寫下索引頁面的callback
函數,從而產生詳情頁的自定義Request
,具體的事項請見上一回文章。
def qb5200_index_task(response: Response, spider: Request):
    """
    Index-page callback: yield a DETAIL request for the novel listed on the page.

    Only the first two ``<tr>`` rows under ``div#content`` are inspected. Per the
    original author's note, the first row is the table header and the second is
    the novel, so at most one novel is requested.

    :param response: response returned by ``requests``; its body is decoded as GBK
    :param spider: the custom Request being processed; ``spider.url`` is sent as
        the Referer of every request produced here
    :return: generator yielding new custom Request objects
    """
    html = etree.HTML(response.content.decode('gbk'))
    rows = html.xpath('//div[@id="content"]//tr')
    for row in rows[:2]:
        cells = row.xpath('./td[@class="odd"]')
        if not cells:
            # Rows without an "odd" cell (e.g. the header row) carry no novel link.
            continue
        names = cells[0].xpath('./a/text()')
        hrefs = cells[0].xpath('./a/@href')
        if not names or not hrefs:
            # A cell without a usable link is skipped instead of raising
            # IndexError and aborting the whole generator.
            continue
        name = names[0]
        yield Request(
            hrefs[0], name=name, folder=name, pipeline=FolderPipeline,
            title=name, callback=qb5200_detail_task, headers={'Referer': spider.url},
            category=DETAIL
        )
當拋出新的自定義Request
后,管理器會將新的請求扔進Redis
中,從而在下次的循環中彈出。
索引頁的Callback
創建完了,新生成的請求需要詳情頁的Callback
來決定之后的請求走向。
同樣以第一本小說為例子,寫下如下的詳情頁的Callback:
def qb5200_detail_task(response: Response, spider: Request):
    """
    Detail-page callback: yield one TEXT request per chapter link.

    The first ``<a>`` directly inside each ``//table//td`` cell is treated as a
    chapter link. Its text becomes the request title, and a file-system-safe
    variant of that text becomes the folder name.

    :param response: response returned by ``requests``; its body is decoded as GBK
    :param spider: the custom Request being processed; ``spider.url`` is the base
        for relative chapter links and the Referer, ``spider.name`` is passed on
    :return: generator yielding new custom Request objects
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    html = etree.HTML(response.content.decode('gbk'))
    base_url = spider.url
    # Characters that clash with file creation are swapped for full-width
    # look-alikes ('*' becomes 'x').
    unsafe_chars = str.maketrans({'?': '?', '!': '!', '.': '。', '*': 'x'})
    for td_tag in html.xpath('//table//td'):
        links = td_tag.xpath('./a')
        if not links:
            continue
        texts = links[0].xpath('./text()')
        hrefs = links[0].xpath('./@href')
        if not texts or not hrefs:
            # Links without text or href cannot be named or fetched; skip them
            # rather than raising IndexError and losing the remaining chapters.
            continue
        title = texts[0]
        folder = title.translate(unsafe_chars)
        yield Request(
            # urljoin resolves relative hrefs and leaves absolute ones intact.
            urljoin(base_url, hrefs[0]), name=spider.name, folder=folder,
            pipeline=FilePipeline, title=title, callback=qb5200_text_task,
            headers={'Referer': spider.url}, category=TEXT
        )
注意category
設置,這個將影響管理器處理這個請求的方式。TEXT
代表了以文本的方式處理,在管理器中它會這樣處理。
當然很好奇上述拋出的Request
中明明定義了qb5200_text_task
這個Callback
函數,但是沒有調用。這里是因為保持pipeline
存儲的第一個參數是Response
對象,所以將內容的處理放到了pipeline
里。
同時放上最后的qb5200_text_task
的代碼:
def qb5200_text_task(response: Response, spider: Request):
    """
    Text-page callback: extract the chapter title and body text.

    Unlike the index and detail callbacks, this one yields no new requests; it
    returns the parsed data. A missing ``div#title`` or ``div#content`` gives
    an empty string for that field instead of an UnboundLocalError.

    :param response: response returned by ``requests``; its body is decoded as GBK
    :param spider: the custom Request being processed (unused here, kept so the
        signature matches the other callbacks)
    :return: dict with ``"title"`` and ``"content"`` string values
    """
    html = etree.HTML(response.content.decode('gbk'))

    title = ''
    title_nodes = html.xpath('//div[@id="title"]')
    if title_nodes:
        title_texts = title_nodes[0].xpath('./text()')
        if title_texts:
            title = title_texts[0]

    content = ''
    content_nodes = html.xpath('//div[@id="content"]')
    if content_nodes:
        # string(.) concatenates all descendant text of the content div.
        content = content_nodes[0].xpath('string(.)')

    return {
        "title": str(title),
        "content": str(content)
    }
最后啟動運行初始代碼:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@File : run.py
@Time : 2017/11/7 0007 20:40
@Author : Empty Chan
@Contact : chen19941018@gmail.com
@Description: 運行spider
"""
from downloader import HttpDownloader
from manager import Manager
from pipeline import ConsolePipeline
from request import Request
from tasks import qb5200_index_task
from utils import INDEX
if __name__ == '__main__':
    # Build the initial (seed) request for the index page, hand it to the
    # manager, then start the run loop.
    req = Request(
        "http://www.qb5200.org/list/3.html", name='qb5200', category=INDEX,
        pipeline=ConsolePipeline, callback=qb5200_index_task,
        downloader=HttpDownloader,
    )
    instance = Manager(req)
    instance.run()
結果展示:
Callback全部代碼
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@File : tasks.py
@Time : 2017/10/21 0021 19:27
@Author : Empty Chan
@Contact : chen19941018@gmail.com
@Description: Declares the tasks that run as callbacks. Every callback must accept
exactly two parameters: the response returned by requests, and the custom spider request.
"""
import json
import click
from requests import Response
from lxml import etree
import re
import abc
from log_util import Log
from request import Request
from pipeline import FolderPipeline, ConsolePipeline, FilePipeline
from pipeline import FilePipeline
from utils import TEXT, IMAGE, VIDEO, INDEX, NEXT, DETAIL
# NOTE(review): not referenced by any callback visible in this listing. Because
# the leading `.*` is greedy, group 1 captures only the final digit on the
# matched line, and re.MULTILINE has no effect without `^`/`$` anchors.
# Confirm this is intended before relying on it.
qb5200_index_pat = re.compile(r'.*(\d+).*', re.MULTILINE)
# Logger created with the name "qb5200"; `Log` comes from the project's log_util
# module. NOTE(review): also unused by the callbacks visible here.
qb5200_logger = Log("qb5200")
def qb5200_index_task(response: Response, spider: Request):
    """
    Index-page callback: yield a DETAIL request for the novel listed on the page.

    Only the first two ``<tr>`` rows under ``div#content`` are inspected. Per the
    original author's note, the first row is the table header and the second is
    the novel, so at most one novel is requested.

    :param response: response returned by ``requests``; its body is decoded as GBK
    :param spider: the custom Request being processed; ``spider.url`` is sent as
        the Referer of every request produced here
    :return: generator yielding new custom Request objects
    """
    html = etree.HTML(response.content.decode('gbk'))
    rows = html.xpath('//div[@id="content"]//tr')
    for row in rows[:2]:
        cells = row.xpath('./td[@class="odd"]')
        if not cells:
            # Rows without an "odd" cell (e.g. the header row) carry no novel link.
            continue
        names = cells[0].xpath('./a/text()')
        hrefs = cells[0].xpath('./a/@href')
        if not names or not hrefs:
            # A cell without a usable link is skipped instead of raising
            # IndexError and aborting the whole generator.
            continue
        name = names[0]
        yield Request(
            hrefs[0], name=name, folder=name, pipeline=FolderPipeline,
            title=name, callback=qb5200_detail_task, headers={'Referer': spider.url},
            category=DETAIL
        )
def qb5200_detail_task(response: Response, spider: Request):
    """
    Detail-page callback: yield one TEXT request per chapter link.

    The first ``<a>`` directly inside each ``//table//td`` cell is treated as a
    chapter link. Its text becomes the request title, and a file-system-safe
    variant of that text becomes the folder name.

    :param response: response returned by ``requests``; its body is decoded as GBK
    :param spider: the custom Request being processed; ``spider.url`` is the base
        for relative chapter links and the Referer, ``spider.name`` is passed on
    :return: generator yielding new custom Request objects
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    html = etree.HTML(response.content.decode('gbk'))
    base_url = spider.url
    # Characters that clash with file creation are swapped for full-width
    # look-alikes ('*' becomes 'x').
    unsafe_chars = str.maketrans({'?': '?', '!': '!', '.': '。', '*': 'x'})
    for td_tag in html.xpath('//table//td'):
        links = td_tag.xpath('./a')
        if not links:
            continue
        texts = links[0].xpath('./text()')
        hrefs = links[0].xpath('./@href')
        if not texts or not hrefs:
            # Links without text or href cannot be named or fetched; skip them
            # rather than raising IndexError and losing the remaining chapters.
            continue
        title = texts[0]
        folder = title.translate(unsafe_chars)
        yield Request(
            # urljoin resolves relative hrefs and leaves absolute ones intact.
            urljoin(base_url, hrefs[0]), name=spider.name, folder=folder,
            pipeline=FilePipeline, title=title, callback=qb5200_text_task,
            headers={'Referer': spider.url}, category=TEXT
        )
def qb5200_text_task(response: Response, spider: Request):
    """
    Text-page callback: extract the chapter title and body text.

    Unlike the index and detail callbacks, this one yields no new requests; it
    returns the parsed data. A missing ``div#title`` or ``div#content`` gives
    an empty string for that field instead of an UnboundLocalError.

    :param response: response returned by ``requests``; its body is decoded as GBK
    :param spider: the custom Request being processed (unused here, kept so the
        signature matches the other callbacks)
    :return: dict with ``"title"`` and ``"content"`` string values
    """
    html = etree.HTML(response.content.decode('gbk'))

    title = ''
    title_nodes = html.xpath('//div[@id="title"]')
    if title_nodes:
        title_texts = title_nodes[0].xpath('./text()')
        if title_texts:
            title = title_texts[0]

    content = ''
    content_nodes = html.xpath('//div[@id="content"]')
    if content_nodes:
        # string(.) concatenates all descendant text of the content div.
        content = content_nodes[0].xpath('string(.)')

    return {
        "title": str(title),
        "content": str(content)
    }
有些東西寫得不好,算是很基礎的東西吧,希望后期能夠完善一下。感謝大家閱讀!!!
放上Github地址。
大家下回見~~