爬取目標:糗事百科全部文章的內容和網址。
直接上代碼:
items.py
# Item produced by the Qiushibaike auto-crawl spider.
class QSBKAutoItem(scrapy.Item):
    """Container for one scraped Qiushibaike article."""

    # Text taken from the article's content block.
    content = scrapy.Field()
    # URL of the article page.
    link = scrapy.Field()
爬蟲文件
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from testscrapy01.items import QSBKAutoItem
class QsbkautoSpider(CrawlSpider):
    """Crawl qiushibaike.com and emit one QSBKAutoItem per article page.

    Starting from the site root, every link whose URL matches ``article`` is
    followed and handed to :meth:`parse_item`.
    """

    name = 'qsbkauto'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['http://qiushibaike.com/']

    rules = (
        # Link-extraction rule. The r prefix makes the pattern a raw string,
        # so backslashes in it are not treated as escapes.
        # follow=True: keep extracting links from each crawled page and
        # continue crawling from there.
        Rule(LinkExtractor(allow=r'article'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Extract the article text and its URL from an article page.

        Yields:
            QSBKAutoItem with ``content`` and ``link`` set. Nothing is
            yielded when the page has no ``div.content`` text node.
        """
        # The XPath must start with "//" (search the whole document).
        content = response.xpath("//div[@class='content']/text()").extract_first()
        if content is None:
            # A URL can match 'article' without being an article page; skip it
            # instead of failing with IndexError on an empty result list.
            self.logger.warning("No content block found on %s", response.url)
            return

        item = QSBKAutoItem()
        item["content"] = content
        # Prefer the canonical URL; fall back to the URL that was fetched.
        canonical = response.xpath("//link[@rel='canonical']/@href").extract_first()
        item["link"] = canonical or response.url

        print("內(nèi)容:", item["content"])
        print("鏈接:", item["link"])
        print("*************************************************************")
        yield item
對以上代碼的改進:
# coding=utf-8
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from testscrapy01.items import QSBKAutoItem
import sys
# Python 2 only: force the implicit str <-> unicode conversion codec to UTF-8.
# `reload` is not a builtin on Python 3 and `sys.setdefaultencoding` does not
# exist there, so the hack is skipped to keep the module importable.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf-8')
class QsbkautoSpider(CrawlSpider):
    """Crawl qiushibaike.com list pages and emit one item per article page.

    Two rules are used: one follows pagination links without parsing them,
    the other follows article links and passes them to :meth:`parse_item`.
    """

    name = 'qsbkauto1'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['http://www.qiushibaike.com/']

    rules = (
        # Pagination links (presumably of the form "page/<n>/?s=<id>" --
        # confirm against the live site). The "?" is escaped so it matches a
        # literal query-string marker rather than acting as a quantifier.
        Rule(LinkExtractor(allow=r'page/\d*/\?s=\d*'), follow=True),
        # Article pages: parse them and keep following their links.
        Rule(LinkExtractor(allow=(r'article/.*')), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Extract the article text and record the URL it was fetched from.

        Yields:
            QSBKAutoItem with ``content`` and ``link`` set. Nothing is
            yielded when the page has no ``div.content`` text node.
        """
        # The XPath must start with "//" (search the whole document).
        content = response.xpath("//div[@class='content']/text()").extract_first()
        if content is None:
            # Skip non-article pages instead of failing with IndexError.
            self.logger.warning("No content block found on %s", response.url)
            return

        item = QSBKAutoItem()
        item["content"] = content
        item["link"] = response.url

        # Unicode literals keep the concatenation valid on Python 2 and 3
        # without calling .decode(), which text strings lack on Python 3.
        print(u"內(nèi)容:" + item["content"])
        print(u"鏈接:" + item["link"])
        print("*************************************************************")
        yield item