Installing Scrapy

To install Scrapy on Windows 10, first install Anaconda and then simply run conda install scrapy; Anaconda resolves the libraries and files the installation needs, and Scrapy installs successfully.
Crawl the content of http://quotes.toscrape.com/, taking care to follow the link to the next page.
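Before writing the spider, the project skeleton has to exist. A typical sequence (assuming the project is named quotetutorial, matching the import in the spider below):

scrapy startproject quotetutorial
cd quotetutorial
scrapy genspider quotes quotes.toscrape.com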
# quotes.py
# -*- coding: utf-8 -*-
import scrapy

from quotetutorial.items import QuoteItem


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        quotes = response.css(".quote")
        for quote in quotes:
            item = QuoteItem()
            text = quote.css('.text::text').extract_first()
            author = quote.css('.author::text').extract_first()
            tags = quote.css('.tags .tag::text').extract()
            item['text'] = text
            item['author'] = author
            item['tags'] = tags
            yield item

        # extract the next page's url with a CSS selector
        # (named next_page so it does not shadow the builtin next)
        next_page = response.css(".pager .next a::attr(href)").extract_first()
        # the extracted url is relative; urljoin() builds an absolute url from it;
        # on the last page there is no next link, so guard against None
        if next_page is not None:
            url = response.urljoin(next_page)
            # set parse() itself as the callback, so each next page is crawled recursively
            yield scrapy.Request(url=url, callback=self.parse)
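With the spider in place, run it from the project root; Scrapy's feed export can also dump the scraped items straight to a file:

scrapy crawl quotes
scrapy crawl quotes -o quotes.json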
Define the Item that structures the scraped data.
# items.py
import scrapy


class QuoteItem(scrapy.Item):
    # define the fields for your item here like:
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
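A scrapy.Item behaves like a dict restricted to its declared fields, which is why the spider can assign item['text'] and the pipeline below can call dict(item). A quick sketch with made-up values:

from quotetutorial.items import QuoteItem

item = QuoteItem()
item['text'] = 'A day without sunshine is like, you know, night.'
item['author'] = 'Steve Martin'
item['tags'] = ['humor', 'simile']
print(dict(item))       # a plain dict, ready to hand to MongoDB
# item['born'] = 1945   # would raise KeyError: 'born' is not a declared field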
Define the pipelines that process the data: TextPipeline truncates long quotes, and MongoPipeline stores items in MongoDB.
# pipelines.py
import pymongo

from scrapy.exceptions import DropItem


class TextPipeline(object):
    def __init__(self):
        # quotes longer than this many characters get truncated
        self.limit = 50

    def process_item(self, item, spider):
        if item['text']:
            if len(item['text']) > self.limit:
                item['text'] = item['text'][0:self.limit].rstrip() + '...'
            return item
        else:
            # DropItem must be raised, not returned, to discard the item
            raise DropItem('Missing Text')
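TextPipeline can be sanity-checked on its own: process_item never touches the spider argument, so passing None is enough (illustrative values only):

pipeline = TextPipeline()
fake_item = {'text': 'x' * 80, 'author': 'someone', 'tags': []}
processed = pipeline.process_item(fake_item, None)
print(len(processed['text']))   # 53: truncated to 50 chars plus '...'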
class MongoPipeline(object):
    def __init__(self, mongo_url, mongo_db):
        # store the connection settings as instance variables
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db

    # pull the MONGO_URL and MONGO_DB values out of the project settings
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_url=crawler.settings.get('MONGO_URL'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    # runs once when the spider opens
    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]

    # store each item in MongoDB, one collection per item class
    def process_item(self, item, spider):
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))
        return item

    # runs once when the spider closes
    def close_spider(self, spider):
        self.client.close()
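After a crawl finishes you can confirm what landed in MongoDB directly with pymongo; a minimal check, assuming the MONGO_URL and MONGO_DB values from the settings below (the collection name QuoteItem comes from item.__class__.__name__):

import pymongo

client = pymongo.MongoClient('localhost')
db = client['quotestutorial']
print(db['QuoteItem'].count_documents({}))
for doc in db['QuoteItem'].find().limit(3):
    print(doc['author'], doc['text'])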
Variables defined in the settings file:
# settings.py
MONGO_URL = 'localhost'
MONGO_DB = 'quotestutorial'
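MONGO_URL and MONGO_DB alone do not activate anything: Scrapy only runs pipelines listed in ITEM_PIPELINES, where lower numbers run earlier. settings.py therefore also needs the two classes registered, e.g.:

ITEM_PIPELINES = {
    'quotetutorial.pipelines.TextPipeline': 300,
    'quotetutorial.pipelines.MongoPipeline': 400,
}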