運行環境:
* Python 2.7.12
* Scrapy 1.2.2
* Mac OS X 10.10.3 Yosemite
繼續爬取Scrapy 1.2.2文檔提供的練習網址:http://quotes.toscrape.com
可以暫時不用考慮爬蟲被封的情況,用于初級爬蟲練習。
目標
多級頁面爬取時,在什么位置yield items是個問題,結論是可以放入子頁面的爬取時yield items。但是要記住scrapy的自動去重。
最終代碼
因為本次實驗內容較多,因此先給出最終的代碼。
items.py聲明items
增加聲明子頁面的抓取內容。
import scrapy
class QuotesItem(scrapy.Item):
    """Item holding one quote together with details about its author.

    ``quote``, ``author`` and ``tags`` are filled from the quote listing
    page; the ``author_*`` fields are filled from the author's detail page.
    """

    # Filled from the quote listing page.
    quote = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
    # Filled from the author's detail page.
    author_born_date = scrapy.Field()
    author_born_location = scrapy.Field()
    author_description = scrapy.Field()
    # Absolute URL of the author's detail page.
    author_full_url = scrapy.Field()
爬蟲文件
需要進行以下關鍵內容改寫:
- 引入items.py中的類
- 更改爬蟲名
- 把作者的介紹頁面的鏈接(author_full_url)放入items中。
- 需要把item的元數據傳入子頁面的request中:
meta={'item':item}
- 需要把Scrapy的自動去重機制關掉:
callback=self.parse_author,dont_filter=True
最終代碼如下:
import scrapy
from quotes_2.items import QuotesItem
class QuotesSpider(scrapy.Spider):
    """Crawl quotes.toscrape.com and attach author details to every quote.

    ``parse`` builds a partially filled ``QuotesItem`` per quote and passes
    it to ``parse_author`` through ``Request.meta``; the completed item is
    yielded from ``parse_author``.
    """

    name = 'quotes_2_6'
    start_urls = [
        'http://quotes.toscrape.com',
    ]
    allowed_domains = [
        'toscrape.com',
    ]

    def parse(self, response):
        """Request the author page for each quote, then follow pagination.

        Yields one ``scrapy.Request`` per quote (carrying the item in
        ``meta['item']``) and, if present, a request for the next page.
        """
        for quote in response.css('div.quote'):
            item = QuotesItem()
            item['quote'] = quote.css('span.text::text').extract_first()
            item['author'] = quote.css('small.author::text').extract_first()
            item['tags'] = quote.css('div.tags a.tag::text').extract()

            # Query relative to this quote. Querying the whole response
            # would return the first author link on the page every time.
            author_page = quote.css('small.author+a::attr(href)').extract_first()
            if author_page is None:
                # No author link to follow: emit the quote without details.
                yield item
                continue

            item['author_full_url'] = response.urljoin(author_page)
            # dont_filter=True: several quotes share one author URL, and the
            # duplicate filter would otherwise drop all but the first request
            # (and with it the items attached to the dropped requests).
            yield scrapy.Request(
                url=item['author_full_url'],
                meta={'item': item},
                callback=self.parse_author,
                dont_filter=True,
            )

        next_page = response.css('li.next a::attr("href")').extract_first()
        if next_page is not None:
            next_full_url = response.urljoin(next_page)
            yield scrapy.Request(next_full_url, callback=self.parse)

    def parse_author(self, response):
        """Fill in the author fields of the item carried in ``response.meta``.

        Yields the completed item.
        """
        item = response.meta['item']
        item['author_born_date'] = response.css('.author-born-date::text').extract_first()
        item['author_born_location'] = response.css('.author-born-location::text').extract_first()
        item['author_description'] = response.css('.author-description::text').extract_first()
        yield item
實驗內容記錄
步驟1:聲明items
首先,我們針對原有的爬蟲文件進行改寫。
爬蟲文件在改寫前的原始文件如下:
import scrapy
class QuotesSpider(scrapy.Spider):
    """Baseline spider: yields quotes and author details as separate dicts."""

    name = 'quotes_2_3'
    start_urls = [
        'http://quotes.toscrape.com',
    ]
    allowed_domains = [
        'toscrape.com',
    ]

    def parse(self, response):
        """Yield a dict per quote and a request for that quote's author page."""
        for quote in response.css('div.quote'):
            yield {
                'quote': quote.css('span.text::text').extract_first(),
                'author': quote.css('small.author::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }
            # Query relative to this quote so each quote follows its own
            # author link rather than the first one on the page.
            author_page = quote.css('small.author+a::attr(href)').extract_first()
            author_full_url = response.urljoin(author_page)
            yield scrapy.Request(author_full_url, callback=self.parse_author)

    def parse_author(self, response):
        """Yield a dict with the details scraped from an author page."""
        yield {
            'author': response.css('.author-title::text').extract_first(),
            'author_born_date': response.css('.author-born-date::text').extract_first(),
            'author_born_location': response.css('.author-born-location::text').extract_first(),
            'author_description': response.css('.author-description::text').extract_first(),
        }
把子頁面下要爬取的內容也聲明到items中。
import scrapy
class QuotesItem(scrapy.Item):
    """Item holding one quote plus the author details from the sub-page."""

    # Fields taken from the quote listing page.
    quote = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
    # For author introductions
    author_born_date = scrapy.Field()
    author_born_location = scrapy.Field()
    author_description = scrapy.Field()
步驟2:爬蟲
在子頁面再yield item
import scrapy
from quotes_2.items import QuotesItem
class QuotesSpider(scrapy.Spider):
    """Experiment: yield the item only from the author sub-page callback."""

    name = 'quotes_2_5'
    start_urls = [
        'http://quotes.toscrape.com',
    ]
    allowed_domains = [
        'toscrape.com',
    ]

    def parse(self, response):
        """Build an item per quote and pass it to ``parse_author`` via meta."""
        for quote in response.css('div.quote'):
            item = QuotesItem()
            item['quote'] = quote.css('span.text::text').extract_first()
            item['author'] = quote.css('small.author::text').extract_first()
            item['tags'] = quote.css('div.tags a.tag::text').extract()
            # NOTE(review): this queries the whole response, so every quote
            # gets the first author link on the page. All requests then share
            # one URL and the duplicate filter keeps only the first, which is
            # why the recorded output has a single item. quote.css(...) would
            # follow each quote's own author.
            author_page = response.css('small.author+a::attr(href)').extract_first()
            author_full_url = response.urljoin(author_page)
            # Uses the variable assigned above (the snippet previously
            # referenced an undefined, misspelled name here).
            yield scrapy.Request(author_full_url, meta={'item': item}, callback=self.parse_author)

    def parse_author(self, response):
        """Add author fields to the item from ``response.meta`` and yield it."""
        item = response.meta['item']
        item['author_born_date'] = response.css('.author-born-date::text').extract_first()
        item['author_born_location'] = response.css('.author-born-location::text').extract_first()
        # NOTE(review): reuses the born-location selector, so the description
        # duplicates the location (as the recorded output shows);
        # '.author-description::text' is probably what was intended.
        item['author_description'] = response.css('.author-born-location::text').extract_first()
        yield item
結果
[
{"author_description": "in Ulm, Germany", "author": "Albert Einstein", "quote": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d", "tags": ["change", "deep-thoughts", "thinking", "world"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany"}
]
parse()函數以及子頁面同時yield item
import scrapy
from quotes_2.items import QuotesItem
class QuotesSpider(scrapy.Spider):
    """Experiment: yield the item from ``parse`` and again from the sub-page."""

    name = 'quotes_2_5'
    start_urls = [
        'http://quotes.toscrape.com',
    ]
    allowed_domains = [
        'toscrape.com',
    ]

    def parse(self, response):
        """Yield each quote item, then request the author page for it."""
        for quote in response.css('div.quote'):
            item = QuotesItem()
            item['quote'] = quote.css('span.text::text').extract_first()
            item['author'] = quote.css('small.author::text').extract_first()
            item['tags'] = quote.css('div.tags a.tag::text').extract()
            # First yield: the quote-only record.
            yield item
            # NOTE(review): this queries the whole response, so every quote
            # gets the first author link on the page. The duplicate filter
            # then keeps only one author request, matching the single
            # author-enriched record in the recorded output.
            author_page = response.css('small.author+a::attr(href)').extract_first()
            author_full_url = response.urljoin(author_page)
            # Uses the variable assigned above (the snippet previously
            # referenced an undefined, misspelled name here).
            yield scrapy.Request(author_full_url, meta={'item': item}, callback=self.parse_author)

    def parse_author(self, response):
        """Add author fields to the item from ``response.meta`` and yield it."""
        item = response.meta['item']
        item['author_born_date'] = response.css('.author-born-date::text').extract_first()
        item['author_born_location'] = response.css('.author-born-location::text').extract_first()
        # NOTE(review): reuses the born-location selector, so the description
        # duplicates the location; '.author-description::text' is probably
        # what was intended.
        item['author_description'] = response.css('.author-born-location::text').extract_first()
        yield item
json文件結果
[
{"quote": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d", "author": "Albert Einstein", "tags": ["change", "deep-thoughts", "thinking", "world"]},
{"quote": "\u201cIt is our choices, Harry, that show what we truly are, far more than our abilities.\u201d", "author": "J.K. Rowling", "tags": ["abilities", "choices"]},
{"quote": "\u201cThere are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.\u201d", "author": "Albert Einstein", "tags": ["inspirational", "life", "live", "miracle", "miracles"]},
{"quote": "\u201cThe person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.\u201d", "author": "Jane Austen", "tags": ["aliteracy", "books", "classic", "humor"]},
{"quote": "\u201cImperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.\u201d", "author": "Marilyn Monroe", "tags": ["be-yourself", "inspirational"]},
{"quote": "\u201cTry not to become a man of success. Rather become a man of value.\u201d", "author": "Albert Einstein", "tags": ["adulthood", "success", "value"]},
{"quote": "\u201cIt is better to be hated for what you are than to be loved for what you are not.\u201d", "author": "Andr\u00e9 Gide", "tags": ["life", "love"]},
{"quote": "\u201cI have not failed. I've just found 10,000 ways that won't work.\u201d", "author": "Thomas A. Edison", "tags": ["edison", "failure", "inspirational", "paraphrased"]},
{"quote": "\u201cA woman is like a tea bag; you never know how strong it is until it's in hot water.\u201d", "author": "Eleanor Roosevelt", "tags": ["misattributed-eleanor-roosevelt"]},
{"quote": "\u201cA day without sunshine is like, you know, night.\u201d", "author": "Steve Martin", "tags": ["humor", "obvious", "simile"]},
{"author_description": "in Ulm, Germany", "author": "Albert Einstein", "quote": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d", "tags": ["change", "deep-thoughts", "thinking", "world"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany"}
]
把作者介紹頁面放入items中
如下:
def parse(self, response):
    """Yield each quote item, then store the author URL on it and request it.

    The same item object travels in ``meta['item']`` so that the
    ``parse_author`` callback can extend it.
    """
    for quote in response.css('div.quote'):
        item = QuotesItem()
        item['quote'] = quote.css('span.text::text').extract_first()
        item['author'] = quote.css('small.author::text').extract_first()
        item['tags'] = quote.css('div.tags a.tag::text').extract()
        # Yielded before the URL is stored on the item.
        yield item
        # NOTE(review): this queries the whole response, so every quote gets
        # the first author link on the page; quote.css(...) would follow
        # each quote's own author.
        author_page = response.css('small.author+a::attr(href)').extract_first()
        # NOTE(review): the key is misspelled ('authro'). It is kept because
        # the recorded output uses this key; QuotesItem must declare a field
        # with exactly this name - verify against items.py.
        item['authro_full_url'] = response.urljoin(author_page)
        yield scrapy.Request(
            url=item['authro_full_url'],
            meta={'item': item},
            callback=self.parse_author,
        )
得到json文件,仍然只有一個作者。(原因:author_page是用response.css在整個頁面上選取的,每條名言得到的都是頁面上第一位作者的鏈接,重複的請求又被Scrapy的去重機制過濾掉了。)
[
{"quote": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d", "author": "Albert Einstein", "tags": ["change", "deep-thoughts", "thinking", "world"]},
{"quote": "\u201cIt is our choices, Harry, that show what we truly are, far more than our abilities.\u201d", "author": "J.K. Rowling", "tags": ["abilities", "choices"]},
{"quote": "\u201cThere are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.\u201d", "author": "Albert Einstein", "tags": ["inspirational", "life", "live", "miracle", "miracles"]},
{"quote": "\u201cThe person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.\u201d", "author": "Jane Austen", "tags": ["aliteracy", "books", "classic", "humor"]},
{"quote": "\u201cImperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.\u201d", "author": "Marilyn Monroe", "tags": ["be-yourself", "inspirational"]},
{"quote": "\u201cTry not to become a man of success. Rather become a man of value.\u201d", "author": "Albert Einstein", "tags": ["adulthood", "success", "value"]},
{"quote": "\u201cIt is better to be hated for what you are than to be loved for what you are not.\u201d", "author": "Andr\u00e9 Gide", "tags": ["life", "love"]},
{"quote": "\u201cI have not failed. I've just found 10,000 ways that won't work.\u201d", "author": "Thomas A. Edison", "tags": ["edison", "failure", "inspirational", "paraphrased"]},
{"quote": "\u201cA woman is like a tea bag; you never know how strong it is until it's in hot water.\u201d", "author": "Eleanor Roosevelt", "tags": ["misattributed-eleanor-roosevelt"]},
{"quote": "\u201cA day without sunshine is like, you know, night.\u201d", "author": "Steve Martin", "tags": ["humor", "obvious", "simile"]},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Albert Einstein", "quote": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d", "tags": ["change", "deep-thoughts", "thinking", "world"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"}
]
把作者頁面先放進items再yield
def parse(self, response):
    """Store the author URL on each quote item, yield it, then request the URL.

    The same item object travels in ``meta['item']`` so that the
    ``parse_author`` callback can extend it.
    """
    for quote in response.css('div.quote'):
        item = QuotesItem()
        item['quote'] = quote.css('span.text::text').extract_first()
        item['author'] = quote.css('small.author::text').extract_first()
        item['tags'] = quote.css('div.tags a.tag::text').extract()
        # NOTE(review): this queries the whole response, so every quote gets
        # the first author link on the page (the recorded output shows the
        # same URL on every record); quote.css(...) would be per-quote.
        author_page = response.css('small.author+a::attr(href)').extract_first()
        # NOTE(review): the key is misspelled ('authro'). It is kept because
        # the recorded output uses this key; QuotesItem must declare a field
        # with exactly this name - verify against items.py.
        item['authro_full_url'] = response.urljoin(author_page)
        yield item
        yield scrapy.Request(
            url=item['authro_full_url'],
            meta={'item': item},
            callback=self.parse_author,
        )
def parse_author(self, response):
    """Add author fields to the item carried in ``response.meta`` and yield it."""
    item = response.meta['item']
    item['author_born_date'] = response.css('.author-born-date::text').extract_first()
    item['author_born_location'] = response.css('.author-born-location::text').extract_first()
    # NOTE(review): reuses the born-location selector, so the description
    # duplicates the location (as the recorded output shows);
    # '.author-description::text' is probably what was intended.
    item['author_description'] = response.css('.author-born-location::text').extract_first()
    yield item
結果
[
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "quote": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d", "author": "Albert Einstein", "tags": ["change", "deep-thoughts", "thinking", "world"]},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "quote": "\u201cIt is our choices, Harry, that show what we truly are, far more than our abilities.\u201d", "author": "J.K. Rowling", "tags": ["abilities", "choices"]},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "quote": "\u201cThere are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.\u201d", "author": "Albert Einstein", "tags": ["inspirational", "life", "live", "miracle", "miracles"]},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "quote": "\u201cThe person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.\u201d", "author": "Jane Austen", "tags": ["aliteracy", "books", "classic", "humor"]},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "quote": "\u201cImperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.\u201d", "author": "Marilyn Monroe", "tags": ["be-yourself", "inspirational"]},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "quote": "\u201cTry not to become a man of success. Rather become a man of value.\u201d", "author": "Albert Einstein", "tags": ["adulthood", "success", "value"]},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "quote": "\u201cIt is better to be hated for what you are than to be loved for what you are not.\u201d", "author": "Andr\u00e9 Gide", "tags": ["life", "love"]},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "quote": "\u201cI have not failed. I've just found 10,000 ways that won't work.\u201d", "author": "Thomas A. Edison", "tags": ["edison", "failure", "inspirational", "paraphrased"]},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "quote": "\u201cA woman is like a tea bag; you never know how strong it is until it's in hot water.\u201d", "author": "Eleanor Roosevelt", "tags": ["misattributed-eleanor-roosevelt"]},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "quote": "\u201cA day without sunshine is like, you know, night.\u201d", "author": "Steve Martin", "tags": ["humor", "obvious", "simile"]},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Albert Einstein", "quote": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d", "tags": ["change", "deep-thoughts", "thinking", "world"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"}
]
交給子目錄yield
def parse(self, response):
    """Build an item per quote and leave yielding it to ``parse_author``.

    The item is passed to the callback in ``meta['item']``; nothing is
    yielded as an item from this method.
    """
    for quote in response.css('div.quote'):
        item = QuotesItem()
        item['quote'] = quote.css('span.text::text').extract_first()
        item['author'] = quote.css('small.author::text').extract_first()
        item['tags'] = quote.css('div.tags a.tag::text').extract()
        # NOTE(review): this queries the whole response, so every quote gets
        # the first author link on the page. All requests share one URL and
        # the duplicate filter keeps only the first, hence the single record
        # in the recorded output.
        author_page = response.css('small.author+a::attr(href)').extract_first()
        # NOTE(review): the key is misspelled ('authro'). It is kept because
        # the recorded output uses this key; QuotesItem must declare a field
        # with exactly this name - verify against items.py.
        item['authro_full_url'] = response.urljoin(author_page)
        yield scrapy.Request(
            url=item['authro_full_url'],
            meta={'item': item},
            callback=self.parse_author,
        )
def parse_author(self, response):
    """Add author fields to the item carried in ``response.meta`` and yield it."""
    item = response.meta['item']
    item['author_born_date'] = response.css('.author-born-date::text').extract_first()
    item['author_born_location'] = response.css('.author-born-location::text').extract_first()
    # NOTE(review): reuses the born-location selector, so the description
    # duplicates the location (as the recorded output shows);
    # '.author-description::text' is probably what was intended.
    item['author_description'] = response.css('.author-born-location::text').extract_first()
    yield item
最後只有一個
[
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Albert Einstein", "quote": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d", "tags": ["change", "deep-thoughts", "thinking", "world"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"}
]
關閉Scrapy的去重機制
加上參數,不要過濾
def parse(self, response):
    """Request the author page for every quote with duplicate filtering off.

    The item is passed to ``parse_author`` in ``meta['item']`` and is
    yielded only from that callback.
    """
    for quote in response.css('div.quote'):
        item = QuotesItem()
        item['quote'] = quote.css('span.text::text').extract_first()
        item['author'] = quote.css('small.author::text').extract_first()
        item['tags'] = quote.css('div.tags a.tag::text').extract()
        # NOTE(review): this queries the whole response, so every quote gets
        # the first author link on the page. That is why every record in the
        # recorded output carries the same author URL and details, whatever
        # its 'author' field says. quote.css(...) would be per-quote.
        author_page = response.css('small.author+a::attr(href)').extract_first()
        # NOTE(review): the key is misspelled ('authro'). It is kept because
        # the recorded output uses this key; QuotesItem must declare a field
        # with exactly this name - verify against items.py.
        item['authro_full_url'] = response.urljoin(author_page)
        # dont_filter=True lets requests for an already-seen URL through, so
        # each quote's item reaches parse_author.
        yield scrapy.Request(
            url=item['authro_full_url'],
            meta={'item': item},
            callback=self.parse_author,
            dont_filter=True,
        )
def parse_author(self, response):
    """Add author fields to the item carried in ``response.meta`` and yield it."""
    item = response.meta['item']
    item['author_born_date'] = response.css('.author-born-date::text').extract_first()
    item['author_born_location'] = response.css('.author-born-location::text').extract_first()
    # NOTE(review): reuses the born-location selector, so the description
    # duplicates the location (as the recorded output shows);
    # '.author-description::text' is probably what was intended.
    item['author_description'] = response.css('.author-born-location::text').extract_first()
    yield item
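最後結果json文件(注意:所有條目的作者信息都是Albert Einstein的,因為author_page是用response.css在整個頁面上選取的,得到的總是頁面上第一位作者的鏈接;改用quote.css才能得到每條名言對應的作者。)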
[
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Albert Einstein", "quote": "\u201cTry not to become a man of success. Rather become a man of value.\u201d", "tags": ["adulthood", "success", "value"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Marilyn Monroe", "quote": "\u201cImperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.\u201d", "tags": ["be-yourself", "inspirational"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Steve Martin", "quote": "\u201cA day without sunshine is like, you know, night.\u201d", "tags": ["humor", "obvious", "simile"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Andr\u00e9 Gide", "quote": "\u201cIt is better to be hated for what you are than to be loved for what you are not.\u201d", "tags": ["life", "love"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Albert Einstein", "quote": "\u201cThere are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.\u201d", "tags": ["inspirational", "life", "live", "miracle", "miracles"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "J.K. Rowling", "quote": "\u201cIt is our choices, Harry, that show what we truly are, far more than our abilities.\u201d", "tags": ["abilities", "choices"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Jane Austen", "quote": "\u201cThe person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.\u201d", "tags": ["aliteracy", "books", "classic", "humor"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Albert Einstein", "quote": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d", "tags": ["change", "deep-thoughts", "thinking", "world"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Thomas A. Edison", "quote": "\u201cI have not failed. I've just found 10,000 ways that won't work.\u201d", "tags": ["edison", "failure", "inspirational", "paraphrased"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Eleanor Roosevelt", "quote": "\u201cA woman is like a tea bag; you never know how strong it is until it's in hot water.\u201d", "tags": ["misattributed-eleanor-roosevelt"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"}
]