Scraping Gank.io (干貨集中營(yíng)) Data (3)
The improved girl-picture spider
1 Project Structure
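Judging from the import used below (from gank.items import GankPageItem) and the stated spider location, the project package is named gank; a standard scrapy startproject layout would then look roughly like this, with gankpage_spider.py being the file added in this post:

gank/
    scrapy.cfg
    gank/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            gankpage_spider.py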
2 Modify the Item
- title: the page title
- images: the girl image URLs
- leftLink: the left navigation link
- rightLink: the right navigation link
# items.py
import scrapy


class GankPageItem(scrapy.Item):
    title = scrapy.Field()
    images = scrapy.Field()
    leftLink = scrapy.Field()
    rightLink = scrapy.Field()
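For reference, once the spider below has filled in these fields, a yielded item serializes to something roughly like the following (all values here are illustrative placeholders, not real crawl output):

{
    'title': 'the page <title> text',
    'images': ['http://.../001.jpg', 'http://.../002.jpg'],
    'leftLink': ['/relative/path/of/the/left/link'],
    'rightLink': []   # empty when the page has no right-hand link
}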
3 Create gankpage_spider.py (under the spiders directory)
# -*- coding: utf-8 -*-
import scrapy

from gank.items import GankPageItem


class GankPageSpider(scrapy.Spider):
    name = "gank"
    allowed_domains = ["gank.io"]
    start_urls = [
        "http://gank.io",
    ]
    url_prefix = "http://gank.io"

    def parse(self, response):
        pageItem = self.extract_item(response)
        yield pageItem
        # If there is a left navigation link, keep crawling to the left
        leftLinks = pageItem['leftLink']
        if len(leftLinks) == 1:
            leftLink = self.url_prefix + leftLinks[0]
            yield scrapy.Request(leftLink, callback=self.to_left_page)
        # If there is a right navigation link, keep crawling to the right
        rightLinks = pageItem['rightLink']
        if len(rightLinks) == 1:
            rightLink = self.url_prefix + rightLinks[0]
            yield scrapy.Request(rightLink, callback=self.to_right_page)

    def extract_item(self, response):
        '''
        Parse a single page into a GankPageItem.
        '''
        selector = scrapy.Selector(response)
        tagTitle = selector.xpath('head/title')
        tagNavDivs = selector.xpath('(//div[contains(@class, "typo")]/div[contains(@class, "container content")]/div[contains(@class, "row")])[1]/div')
        tagPs = selector.xpath('//div[contains(@class, "typo")]/div[contains(@class, "container content")]/div[contains(@class, "outlink")]/p')
        pageItem = GankPageItem()
        # Page title
        pageTitle = tagTitle.xpath('text()').extract()[0]
        pageItem['title'] = pageTitle
        # Girl images
        pageItem['images'] = tagPs.xpath('./img/@src').extract()
        # Navigation links: two divs mean both directions exist,
        # a single div means only the right-hand link exists
        if len(tagNavDivs) == 2:
            pageItem['leftLink'] = tagNavDivs[0].xpath('./p/a/@href').extract()
            pageItem['rightLink'] = tagNavDivs[1].xpath('./p/a/@href').extract()
        else:
            # Always set leftLink so that parse() can read it without a KeyError
            pageItem['leftLink'] = []
            pageItem['rightLink'] = tagNavDivs[0].xpath('./p/a/@href').extract()
        return pageItem

    def to_left_page(self, response):
        '''
        Keep following the left navigation link.
        '''
        pageItem = self.extract_item(response)
        yield pageItem
        leftLinks = pageItem['leftLink']
        if len(leftLinks) == 1:
            leftLink = self.url_prefix + leftLinks[0]
            yield scrapy.Request(leftLink, callback=self.to_left_page)

    def to_right_page(self, response):
        '''
        Keep following the right navigation link.
        '''
        pageItem = self.extract_item(response)
        yield pageItem
        rightLinks = pageItem['rightLink']
        if len(rightLinks) == 1:
            rightLink = self.url_prefix + rightLinks[0]
            yield scrapy.Request(rightLink, callback=self.to_right_page)
4 Run the Spider
$> scrapy crawl gank
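To keep the crawled results, Scrapy's built-in feed exports can write everything the spider yields to a file; the output filename below is just an example:

$> scrapy crawl gank -o gank_pages.json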