爬取目標:淘寶下某一類目商品的標題、鏈接、原價、優惠促銷價格、評論數等信息(也可進一步爬取詳細評論信息)。
源代碼
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class Taobao01Item(scrapy.Item):
    """Container for the data scraped from one Taobao product page."""

    # Product title.
    title = scrapy.Field()
    # URL of the product detail page.
    link = scrapy.Field()
    # Listed (original) price.
    price = scrapy.Field()
    # Promotional / discounted price.
    price_now = scrapy.Field()
    # Number of comments (reviews) on the product.
    comment = scrapy.Field()
爬蟲文件 tb01.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
import re
from taobao01.items import Taobao01Item
import urllib.request
class Tb01Spider(scrapy.Spider):
    """Crawl Taobao search results for one keyword and scrape each product.

    Flow: ``parse`` schedules the search-result pages, ``goodlist`` pulls the
    product ids out of each result page, and ``good`` scrapes one product
    detail page into a ``Taobao01Item``.
    """

    name = "tb01"
    allowed_domains = ["taobao.com"]
    start_urls = ['http://www.taobao.com/']

    # Product ids as they appear in the search-result page source.
    _NID_PATTERN = re.compile(r'"nid":"(.*?)"')
    # JSONP payload of the comment-count endpoint; the parentheses and braces
    # must be escaped so they match literally instead of acting as regex
    # metacharacters.
    # NOTE(review): assumes the reply looks like jsonp100({"count":N}) --
    # confirm against a live response.
    _COMMENT_COUNT_PATTERN = re.compile(r'jsonp100\(\{"count":(.*?)\}\)')

    @staticmethod
    def _first_text(response, query, default=None):
        """Return the first string matched by XPath *query*, else *default*."""
        matches = response.xpath(query).extract()
        return matches[0] if matches else default

    def parse(self, response):
        """Schedule the first two search-result pages for the keyword.

        Yields:
            One ``Request`` per result page, handled by ``goodlist``.
        """
        key = '零食'
        for i in range(1, 3):
            # The ``s`` query parameter is an offset that advances by 44 per
            # page (presumably the number of results per page).
            url = "https://s.taobao.com/search?q=" + key + "&ie=utf8&s=" + str((i - 1) * 44)
            print("要爬取的url是:" + url)
            yield Request(url=url, callback=self.goodlist)

    def goodlist(self, response):
        """Extract product ids from a result page and request each product.

        Yields:
            One ``Request`` per product id, handled by ``good``; the id is
            forwarded through ``meta["id"]``.
        """
        # Tolerate stray undecodable bytes instead of aborting the whole page.
        body = response.body.decode("utf-8", "ignore")
        for item_id in self._NID_PATTERN.findall(body):
            url = "https://item.taobao.com/item.htm?id=" + str(item_id)
            yield Request(url=url, callback=self.good, meta={"id": item_id})

    def good(self, response):
        """Scrape title, link, price and comment count from a product page.

        Yields:
            A ``Taobao01Item``. Nothing is yielded when no title can be found
            on the page. ``price_now`` is not populated here.
        """
        item_id = response.meta["id"]
        comment_url = "https://rate.taobao.com/detailCount.do?callback=jsonp100&itemId=" + str(item_id)

        # Two page layouts are tried for the title: <h3 class="tb-main-title">
        # first, then <h1 data-spm="1000983">.
        title = self._first_text(response, "//h3[@class='tb-main-title']/text()")
        if title is None:
            title = self._first_text(response, "//h1[@data-spm='1000983']/text()")
        if title is None:
            self.logger.warning("No title found, skipping %s", response.url)
            return

        link = response.url
        # "100" is the placeholder used when the price element is absent.
        # NOTE(review): pages that keep the price under #J_StrPriceModBox
        # (span.tm-price) are not handled and will get this placeholder.
        price = self._first_text(response, "//em[@class='tb-rmb-num']/text()", default="100")

        # NOTE(review): this is a blocking call inside a Scrapy callback;
        # consider chaining a scrapy Request for comment_url instead.
        try:
            with urllib.request.urlopen(comment_url, timeout=10) as reply:
                commentdata = reply.read().decode("utf-8", "ignore")
        except OSError as err:  # URLError / HTTPError / timeouts are OSErrors
            self.logger.warning("Comment count request failed for %s: %s", comment_url, err)
            commentdata = ""

        found = self._COMMENT_COUNT_PATTERN.search(commentdata)
        comment = found.group(1) if found else None

        print("商品url是:" + link)
        print("商品價(jià)格是:" + price)
        print("商品評(píng)論url是:" + comment_url)
        print("返回的評(píng)論字符串是:" + commentdata)
        print("")

        item = Taobao01Item()
        item["title"] = title
        item["link"] = link
        item["price"] = price
        # Left as None when the count could not be fetched or parsed.
        item["comment"] = comment
        yield item
pipelines.py
將爬取到的數據插入到數據庫:略,可參考博文
http://www.reibang.com/p/164f3fda2d1c
(本文未完待續(xù))