# encoding=utf8
import scrapy
import time
from scrapyLuntan.items import ScrapyluntanItem
import sys
# Python 2 only: site.py deletes sys.setdefaultencoding at startup, so the
# module has to be reloaded to get it back. This makes implicit str<->unicode
# conversions use UTF-8 instead of ASCII. Python 3 has no `reload` builtin and
# is UTF-8 by default, so the hack is skipped there.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding("utf-8")

# Module-level scratch list; nothing in the visible code reads or writes it.
lis = []
class LunTan(scrapy.Spider):
    """Spider that records post timestamps from pages linked on www.mop.com.

    ``parse`` walks the ``li.mop-item-a`` entries of the start page and
    schedules one request per link. ``res_detail`` reads the date text of
    the fetched page, converts it to a Unix timestamp, appends it to
    ``./time.txt`` and yields it as a ``ScrapyluntanItem``.
    """

    # Unique identifier of this spider; every spider in a project needs a
    # different name.
    name = 'luntan'

    # Domains the crawl is restricted to; requests outside them are dropped
    # by Scrapy's offsite filtering. Entries must be bare domain names.
    # NOTE(review): the original spelled this `allowd_domains` and stored the
    # URL 'http://dzh2.mop.com/', so Scrapy ignored it and no filtering took
    # place. 'mop.com' covers both www.mop.com and dzh2.mop.com - confirm
    # that no detail links point outside mop.com.
    allowed_domains = ['mop.com']

    # URLs the crawl starts from; each response is handed to ``parse``.
    start_urls = ('http://www.mop.com/',)

    # Prefix stripped from list links so that the remainder is requested
    # directly (presumably the real page URL carried in the fragment).
    _RLINK_PREFIX = 'http://dzh2.mop.com/dzh_index.html#rlink='
    _TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

    def parse(self, response):
        """Schedule a detail request for every linked list item.

        :param response: response for one of ``start_urls``.
        :returns: generator of ``scrapy.Request`` objects whose callback is
            ``res_detail``; the requested URL is also passed along in
            ``meta['a_href']``.
        """
        li_list = response.xpath("//li[@class='mop-item-a']")
        print(len(li_list))
        for li in li_list:
            hrefs = li.xpath("./a/@href").extract()
            if not hrefs:
                # List item without a direct <a href>; nothing to follow.
                continue
            a_href = hrefs[0].replace(self._RLINK_PREFIX, '')
            print(a_href)
            # Queue the page and have Scrapy call self.res_detail with it.
            yield scrapy.Request(
                a_href,
                meta={'a_href': a_href},
                callback=self.res_detail,
            )

    def res_detail(self, response):
        """Extract the post time of a page and emit it as an item.

        The timestamp is also appended to ``./time.txt``, one per line.

        :param response: response of a request created by ``parse``;
            ``response.meta['a_href']`` holds the URL that was requested.
        :returns: generator yielding one ``ScrapyluntanItem`` whose ``time``
            field is an integer Unix timestamp, or nothing when the page
            has no date element.
        :raises ValueError: if the date text does not match
            ``%Y-%m-%d %H:%M:%S`` after normalisation.
        """
        detail_url = response.meta['a_href']
        print(detail_url)

        # The two page layouts keep the date under different containers.
        if 'http://dzh2.mop.com/' in detail_url:
            date_xpath = "//div[@class='post-date fl mr15']/span/text()"
        else:
            date_xpath = "//div[@class='mr20 inlineBlock']/span/text()"

        matches = response.xpath(date_xpath).extract()
        if not matches:
            self.logger.warning("no post date found on %s", detail_url)
            return

        text = matches[0].strip()
        print(text)
        if u'年' in text:
            # Rewrite the year/month/day markers so the text fits
            # _TIME_FORMAT.
            text = (text.replace(u'年', '-')
                        .replace(u'月', '-')
                        .replace(u'日', ''))

        time_array = time.strptime(text, self._TIME_FORMAT)

        item = ScrapyluntanItem()
        # time.mktime interprets the parsed time in the local timezone.
        item['time'] = int(time.mktime(time_array))

        # Context manager so the handle is closed after every write.
        with open('./time.txt', 'a') as out:
            out.write(str(item['time']) + "\r\n")
        yield item
# print timeStamp