What a timestamp looks like after being formatted into a string. See for yourself:
>>> time.time()
1530150193.873144
>>> '{}'.format(time.time())
'1530150224.11'
>>> '{}'.format(str(time.time()))
'1530150237.7'
>>> a = 1.33333
>>> str(a)
'1.33333'
>>> str(time.time())
'1530151047.78'
At the bare prompt Python prints the repr() of the float, which keeps up to 17 significant digits; but str() and str.format() in Python 2 round to only 12 significant digits, so a ten-digit Unix timestamp is left with just two decimal places.
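To see how hard this collides inside a tight loop, here is a quick check of my own (not from the original debugging session), run under Python 2:

import time

# 1000 back-to-back formats: str.format() keeps 12 significant digits,
# so every call inside the same ~10 ms window yields the same string
stamps = ['{}'.format(time.time()) for _ in range(1000)]
print(len(set(stamps)))  # typically 1 or 2 distinct values out of 1000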
pyspider de-duplicates URLs out of the box. My plan was to lean on the uniqueness of the timestamp: append it as a query parameter so that every URL is distinct and no request gets silently dropped by the de-duplicator. In practice, requests were still lost in bulk. The offending code:
for i in range(1, pages + 1):
    url = 'http://hotels.ctrip.com/Domestic/Tool/AjaxHotelList.aspx?time={}'.format(time.time())
The CPU runs loop iterations far faster than the clock string changes: never mind the second decimal place, even the sixth decimal can repeat between iterations, so most of these URLs collide and get de-duplicated. The symptom: of the 529 requests I expected while debugging, sometimes only 40-odd ran, sometimes 50-odd. I could not figure out why requests were vanishing until I remembered another problem from a little earlier:
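Worth noting why the query-string trick is needed at all: pyspider's default taskid is the MD5 of the URL alone, so two POSTs to the same URL with different form data count as the same task. The docs offer an alternative fix, overriding get_taskid to mix the POST body in; a sketch of that documented override:

import json
from pyspider.libs.utils import md5string

def get_taskid(self, task):  # defined on the Handler class
    # fold the POST body into the task id so identical URLs with
    # different form data stop being treated as duplicates
    return md5string(task['url'] + json.dumps(task['fetch'].get('data', '')))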
self.send_message(self.project_name, data, url=time.time())
pyspider also de-duplicates results: records with the same identity are never saved twice, and the identity here is taken from the url argument! In debugging I could see that every record was being fetched, yet on save most of them disappeared; roughly nine in ten were dropped, i.e. only one record in ten made it into storage.
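The pattern the docs recommend for emitting several results from one task is the same idea: give each message its own url, built from something unique per record. A sketch (list_page, the 'results' key, and each['id'] are stand-ins, not names from this project):

def list_page(self, response):  # hypothetical handler method
    for each in response.json['results']:
        # the fragment makes each message's identity unique per record
        self.send_message(self.project_name, each,
                          url='%s#%s' % (response.url, each['id']))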
Then it dawned on me: the accursed time.time() simply cannot serve as a unique identifier. The fix touches exactly one spot: swap time.time() for something genuinely unique.
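Any value that is genuinely unique per request will do; uuid4 is the laziest option (my illustration only; the revised script below uses the city id plus the page number instead):

import uuid

url = 'http://hotels.ctrip.com/Domestic/Tool/AjaxHotelList.aspx?time={}'.format(uuid.uuid4())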
The revised script:
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-06-21 10:35:32
# Project: ctrip_hotel
import sys
reload(sys)                      # Python 2 only: re-enable setdefaultencoding
sys.setdefaultencoding('utf-8')
from pyspider.libs.base_handler import *
from lxml import etree
import random
import json
import time
from pyspider_util import proxy_util
from fake_useragent import UserAgent
import re
import urllib                    # Python 2: urllib.quote lives here
import hashlib
city_names = [u'上海', '北京', '广州', '深圳', '南京', '厦门', '大连', '天津', '宁波', '成都', '无锡', '杭州', '武汉', '沈阳', '苏州', '西安', '重庆', '长沙', '青岛', '东莞', '乌鲁木齐', '佛山', '南宁', '南昌', '南通', '合肥', '太原', '常州', '徐州', '惠州', '扬州', '昆明', '汕头', '泉州', '洛阳', '济南', '海口', '温州', '潍坊', '烟台', '珠海', '石家庄', '福州', '贵阳', '郑州', '金华', '长春', '哈尔滨', '三亚', '上饶', '中山', '临沂', '丽江', '保定', '兰州', '包头', '南充', '南平', '台州', '吉林', '呼伦贝尔', '呼和浩特', '咸阳', '唐山', '嘉兴', '大庆', '威海', '安阳', '宜宾', '宜昌', '宜春', '宝鸡', '岳阳', '常德', '廊坊', '张家口', '德阳', '怀化', '抚顺', '揭阳', '柳州', '株洲', '桂林', '梅州', '榆林', '泰州', '泸州', '济宁', '淄博', '淮安', '湖州', '湛江', '漳州', '盐城', '秦皇岛', '绍兴', '绵阳', '舟山', '芜湖', '荆州', '莆田', '蚌埠', '衡阳', '衢州', '西宁', '赣州', '赤峰', '运城', '连云港', '遵义', '邢台', '邯郸', '郴州', '鄂尔多斯', '银川', '镇江', '鞍山', '齐齐哈尔', '龙岩']
# type_code = ['100000', '100100', '100101', '100102', '100103', '100104', '100105', '100200', '100201']
class Handler(BaseHandler):
    crawl_config = {
        'proxy': 'forward.xdaili.cn:80'
    }
    ua = UserAgent()

    # Request the city list page
    @every(minutes=365 * 24 * 60)
    def on_start(self):
        headers = self.url_start({})
        url = 'http://hotels.ctrip.com/domestic-city-hotel.html?time={}'.format(time.time())
        self.crawl(url, headers=headers, callback=self.get_allcity_urls, retries=10)
    # Request every city's hotel-list URL
    @config(age=364 * 24 * 60)
    def get_allcity_urls(self, response):
        tree = etree.HTML(response.text)
        all_list = tree.xpath('//dl[@class="pinyin_filter_detail layoutfix"]/dd/a')
        city_urls = []
        url_name = {}
        for i in all_list:
            name = i.xpath('./text()')[0]
            if name in city_names:
                city_urls.append(i.xpath('./@href')[0])
                url_name[i.xpath('./@href')[0]] = name
        print(city_urls)
        for city_url in city_urls:
            headers = self.url_start({})
            # the path itself differs per city, so time.time() is harmless here
            city_url = 'http://hotels.ctrip.com' + city_url + '?time={}'.format(time.time())
            self.crawl(city_url, headers=headers, retries=10, callback=self.get_allpages, save=url_name)
    # Get the total page count, then fire one request per list page
    @config(age=364 * 24 * 60)
    def get_allpages(self, response):
        tree = etree.HTML(response.text)
        url_name = response.save
        url_e = re.findall(r'(/hotel/[a-z]+\d+)\?time=', response.url)[0]
        name = url_name[url_e]
        try:
            pages = int(tree.xpath('//div[@class="c_page_list layoutfix"]/a[@rel="nofollow"]/text()')[0])
        except (IndexError, ValueError):
            pages = 1
        print(pages)
        print(name)
        name_code = urllib.quote(name.encode('utf-8'))  # Python 2: quote() wants bytes
        city_id = re.findall(r'/hotel/[a-z]+(\d+)\?time=', response.url)[0]
        city_py = re.findall(r'/hotel/([a-z]+)\d+\?time=', response.url)[0]
        for i in range(1, pages + 1):
            # city id + page number is unique across the whole crawl,
            # unlike time.time(), which repeats within a fast loop
            url = 'http://hotels.ctrip.com/Domestic/Tool/AjaxHotelList.aspx?time={}-{}'.format(city_id, i)
            formdata = {
                "__VIEWSTATEGENERATOR": "DB1FBB6D",
                "cityName": name_code,
                "RoomGuestCount": "1,1,0",
                "operationtype": "NEWHOTELORDER",
                "cityId": city_id,
                "cityPY": city_py,
                "page": i,
            }
            headers = self.url_start({})
            self.crawl(url, method='POST', data=formdata, headers=headers, retries=10,
                       callback=self.response_parse, save={'name': name})
    # Save part of each hotel's data from the list page, then request the
    # detail page for the remaining fields
    @config(age=364 * 24 * 60)
    def response_parse(self, response):
        city_name = response.save['name']
        response_json = response.json
        info_list = response_json["hotelPositionJSON"]
        htllist = response_json['HotelMaiDianData']['value']['htllist']
        htllist = eval(htllist)  # the field arrives as the string form of a list literal
        num = 0
        for info in info_list:
            # partial data from the list page, passed on to the detail request
            info_json = {}
            info_json['id'] = info['id']
            info_json['名称'] = info['name']
            info_json['地址'] = info['address']
            info_json['评分'] = info['score']
            star = info['star']
            if 'diamond' in star:
                info_json['携程评级'] = star
                info_json['星级'] = ''
            else:
                info_json['携程评级'] = ''
                info_json['星级'] = star
            info_json['类型'] = info['stardesc']
            info_json['省'] = ''
            info_json['市'] = city_name
            info_json['中心点'] = info['lon'] + ',' + info['lat']
            # assumes htllist lines up with info_list; the commented loop
            # below matches on hotelid instead
            info_json['最低价'] = htllist[num]['amount']
            # for ht in htllist:
            #     if ht['hotelid'] == info['id']:
            #         info_json['最低价'] = ht['amount']
            url = 'http://hotels.ctrip.com' + info['url']
            headers = self.url_start({})
            num += 1
            self.crawl(url, headers=headers, retries=10, callback=self.detail_parse, save=info_json)
    # Parse the detail page, pick up the remaining fields, emit the record
    def detail_parse(self, response):
        tree = etree.HTML(response.text)
        # price = tree.xpath('//p[@class="staring_price"]/span[@class="price"]/text()')[0]
        special = tree.xpath('//div[@class="grade"]/div[@class="special_label"]/i[@class="i_label"]/text()')
        bar = tree.xpath('//div[@class="path_bar2"]/a/text()')
        if len(bar) == 3:
            district = bar[2]
            brand = ''
        elif len(bar) == 4:
            district = bar[3]
            brand = bar[2]
        else:
            district = ''
            brand = ''
        info_json = response.save
        # info_json['最低价'] = price
        info_json['品牌'] = brand
        info_json['所在区县'] = district
        info_json['特色'] = special
        # the hotel id is unique, so results are no longer de-duplicated away
        self.send_message(self.project_name, info_json, url=info_json['id'])

    def on_message(self, project_name, msg):
        return msg
    # Build per-request headers: random User-Agent plus the xdaili proxy signature
    def url_start(self, headers):
        times = int(time.time())
        planText = "orderno=ZF20186158891UccNQO,secret=fbba4b982cc64755b23404f99297ecbd,timestamp={}".format(times)
        md = hashlib.md5()
        md.update(planText.encode('utf-8'))
        content = md.hexdigest()
        headers['User-Agent'] = self.ua.random
        headers['Proxy-Authorization'] = 'sign={}&orderno=ZF20186158891UccNQO&timestamp={}'.format(
            content.upper(), times)
        return headers
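For reference, the proxy signature can be sanity-checked outside pyspider with a standalone version of the same logic (the orderno and secret below are placeholders, not working credentials):

import hashlib
import time

def xdaili_auth(orderno, secret):
    # sign = upper-cased MD5 of "orderno=...,secret=...,timestamp=<unix seconds>"
    ts = int(time.time())
    plain = 'orderno={},secret={},timestamp={}'.format(orderno, secret, ts)
    sign = hashlib.md5(plain.encode('utf-8')).hexdigest().upper()
    return 'sign={}&orderno={}&timestamp={}'.format(sign, orderno, ts)

print(xdaili_auth('ZF2018XXXXXXXXXX', 'your-secret'))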