scrapy crawl liepin 遇到的坑
1. 處理 JSON 文件的方法,以及 try/except 的用法
# Build a {city_title: city_href} mapping from the parsed nodes and
# persist it as a single JSON object.
# NOTE(review): the original opened the file with mode 'a' and dumped on
# every run, so the second run appended a second JSON object and made the
# file unparseable by json.load; mode 'w' plus a context manager fixes
# both the corruption and the manually-closed handle.
item = {c.xpath('./@title')[0]: c.xpath('./@href')[0] for c in c_list}
with open('city_list.json', 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps the Chinese city names human-readable.
    json.dump(item, f, ensure_ascii=False)
讀取:
def select_city(self, city):
    """Return the URL fragment (href) stored for *city* in city_list.json.

    Prints a warning and returns None when the city is not present.
    The file is expected to hold one JSON object mapping title -> href,
    as written by the crawl step above.
    """
    with open('city_list.json', 'r', encoding='utf-8') as f:
        city_map = json.load(f)
    # EAFP lookup, but catch only the expected KeyError -- the original
    # bare `except:` would also silently hide file and JSON errors.
    try:
        return city_map[city]
    except KeyError:
        print('輸入城市不在查詢范圍內(nèi),請重新輸入!')
- 獲取頁碼總數(shù)
def get_page(self, city_link, key):
    """Return the number of result pages for *key* in the given city.

    Fetches the first results page and parses the total hit count out of
    the <em> tag with a regex (the note says the node is unreachable via
    XPath on this site). The count is rendered like '1000+'.
    """
    url = self.base_url.format(city_link, '0', key)
    # requests.get(...).text is the decoded HTML as a string.
    html = requests.get(url=url, headers=self.headers).text
    # re.findall returns a list such as ['1000+']. Non-greedy match plus
    # rstrip('+') is safer than the original blind [:-1], which ate a
    # real digit whenever the count had no trailing '+'.
    num = int(re.findall(r'<em>(.*?)</em>', html)[0].rstrip('+'))
    # Ceiling division at 40 results per page: the original floor
    # division (num // 40) silently dropped the last partial page.
    return -(-num // 40)
# If you fetch with requests.get(url, headers).text and then parse with XPath:
# NOTE(review): `res.encoding = ...` is only meaningful on the Response
# object *before* reading `.text` -- a plain string returned by `.text`
# has no `encoding` attribute; confirm which object this snippet receives.
res.encoding = 'utf-8'
parse_html = etree.HTML(res)
# Example of the resulting list: ['../', 'day01', 'day02', 'redis_day01.zip']
r_list = parse_html.xpath(xpath_bds)
- 解析部分:在 Scrapy 爬蟲模板中,直接使用 response.xpath() 解析即可,無需再手動構(gòu)造 etree。