本篇是針對 58 同城二手車的爬蟲,主要是爬取車的價格、一些基礎信息,保存到 CSV 表格中。
創建時間:2019-04-22 10:20 很簡單還是分享一下吧
import csv
import math
import re

import requests
from scrapy import Selector
def start_request():
    """Crawl the 58.com used-car listing and parse every result page.

    Fetches the first listing page, derives the total page count from the
    result counter (50 listings per page), parses page 1, then fetches and
    parses each remaining page with the same headers.

    :return: None
    """
    index_url = 'https://quanguo.58.com/ershouche/'
    index_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Referer': 'https://quanguo.58.com/ershouche/',
    }
    index_response = requests.get(url=index_url, headers=index_headers)
    if index_response.status_code != 200:
        return  # guard clause: nothing to parse if the first request failed
    selector_response = Selector(text=index_response.text)
    all_info = selector_response.xpath('//tr')
    all_car_info_total = selector_response.xpath('//p[@id="infocont"]/strong/text()').extract_first()
    # 50 listings per page; round up so a partial last page still counts.
    all_page = math.ceil(int(all_car_info_total) / 50)
    print('has %s total_page' % all_page)
    get_car_info(all_info)  # parse page 1, already fetched above
    # BUG FIX: range(2, all_page) skipped the final page; include it.
    for page_ in range(2, all_page + 1):
        print('開始下載第 %s 頁圖片' % page_)
        page_url = 'https://quanguo.58.com/ershouche/pn%s/' % str(page_)
        page_response = requests.get(url=page_url, headers=index_headers)
        # BUG FIX: original tested index_response.status_code here, so a
        # failed page fetch was still parsed; test the page's own response.
        if page_response.status_code == 200:
            page_selector = Selector(text=page_response.text)
            get_car_info(page_selector.xpath('//tr'))
def get_car_info(all_info):
    """Parse listing table rows and append one CSV record per car.

    :param all_info: selector list of ``<tr>`` rows from a listing page;
        the first row is the table header and is skipped.
    :return: None (appends rows to ``car_info.csv`` as a side effect)
    """

    def _first_match(pattern, text):
        # BUG FIX: re.findall(...)[0] raised IndexError when the page layout
        # changed; return '' instead of crashing the whole crawl.
        found = re.findall(pattern, text)
        return found[0] if found else ''

    rows = []
    for each_info in all_info[1:]:
        car_info = each_info.xpath('td[2]/a//text()').extract()
        # Brand (e.g. Hyundai, VW, Nissan) and model (e.g. "Sonata 2011
        # 2.0L auto") — only taken when the anchor yields both text nodes.
        car_log = car_info[0] if len(car_info) >= 2 else ''
        car_model = car_info[1] if len(car_info) >= 2 else ''
        base_car_info = each_info.xpath('td[2]/p//text()').extract()
        # BUG FIX: indexes 0/2/4/6 were unguarded and raised IndexError on
        # rows with fewer text nodes (ads, malformed listings).
        # purchase year
        buy_year = _first_match(r'.*\t(\w+)\t', base_car_info[0]) if len(base_car_info) > 0 else ''
        # kilometres driven so far
        travelling_kilometers = _first_match(r'(.*)\t', base_car_info[2]) if len(base_car_info) > 2 else ''
        # engine displacement in litres
        displacement = _first_match(r'(.*)\t', base_car_info[4]) if len(base_car_info) > 4 else ''
        # automatic vs. manual transmission
        car_type = _first_match(r'(.*)\t', base_car_info[6]) if len(base_car_info) > 6 else ''
        car_price = each_info.xpath('td[3]/b/text()').extract_first()
        car_price = car_price + '萬元' if car_price else ''
        # whether the vehicle licence has been verified
        car_safety = each_info.xpath('td[4]//a/text()').extract_first()
        # Keep the original '%s' coercion so None renders as 'None', as before.
        rows.append(['%s' % field for field in (
            car_log, car_model, buy_year, travelling_kilometers,
            displacement, car_type, car_price, car_safety)])
    # BUG FIX: hand-rolled ','.join corrupted the table whenever a field
    # contained a comma/quote; csv.writer escapes properly. Also open the
    # file once per call instead of once per row. newline='' per csv docs;
    # lineterminator='\n' preserves the original '\n' row ending.
    with open('car_info.csv', 'a+', encoding='utf-8', newline='') as f:
        print('正在寫入中................')
        csv.writer(f, lineterminator='\n').writerows(rows)
# Script entry point: start the crawl only when run directly, not on import.
if __name__ == '__main__':
    start_request()
很簡單的一次整理,歡迎查看個人 CSDN 賬號:https://blog.csdn.net/weixin_42812527