Day 5 of the hands-on project: I scraped 58同城 (58.com).
Here are the final result and my code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# the shebang above tells the OS which interpreter runs this script; env looks python up on PATH
from bs4 import BeautifulSoup
import time
import requests
def get_info(link):
    wb_detail = requests.get(link)
    soup = BeautifulSoup(wb_detail.text, 'lxml')
    # Changing :nth-child(3) to :nth-of-type(3) makes the selector match only this listing page's breadcrumb;
    # the conditions and areas results still contain special symbols that have to be stripped out afterwards
    types = soup.select('#header > div.breadCrumb.f12 > span:nth-of-type(3) > a')
    titles = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.mainTitle > h1')
    dates = soup.select('#index_show > ul.mtit_con_left.fl > li.time')
    prices = soup.select(
        '#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(1) > div.su_con > span')
    conditions = soup.select(
        '#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(2) > div.su_con > span')
    areas = soup.select(
        '#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(3) > div.su_con > span')
    for type, title, date, price, condition, area in zip(types, titles, dates, prices, conditions, areas):
        data = {
            'type': type.get_text(),
            'title': title.get_text(),
            'date': date.get_text(),
            'price': price.get_text(),
            'conditions': list(condition.stripped_strings),  # list() turns the stripped_strings generator into a real list
            'area': list(areas[0].stripped_strings) if soup.find_all('span', 'c_25d') else None,
            'view': get_view(link)
        }
        print(data)
def get_view(url):  # get the page-view count from 58's counter interface
    infoid = url.split('?')[0].split('/')[-1].strip('x.shtml')
    api = 'http://jst1.58.com/counter?infoid={}'.format(infoid)
    # the header info has to be added to this request
    headers = {'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
               'Cookie': r'id58=c5/ns1ct99sKkWWeFSQCAg==; city=bj; 58home=bj; ipcity=yiwu%7C%u4E49%u4E4C%7C0; als=0; myfeet_tooltip=end; bj58_id58s="NTZBZ1Mrd3JmSDdENzQ4NA=="; sessionid=021b1d13-b32e-407d-a76f-924ec040579e; bangbigtip2=1; 58tj_uuid=0ed4f4ba-f709-4c42-8972-77708fcfc553; new_session=0; new_uv=1; utm_source=; spm=; init_refer=; final_history={}; bj58_new_session=0; bj58_init_refer=""; bj58_new_uv=1'.format(str(infoid)),
               'Accept': '*/*',
               'Accept-Encoding': 'gzip, deflate, sdch',
               'Accept-Language': 'zh-CN,zh;q=0.8',
               'Cache-Control': 'max-age=0',
               'Connection': 'keep-alive',
               'Host': 'jst1.58.com',
               'Referer': r'http://bj.58.com/pingbandiannao/{}x.shtml'.format(str(infoid))
               }
    js = requests.get(api, headers=headers)
    # js = requests.get(api)
    view = js.text.split('=')[-1]
    return view
def get_links_info(page):
    urls = ['http://bj.58.com/pbdn/1/pn{}'.format(str(i)) for i in range(1, page)]  # this has to be a list
    for url in urls:
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        schemes = soup.select('#infolist tr td.t a')  # why does writing the selector this way make it work?? (see the note after the summary list)
        print(schemes)
        time.sleep(2)
        for scheme in schemes:
            link = scheme.get('href')
            if link[:17] == 'http://bj.58.com/':  # keep only valid listing links this way
                get_info(link)

get_links_info(20)
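A quick way to sanity-check get_view on its own (the last point in the summary below): a minimal sketch, assuming the script above has already been loaded and the counter endpoint still responds; the listing URL is made up purely for illustration.

# Hypothetical listing URL, only for illustration
sample = 'http://bj.58.com/pingbandiannao/24528442665683x.shtml?psid=123456'
infoid = sample.split('?')[0].split('/')[-1].strip('x.shtml')
print(infoid)            # -> '24528442665683'
# strip('x.shtml') removes those characters from both ends; it only works here
# because the numeric id itself contains none of x . s h t m l
print(get_view(sample))  # prints whatever follows the last '=' in the counter response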
Summary and questions
- Usage of list() (see the first sketch after this list)
- A CSS path without > (see the second sketch after this list)
- The statement for writing a dict to a file (see the third sketch after this list)
- Prefixing the path passed to open() with r (also covered in the third sketch)
- Code to confirm the page-view count (tried out in the sketch right after the script above)
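On list(): stripped_strings is a generator, so wrapping it in list() is what makes the result printable and indexable. A minimal sketch with a made-up area snippet:

from bs4 import BeautifulSoup

html = '<span class="c_25d"><a>朝阳</a> - <a>望京</a></span>'  # made-up snippet, only for illustration
span = BeautifulSoup(html, 'lxml').select_one('span.c_25d')
print(span.stripped_strings)        # a generator object, not the text itself
print(list(span.stripped_strings))  # list() materialises it: ['朝阳', '-', '望京']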
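On the "CSS path without >" question: a space between selectors matches descendants at any depth, while > only matches direct children, which is why '#infolist tr td.t a' in get_links_info can still find the links when a copied child-by-child path fails. A minimal sketch with a made-up HTML snippet:

from bs4 import BeautifulSoup

html = '<div id="box"><ul><li class="t"><span><a href="#">item</a></span></li></ul></div>'
soup = BeautifulSoup(html, 'lxml')
print(soup.select('#box li.t a'))      # descendant selector: matches the <a> even through <ul> and <span>
print(soup.select('#box > li.t > a'))  # child selectors: no match, <li> is not a direct child of #box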
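On writing the dict to a file and the r-prefixed path: a minimal sketch, assuming each data dict from get_info should be appended as one line; the file path is hypothetical, and the r prefix only keeps the backslashes in a Windows path from being read as escape sequences.

import json

data = {'type': 'pingbandiannao', 'title': 'iPad mini', 'price': '1500'}  # stand-in for a real data dict
path = r'C:\Users\me\58_items.txt'  # hypothetical path; r'' stops \U, \5 etc. being treated as escapes
with open(path, 'a', encoding='utf-8') as f:
    f.write(json.dumps(data, ensure_ascii=False) + '\n')  # one listing per line; str(data) also works for a quick dump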