本次爬蟲實踐,使用到了 cookies 這一概念,有興趣的童鞋們可以自行搜索資料。
這次的代碼並未對搜索獲得的結果數量進行邏輯判斷,after all,我們只是為學習爬蟲,大家可以發揮自己的分析能力和編程能力優化這段代碼,有興趣的話評論交流。
話不多說,直接上代碼。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
import pandas as pd
from collections import OrderedDict
from bs4 import BeautifulSoup
import time
def getPosInfo(posList, curSession):
    """Turn one search-result payload into a list of position records.

    Args:
        posList: Decoded JSON body of a search response. A successful
            payload has ``state == 1`` and keeps its hits under
            ``content -> data -> page -> result``.
        curSession: Session object passed through to ``getPosDetail`` so it
            can download each position's detail page.

    Returns:
        A list with one ``OrderedDict`` per position (possibly empty), or
        the sentinel list ``['no job available']`` when the payload does not
        describe a successful search.
    """
    # A payload without a usable 'state' is treated like a failed search
    # instead of raising KeyError/AttributeError.
    if not isinstance(posList, dict) or posList.get('state') != 1:
        print('Something goes wrong with our spider!')
        return ['no job available']

    try:
        results = posList['content']['data']['page']['result']
    except (KeyError, TypeError):
        # state == 1 but the expected nesting is missing: same failure path.
        print('Something goes wrong with our spider!')
        return ['no job available']

    posinfoList = []
    # 'result' may be null/empty when nothing matched; iterate nothing then.
    for pos in results or []:
        posinfo = OrderedDict()
        posinfo['公司全稱'] = pos['companyFullName']
        posinfo['公司縮寫'] = pos['companyName']
        posinfo['創(chuàng)建時(shí)間'] = pos['createTime']
        posinfo['職位名稱'] = pos['positionName']
        posinfo['薪水'] = pos['salary']
        # Stored as text because it is later formatted into the detail URL.
        posinfo['職位編號(hào)'] = str(pos['positionId'])
        # Fills in the remaining fields of posinfo in place.
        getPosDetail(posinfo, curSession)
        posinfoList.append(posinfo)
        # Pause after each detail lookup; presumably to throttle requests.
        time.sleep(0.5)
    return posinfoList
def getPosDetail(posinfo, curSession):
    """Download a position's detail page and add its perks and description.

    Args:
        posinfo: Mutable mapping for one position. Its '職位編號(hào)' entry is
            read to build the detail URL; the '職位誘惑' and '職位描述'
            entries are written.
        curSession: Object whose ``get(url)`` returns a response exposing
            ``url`` and ``text`` (a ``requests.Session`` in this script).

    Returns:
        None. ``posinfo`` is updated in place.
    """
    resp = curSession.get(posDetailurl.format(positionid=posinfo['職位編號(hào)']))
    print(resp.url)
    bsobj = BeautifulSoup(resp.text, 'html.parser')

    temptation = bsobj.select('div.temptation')
    if temptation:
        # get_text() also works when the div contains nested tags, where
        # `.string` would be None and `.strip()` would raise.
        text = temptation[0].get_text().strip()
        # Remove the label as an exact prefix. str.lstrip() takes a *set* of
        # characters and would also eat leading characters of the real text
        # that happen to appear in the label.
        # NOTE(review): the non-ASCII literals in this file look damaged by a
        # transcoding step; confirm this label matches the live page text.
        prefix = '職位誘惑:'
        if text.startswith(prefix):
            text = text[len(prefix):]
        posinfo['職位誘惑'] = text
    else:
        posinfo['職位誘惑'] = '無(wú)'

    # Keep only paragraphs that consist of a single text node, one per line.
    posinfo['職位描述'] = ''.join(
        line.string + '\n'
        for line in bsobj.select('div.content p')
        if line.string is not None
    )
# Cookies sent with every request. The 'XXX' values are placeholders and must
# be replaced with values taken from a real logged-in session.
# NOTE(review): 'JESSIONID' looks like a typo for the usual 'JSESSIONID'
# cookie name -- confirm against a captured request before changing it.
cookies = {
    'JESSIONID': 'XXX',
    'LGRID': 'XXX',
    'LGSID': 'XXX',
    'LGUID': 'XXX',
    'user_trace_token': 'XXX',
    'login': 'true',
}

# Search endpoint and the URL template for a single position's detail page.
starturl = 'https://m.lagou.com/search.json'
posDetailurl = 'https://m.lagou.com/jobs/{positionid}.html'

# NOTE(review): advertising 'br' may require a Brotli decoder to be installed
# for the response body to be decoded -- verify in the target environment.
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Mobile Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
    'Accept': 'application/json',
    'Accept-language': 'zh-CN,zh;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
}

# Query parameters for the search request; 'city' and 'positionName' are
# overwritten with the user's answers below, 'pageNo' is set per page.
params = {
    'city': '上海',
    'positionName': 'python 爬蟲',
    'pageNo': 1,
    'pageSize': 15,
}

city = input("請(qǐng)輸入職位所在城市: ")
position = input("請(qǐng)輸入搜索職位: ")
params['city'] = city
params['positionName'] = position

poslist = []
with requests.Session() as s:
    s.headers.update(headers)
    s.cookies.update(cookies)
    # Adjust the page range to control how many result pages are crawled.
    for page in range(1, 3):
        params['pageNo'] = page
        content = s.get(starturl, params=params)
        content.encoding = 'utf-8'
        # getPosInfo reports a failed page as a list holding a plain string;
        # keep only real records so the DataFrame is built from dicts alone.
        poslist.extend(
            rec for rec in getPosInfo(content.json(), s) if isinstance(rec, dict)
        )

# One row per collected position, written to an Excel workbook.
ds = pd.DataFrame(poslist)
ds.to_excel('拉鉤.xlsx')
cookies 請自行註冊拉勾網賬號,然後抓包獲取。
學海無涯,擁抱改變,不斷進化。