1、 準備工作
分析51job招聘信息頁面,如圖所示,紅框標記的是頁碼，將URL分割為兩個部分，爬取多個頁面時需要修改頁碼信息
分析請求響應信息,請求時需要模擬瀏覽器請求信息,如下圖,包括header信息和cookie信息;響應結果在頁面的javascript標簽中,需要通過正則表達式解析響應結果。
2、 用到的Python庫
requests:模擬瀏覽器請求
re:正則表達式
json:字符串轉json
pandas:excel操作
3、代碼
import json
import re
import time

import pandas as pd
import requests
'''
pre_url: url前綴
suf_url: url后綴
headers: 請求頭
cookies: Cookie
page_num: 爬取頁數
'''
# Captures the JSON payload assigned to window.__SEARCH_RESULT__ inside a
# <script> tag of the search page. Compiled once instead of once per page.
_SEARCH_RESULT_RE = re.compile('window.__SEARCH_RESULT__ = (.*?)</script>', re.S)

# Column order of the per-page DataFrame.
# NOTE(review): runtime strings in this block are kept byte-for-byte; several
# look corrupted by a text conversion -- confirm against the original script.
_COLUMNS = [
    "工作名稱",
    "工資待遇",
    "工作地點(diǎn)",
    "職位要求",
    "公司名稱",
    "公司規(guī)模",
    "公司類別",
    "公司福利",
    "主營業(yè)務(wù)",
    "發(fā)布日期",
]


def _extract_job_list(html):
    """Return the ``engine_jds`` list embedded in *html*, or None if absent."""
    match = _SEARCH_RESULT_RE.search(html)
    if match is None:
        return None
    try:
        return json.loads(match.group(1))['engine_jds']
    except (ValueError, KeyError, TypeError):
        # Payload is not valid JSON, or does not have the expected shape.
        return None


def _job_to_row(job):
    """Map one job record to a list of values ordered like ``_COLUMNS``."""
    return [
        job['job_name'],
        job['providesalary_text'],
        job['workarea_text'],
        # The first attribute_text entry is skipped, exactly as before;
        # presumably it repeats the work area -- TODO confirm.
        ' '.join(job['attribute_text'][1:]),
        job['company_name'],
        job['companysize_text'],
        job['companytype_text'],
        job['jobwelf'],
        job['companyind_text'],
        job['issuedate'],
    ]


def get_data(pre_url, suf_url, headers, cookies, page_num):
    """Crawl search-result pages and append the parsed jobs to a CSV file.

    Each page is requested from ``pre_url + str(page) + suf_url``, decoded as
    GBK, and searched for the ``window.__SEARCH_RESULT__`` JSON payload. The
    jobs under its ``engine_jds`` key are appended, without a header row, to
    ``51Job烏魯木齊招聘信息.csv`` in the working directory. A page with no
    usable payload is reported and skipped instead of aborting the crawl.

    Args:
        pre_url: URL text that precedes the page number.
        suf_url: URL text that follows the page number.
        headers: Request headers passed to ``requests.get``.
        cookies: Cookies passed to ``requests.get``.
        page_num: Exclusive upper bound of the page range. Pages
            ``1 .. page_num - 1`` are fetched, so pass ``N + 1`` to crawl
            ``N`` pages.

    Raises:
        requests.RequestException: If a request fails or exceeds the timeout.
        KeyError: If a job record lacks one of the expected fields.
    """
    for page in range(1, page_num):
        print("爬取第" + str(page) + "頁數(shù)據(jù)")
        url = pre_url + str(page) + suf_url
        # Without a timeout a stalled connection would block the crawl forever.
        web = requests.get(url, headers=headers, cookies=cookies, timeout=30)
        web.encoding = 'gbk'
        print(web.text)

        job_list = _extract_job_list(web.text)
        if job_list is None:
            # Previously json.loads('') raised here and killed the whole run
            # before this message could ever be printed.
            print("跳轉(zhuǎn)網(wǎng)頁详恼,無數(shù)據(jù)")
            time.sleep(1)
            continue

        data = pd.DataFrame([_job_to_row(job) for job in job_list], columns=_COLUMNS)
        print(data)
        try:
            data.to_csv("51Job烏魯木齊招聘信息.csv", mode="a+", header=None, index=None, encoding="utf-8")
        except OSError as err:
            # Stay best-effort on write failures, but report the real cause.
            print("寫入CSV文件失敗: " + str(err))
        # Pause between pages to avoid hammering the site.
        time.sleep(1)
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    # The search URL is split around the page number; get_data() inserts it
    # between the two halves.
    pre_url = "https://search.51job.com/list/310200,000000,0000,00,9,99,+,2,"
    # "&degreefrom" had been mangled into "°reefrom" ("&deg" decoded as an
    # HTML entity), which corrupted the query string.
    suf_url = ".html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
    # Browser-like request headers sent with every page request.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'close',
        'Referer': 'https://search.51job.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62',
    }
    # NOTE(review): the whole cookie string is stored under a single "Cookie"
    # key, while requests treats each dict key as one cookie name -- confirm
    # this is intended. The values look session-specific and presumably need
    # refreshing from a live browser session before each run.
    cookies = {
        "Cookie": "_uab_collina=164515157588672382024854; guid=ffafb018452895c75b5ff63cd2fb9563; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; slife=lastvisit%3D310200%26%7C%26; privacy=1646033921; search=jobarea%7E%60310200%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60310200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60310200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA-java%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21collapse_expansion%7E%601%7C%21; acw_tc=76b20fe516460406333445179e1863f701fac22d0d83677b10737d6e54a7ab; acw_sc__v2=621c9639e08991e176e0cadd6a8c5f5ea4dabb36; ssxmod_itna=Yq0xcDyD2A0QG=qD=DXm3fSosD78beb1Dg0i=nhnmx0v+xPGzDAxn40iDtrO59hfuDq1YGAPrF3YpSa2tX74aRDb6G4W2D3DU4i8DCL2F4WDemtD5xGoDPxDeDA7KiTDY4DdXxYPG0DiKDpx0kG25D7ZF41lKDTPYDRgaGDQyk9gPmx407DiHq920kD75pDlpxIRYD018f1Av1GRG=qlDDUmR60n2bMbb5xqi36m9Gq40OD0FGXxibG6g6Rav14w+e6QxPDaDPKlbq3iDowDrP=QIxmni5bGiQtYxTmlGx=WKKYmrKDDp4Z4PWGD4D==; ssxmod_itna2=Yq0xcDyD2A0QG=qD=DXm3fSosD78beb1Dg0i=nhnDnKd2qDsKoDLGYhnav7bAi3Fw8MYYm7=w3ifeEjpMIeq8=EdxnRI=ayekyb=8lFkEnXmpQKjjLW=z/=ORRGLUKMRowcU3gBbUOXpQBUZKAKDq33ZUSi4OMi47CE5IjIhQf2M7l03ulie7eiK7CYP3b0k3lfr2CXFoLwZYQpaNWhk57XKYh9hK0G5i7j187j9tQ61U+eZtl3o=3n9lCI88kow8U=4a+0KyOBx0fGD5BOG1KZfNVSxC69Kwz3DkQM79FxD=3zinLzb=QZ16Kjmhx0YPw3ONv1=5Avpa6phdYz6iPvLB3=1pU1Q56d5FwG05=pEhWbiR5TKRQxYW23FUvPzNAPb9Q=8/=0YLjBUa60ezpEErjiTop2DhW==Qtb0bgnQEuv+nFz6r2uS=e9CbT2TUzEWIAHguA82m/=4=RTKnSc2I9uR4uffo9WXZoTv+dFDG2Sr3SG4IRt4FA0htBx4Y2sqQEYl=R2HbxDFqD+oLNQG5D4PEDqBpKnhn0G5AmKAdCEjryxxD==="
    }
    # get_data() iterates range(1, page_num), so 51 crawls pages 1 through 50.
    get_data(pre_url, suf_url, headers, cookies, 51)