Scrape job postings of various categories from Lagou.com. To scrape a different category, just pass a different keyword when instantiating the class; on success the results are written automatically to a CSV file in the same directory. This example does not use multithreading.
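A minimal usage sketch (assuming the LaGou class defined in the listing below; "数据分析" is just an example keyword, meaning "data analysis"):

    LaGou("数据分析").get_all_data(page_range=10)  # writes the first 10 pages to lagou_数据分析_jobs.csv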
"""
__coding__ = 'UTF-8'
__author__ = 'bingo'
__date__ = '2020/12/13'
# code is far away from bugs with the god animal protecting
I love animals. They taste delicious.
        ┏┓       ┏┓
      ┏┛┻━━━━━┛┻━━┓
      ┃           ┃
      ┃  ┳┛   ┗┳  ┃
      ┃     ┻     ┃
      ┗━┓       ┏━┛
        ┃       ┗━━━━━┓
        ┃  神獸保佑    ┣┓
        ┃  永無BUG!   ┏┛
        ┗━━━┓┓┏━━┳┓┏┛
            ┃┫┫  ┃┫┫
            ┗┻┛  ┗┻┛
"""
import requests
import random
import csv
from urllib.parse import quote
import time


class LaGou(object):
    # pool of real desktop browser user-agents; one is chosen at random per request
    USER_AGENT = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36 SLBrowser/6.0.1.9171"
]
    # task list reserved for a multithreaded version; unused in this example
    tasks = []

    def __init__(self, position):
        # the job keyword to search for
        self.search_position = position
self.request_url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
self.cookies = None
        # GBK encoding keeps the CSV readable in Excel on Chinese Windows;
        # errors="ignore" silently drops any characters GBK cannot encode
        self.f = open(f"lagou_{self.search_position}_jobs.csv", mode="w+", encoding='gbk', newline='', errors="ignore")
self.csv = csv.writer(self.f, delimiter=",")
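        # (alternative sketch, an assumption rather than the author's choice:
        # encoding="utf-8-sig" would also open cleanly in Excel thanks to the
        # BOM, without the lossy errors="ignore" that GBK requires)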

    def get_request_cookie(self):
        """
        Lagou's anti-scraping checks require every request to carry cookies,
        and those cookies expire quickly. This method fetches a fresh set and
        refreshes the cookies shared across the instance.
        :return:
        """
        # visiting the HTML search page first yields the short-lived cookies
        url = "https://www.lagou.com/jobs/list_{}?labelWords=&fromSearch=true&suginput="
headers = {
"user-agent": random.choice(self.USER_AGENT)
}
        try:
            session = requests.Session()
            res = session.get(url.format(quote(self.search_position)), headers=headers)
            if res.status_code == 200:
                self.cookies = res.cookies
                print("got cookies successfully")
            else:
                print(f"failed to get cookies, status code {res.status_code}")
        except Exception as e:
            print(f"failed to get cookies: {e}")

    def get_page_data(self, i):
        """
        Fetch a single page of postings.
        :param i: page number
        :return: a list of postings, 0 when no data is left, or None on failure
        """
        # form payload for the positionAjax endpoint: "kd" is the search
        # keyword, "pn" the page number, and "first" marks the initial request
        j = {
            "first": i == 1,
            "pn": i,
            "kd": self.search_position
        }
        # headers that make the Ajax call look like it came from the search page
        headers = {
            "Referer": "https://www.lagou.com/jobs/list_{}?labelWords=&fromSearch=true&suginput=".format(quote(self.search_position)),
'Host': 'www.lagou.com',
"user-agent": random.choice(self.USER_AGENT)
}
        # refresh the cookies every 5 pages
        if i % 5 == 0:
            self.get_request_cookie()
        # fetch the raw data, retrying up to 10 times
        for retry_time in range(10):
            res = requests.post(self.request_url, data=j, headers=headers, cookies=self.cookies)
            result = res.json()
            # on success, return this page's list of postings
            if result.get("success"):
                position_result = result["content"]["positionResult"]
                print(f"page {i} scraped successfully: {position_result}")
                if position_result["resultSize"] == 0:
                    print("all data has been scraped")
                    return 0
                all_position = position_result["result"]
                return all_position
            # on failure, refresh the cookies and retry
            else:
                time.sleep(2)
                self.get_request_cookie()
                continue
        else:
            # for-else: reached only after all 10 attempts have failed
            print(f"page {i} failed to scrape: {result}")
            return None

    def get_all_data(self, page_range=None):
        # pages to scrape: an int means pages 1..n, a (start, end) tuple or
        # list means that inclusive range; without page_range, pages 1-30
if isinstance(page_range, int):
r_ = range(1, page_range+1)
elif isinstance(page_range, (tuple, list)):
r_ = range(page_range[0], page_range[1]+1)
else:
r_ = range(1, 31)
        # fetch the cookies for the first time
        self.get_request_cookie()
        header_written = False
for i in r_:
positions = self.get_page_data(i)
if positions == 0:
break
if positions:
                # write the CSV header row once, before the first rows of data
                if not header_written:
                    self.csv.writerow(list(positions[0].keys()))
                    header_written = True
                # write one row per job posting
for p in positions:
self.csv.writerow(list(p.values()))

    def __del__(self):
        # close the CSV file when the instance is destroyed
        self.f.close()
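    # (design note: relying on __del__ is fragile; an explicit close() call,
    # or wrapping the run in contextlib.closing, would be the safer pattern)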


if __name__ == "__main__":
    # "数据分析" is the search keyword; it means "data analysis"
    l = LaGou("数据分析")
    l.get_all_data(page_range=20)
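    # Alternate invocations (illustrative sketches): page_range also accepts
    # an inclusive (start, end) tuple or list, and omitting it scrapes the
    # default first 30 pages.
    # LaGou("Python").get_all_data(page_range=(3, 8))  # pages 3 through 8
    # LaGou("产品经理").get_all_data()                  # pages 1 through 30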

Run result: (screenshot of the console output)

CSV file: (screenshot of the saved CSV file)