之所以選擇selenium實現登錄，主要是為了處理驗證碼。招聘狗網站的驗證碼圖片是拼接出來的，所以我的方法是通過webdriver截圖來實現，然後通過打碼兔平台獲取驗證碼坐標，實現自動登錄。列表頁和詳情頁用requests庫實現。具體實現過程如下：
招聘狗的驗證碼如下：
首先你得註冊一個賬號，可以跳過企業驗證。招聘狗網站是給企業HR使用的，所以一般要求企業驗證，這裡我們直接跳過企業驗證。下面是實現過程，有詳細註釋：
'''
import json
import os
import random
import re
import sys
import traceback
import time
from PIL import Image
from lxml import html as lxml_html
import selenium
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ActionChains
import requests
import base64
from requests.exceptions import ConnectionError
import http.cookiejar
import logging
from dama2_API import Dama2API
# fake_useragent: third-party library that returns random User-Agent strings; install it with pip.
from fake_useragent import UserAgent
# Module-level fake_useragent generator; RTC_zhaopingou.search() reads
# ``ua.firefox`` from it to replace the session's User-Agent header.
ua = UserAgent()
class RTC_zhaopingou(object):
    """Client for scraping resumes from qiye.zhaopingou.com.

    ``login()`` drives Firefox through Selenium, has the click CAPTCHA solved
    by the Dama2 service and copies the browser cookies into ``self.session``.
    ``search()`` and ``open_resume()`` then call the site's JSON endpoints with
    that ``requests`` session.

    NOTE(review): the Chinese string literals in this class (input
    placeholders in the XPaths, the "验证成功" marker, the "请您登录后查看简历"
    marker) were restored from garbled text in the source; confirm them
    against the live page.
    NOTE(review): the ``find_element_by_*`` calls are kept as written; they
    were removed in Selenium 4.3, so this presumably targets an older release.
    """

    # Landing page; also the URL expected in the browser after a good login.
    HOME_URL = 'http://qiye.zhaopingou.com/'
    # The endpoints take an epoch-millisecond ``timestamp`` query parameter.
    SEARCH_ENDPOINT = "http://qiye.zhaopingou.com/zhaopingou_interface/find_warehouse_by_position_new?timestamp="
    RESUME_ENDPOINT = "http://qiye.zhaopingou.com/zhaopingou_interface/zpg_find_resume_html_details?timestamp="
    # search()/open_resume() give up and return '' after this many attempts.
    MAX_REQUEST_TRIES = 10
    # Bodies shorter than this are treated as an error / throttling payload.
    MIN_PAYLOAD_LEN = 2000
    # Upper bound on CAPTCHA solving attempts per login.
    MAX_CAPTCHA_TRIES = 20

    def __init__(self, account: dict, debug=False, visible=-1, last_try=False):
        """Create the HTTP session and the CAPTCHA-solving client.

        Args:
            account: mapping with non-empty ``user_id`` and ``password``;
                ``login()`` additionally reads ``username``.
            debug, visible, last_try: accepted for call compatibility; the
                visible code does not use them.

        Raises:
            KeyError / AssertionError: when ``user_id`` or ``password`` is
                missing or empty (unchanged from the original checks).
        """
        assert account['user_id']
        assert account['password']
        logging.info('Change webdriver to FireFox')
        # login() and _login_process_captcha() read self.account, so it has to
        # be stored here.
        self.account = account
        # Filled from the ``hrkeepToken`` cookie by login(); defaulting it
        # keeps search()/open_resume() from raising AttributeError.
        self.token = ''
        self.driver = None
        # Session used for the list and detail requests.
        self.session = requests.Session()
        self.session.headers = {
            'Host': "qiye.zhaopingou.com",
            "Origin": "http://qiye.zhaopingou.com",
            "Referer": "http://qiye.zhaopingou.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
        }
        # Requires a Dama2 account and the client module downloaded from that platform.
        self.dama2 = Dama2API()

    @staticmethod
    def _with_timestamp(endpoint):
        """Return *endpoint* with the current epoch-millisecond timestamp appended."""
        return endpoint + str(int(time.time() * 1000))

    def _type_like_human(self, xpath, text):
        """Type *text* into the input at *xpath* one character at a time with random pauses."""
        for ch in text:
            # The element is looked up again for every keystroke, as in the original code.
            self.driver.find_element_by_xpath(xpath).send_keys(ch)
            time.sleep(random.uniform(0.2, 0.8))

    def _relogin_or_exit(self):
        """Run login() again; end the process with sys.exit when it fails."""
        if not self.login():
            logging.warning("Login failed!")
            sys.exit('login failed')

    def login(self):
        """Log in through Firefox and copy the resulting cookies into the session.

        Returns:
            bool: True when the CAPTCHA was solved and the browser ended up on
            ``HOME_URL``; False otherwise.

        The browser is always closed before returning, so repeated re-logins
        do not accumulate Firefox processes.
        """
        logging.info("Processing Login...")
        driver = self.driver = webdriver.Firefox()
        try:
            driver.set_window_size(1920, 1080)
            driver.implicitly_wait(10)
            driver.get(self.HOME_URL)
            # A city chooser appears after the page opens; pick the current city.
            driver.find_element_by_xpath('//div[@class="city-now citys"]').click()
            # Fill in user name and password, imitating manual typing.
            self._type_like_human('//input[@placeholder="请输入手机号/邮箱/狗狗号"]', self.account['username'])
            self._type_like_human('//input[@placeholder="请输入密码"]', self.account['password'])
            # The button that opens the CAPTCHA lives inside the first iframe,
            # so switch into that frame before locating it.
            driver.switch_to.frame(driver.find_element_by_tag_name("iframe"))
            driver.find_element_by_xpath('//span[@class="captcha-widget-text"]').click()
            # Give the CAPTCHA time to load, then go back to the main document.
            time.sleep(5)
            driver.switch_to.default_content()
            # Clicking the button adds a second, sibling iframe holding the CAPTCHA.
            driver.switch_to.frame(driver.find_elements_by_tag_name("iframe")[1])
            captcha_solved = self._login_process_captcha('//div[@class="lc-panel"]')
            if not captcha_solved:
                logging.info('login failed due to CAPTCHA, submit_count')
                return False

            driver.switch_to.default_content()
            driver.find_element_by_id('form_login').click()
            time.sleep(3)
            current_url = driver.current_url
            if current_url != self.HOME_URL:
                logging.info('login failed, unexpected url: {}'.format(current_url))
                return False

            logging.info('login sucess!!!')
            # Copy the browser cookies into the requests session for later calls.
            cookie = {item['name']: item['value'] for item in driver.get_cookies()}
            self.token = cookie.get('hrkeepToken', self.token)
            self.session.cookies = requests.utils.cookiejar_from_dict(cookie)
            logging.info("get cookie: {}".format(cookie))
            return True
        finally:
            # The driver is not needed after login, whether it worked or not.
            driver.quit()

    def _login_process_captcha(self, captcha_xpath):
        """Solve the click CAPTCHA shown in the second iframe.

        Screenshots the CAPTCHA element, sends the image to Dama2, clicks the
        returned coordinates and checks the widget text in the first iframe.

        Args:
            captcha_xpath: XPath of the CAPTCHA panel inside the current frame.

        Returns:
            bool: True once the widget reports success, False after
            ``MAX_CAPTCHA_TRIES`` failed attempts.
        """
        driver = self.driver
        dama2 = self.dama2
        # Directory where the CAPTCHA screenshot is stored.
        shm_dir = r'/tmp/zhaopingou/'
        os.makedirs(shm_dir, exist_ok=True)
        captcha_img_path = os.path.join(
            shm_dir, 'captcha_img_{user_id}.png'.format(user_id=self.account['user_id']))
        maximum = self.MAX_CAPTCHA_TRIES
        for attempt in range(1, maximum + 1):
            logging.info(f'Trying to decode CAPTCHA: {attempt}/{maximum}')
            captcha_element = driver.find_element_by_xpath(captcha_xpath)
            captcha_element.screenshot(captcha_img_path)
            try:
                # Dama2 takes the CAPTCHA type and image file and returns click coordinates.
                _captcha_id, coordinate_list = dama2.decode_captcha(
                    captcha_type=6137, file_path=captcha_img_path)
            except Exception as err:  # third-party client; its error types are not visible here
                err_str = str(err)
                tb = traceback.format_exc()
                msg = f'Exception occurred when decode CAPTCHA, err: {err_str}, tb:\n{tb}'
                logging.warning(msg)
                continue
            logging.info(f'coordinate_list:{coordinate_list}')
            # Move the mouse to each returned coordinate and click it.
            # NOTE(review): Selenium 4 measures this offset from the element
            # centre rather than its top-left corner - confirm for the
            # Selenium version in use.
            for xy in coordinate_list:
                action = ActionChains(driver)
                action.move_to_element_with_offset(captcha_element, xy[0], xy[1]).click()
                action.perform()
                time.sleep(random.uniform(0.5, 2))
            # Back to the main document, then into the first iframe, whose
            # button text shows whether verification succeeded.
            driver.switch_to.default_content()
            driver.switch_to.frame(driver.find_elements_by_tag_name("iframe")[0])
            text = driver.find_element_by_xpath('//span[@class="captcha-widget-text"]').text
            if '验证成功' in text:
                logging.info('验证码验证成功！')
                time.sleep(random.uniform(1, 2))
                return True
            # Failed: return to the second iframe and fetch a new CAPTCHA.
            driver.switch_to.default_content()
            driver.switch_to.frame(driver.find_elements_by_tag_name("iframe")[1])
            logging.info('fail,and try it again')
            time.sleep(2)
        return False

    def search(self, keyword, page_to_go):
        """Search resumes by keyword and return one result page as a JSON string.

        Args:
            keyword: non-empty search keyword (sent as ``keyStr``).
            page_to_go: page indicator; sent in the form and echoed back in
                the result under ``current_page``.

        Returns:
            str: the response JSON re-serialised with ``current_page`` added,
            or ``''`` after ``MAX_REQUEST_TRIES`` unsuccessful attempts.

        Side effects: sets ``self.keyword`` and ``self.current_url``; may
        re-login, sleep for minutes, or call ``sys.exit`` when re-login fails.
        """
        assert keyword
        self.keyword = keyword
        # Form fields captured from the browser's POST request.
        # NOTE(review): the pageSize/pageNo values look swapped (the page
        # argument goes into pageSize); kept exactly as captured - confirm
        # against the site.
        params = {
            "pageSize": page_to_go,
            "pageNo": "25",
            "keyStr": keyword,
            "companyName": "",
            "schoolName": "",
            "keyStrPostion": "",
            "postionStr": "",
            "startDegrees": "-1",
            "endDegress": "-1",
            "startAge": "0",
            "endAge": "0",
            "gender": "-1",
            "region": "",
            "timeType": "-1",
            "startWorkYear": "-1",
            "endWorkYear": "-1",
            "beginTime": "",
            "endTime": "",
            "isMember": "-1",
            "hopeAdressStr": "",
            "cityId": "-1",
            "updateTime": "",
            "tradeId": "",
            "clientNo": "",
            "userToken": self.token,
            "clientType": "2",
        }
        for _ in range(self.MAX_REQUEST_TRIES):
            # A re-login during an earlier attempt replaces the token, so
            # refresh it instead of resending the stale one.
            params["userToken"] = self.token
            search_url = self._with_timestamp(self.SEARCH_ENDPOINT)
            logging.info('search_url:{}'.format(search_url))
            self.current_url = search_url
            logging.debug(f'Open search page. url,params,keyword,userToken: {search_url},{params},{keyword},{self.token}')
            try:
                res = self.session.post(search_url, data=params)
            except ConnectionError:
                logging.info("ConnectionError! Sleep 5 minutes and retry...")
                time.sleep(300)
                continue

            logging.info('current url is:{}'.format(res.url))
            body = res.text
            if res.url != search_url:
                # Redirected away from the endpoint: log in again and retry.
                self._relogin_or_exit()
                continue
            if not body:
                logging.info("Service is busy. Wait 5 minutes and retry...")
                time.sleep(300)
                logging.info('Continue Searching...')
                continue
            if len(body) < self.MIN_PAYLOAD_LEN:
                # Abnormally short response.
                if '请您登录后查看简历' in body:
                    # The site asks for a login: log in again, then retry.
                    self.login()
                    continue
                # Otherwise rotate the User-Agent and back off.
                self.session.headers['User-Agent'] = ua.firefox
                logging.info(f'errorcode msg:{body}')
                logging.info('Too frequent operation, please try again in a minute')
                time.sleep(random.randint(61, 100))
                continue
            try:
                resume_list = json.loads(body)
                # Record which page this result belongs to.
                resume_list["current_page"] = page_to_go
                result = json.dumps(resume_list, ensure_ascii=False)
            except (ValueError, TypeError):
                # Not JSON, or not a JSON object.
                logging.warning(body)
                logging.warning("something wrong!sleep 5 minutes and retry...")
                time.sleep(300)
                continue
            logging.info(f'search_resume_list_info:{result}')
            return result
        return ''

    def open_resume(self, url):
        """Fetch a resume's detail data and return it as a JSON string.

        Args:
            url: resume URL; the text after its first ``=`` is sent as
                ``resumeHtmlId``. Surrounding whitespace is ignored.

        Returns:
            str: the response JSON re-serialised with ``ensure_ascii=False``,
            or ``''`` after ``MAX_REQUEST_TRIES`` unsuccessful attempts.

        Raises:
            IndexError: when *url* contains no ``=``.

        Side effects: sets ``self.current_url`` on success; may re-login,
        sleep for minutes, or call ``sys.exit`` when re-login fails.
        """
        logging.debug(f'Open a resume: request_url: {url}')
        resumeHtmlId = url.strip().split("=")[1]
        # Form fields captured from the browser's detail-page request.
        open_resume_data = {
            "resumeHtmlId": resumeHtmlId,
            "keyStr": "",
            "keyPositionName": "",
            "tradeId": "",
            "postionStr": "",
            "jobId": "0",
            "companyName": "",
            "schoolName": "",
            "clientNo": "",
            "userToken": self.token,
            "clientType": "2",
        }
        for _ in range(self.MAX_REQUEST_TRIES):
            # Pick up the token of any re-login done by a previous attempt.
            open_resume_data["userToken"] = self.token
            openresumeurl = self._with_timestamp(self.RESUME_ENDPOINT)
            logging.info('resume_url:{}'.format(openresumeurl))
            try:
                res = self.session.post(url=openresumeurl, data=open_resume_data)
            except ConnectionError:
                logging.info("ConnectionError! Sleep 5 minutes and retry...")
                time.sleep(300)
                continue

            logging.info('current url is:{}'.format(res.url))
            body = res.text
            if res.url != openresumeurl:
                logging.info("cookie is invalid. Login with webdriver")
                self._relogin_or_exit()
                continue
            if not body:
                logging.info("Service is busy. Wait 5 minutes and retry...")
                time.sleep(300)
                continue
            if len(body) < self.MIN_PAYLOAD_LEN:
                logging.info(f'errorcode msg:{body}')
                logging.info('Too frequent operation, please try again in a minute')
                time.sleep(random.randint(61, 100))
                continue
            try:
                res_utf = json.dumps(json.loads(body), ensure_ascii=False)
            except (ValueError, TypeError):
                logging.warning(body)
                logging.warning("something wrong! sleep 5 minutes and retry...")
                time.sleep(300)
                continue
            page_len = len(body)
            self.current_url = openresumeurl
            logging.info(f'Downloaded a resume, len: {page_len:,d}, current_url: {url}')
            return res_utf
        return ''
if __name__ == '__main__':
    # Demo run: log in, page through search results for a few keywords, then
    # fetch one resume. The credentials are placeholders - use your own.
    rtc_zhaopingou = RTC_zhaopingou(
        account={'user_id': '-701', 'username': '13419696888', 'password': '123'},
        debug=False,
        visible=1,
        last_try=False,
    )
    rtc_zhaopingou.login()
    keyword_list = ['python', '大数据', '人工智能', 'java']
    for kw in keyword_list:
        for i in range(1, 200):
            search_result = rtc_zhaopingou.search(kw, i)
            print('****************************************************************')
    # NOTE(review): the source lost its indentation, so whether this detail
    # fetch originally ran inside the loops is unknown; it is done once here
    # to avoid requesting the same resume for every page.
    res = rtc_zhaopingou.open_resume('http://qiye.zhaopingou.com/resume/detail?resumeId=5761920')
    print(res)
'''
打碼兔平台的代碼需要自己下載，放在同級目錄後可以跑一下。