關于
一直埋頭學習,不知當前趨勢,這是學習一門技術過程中最大的忌諱。剛好利用python爬蟲,抓取一下拉勾網關于python職位的一些基本要求,不僅能知道崗位的基本技能要求,還能鍛煉一下代碼能力,學以致用,一舉兩得。
準備
工具 :python 2.7,PyCharm
類庫:urllib2、BeautifulSoup、time、re、sys、json、collections、xlsxwriter
分析及代碼實現(xiàn)
???進入拉勾網(wǎng)進行分析,要想獲取每個崗位的關鍵詞充尉,首先要知道每個崗位詳情頁面的url衣形,通過對比我們發(fā)現(xiàn),https://www.lagou.com/jobs/4289433.html中倒源,只有4289433這一串數(shù)字是不同的句狼,那么就可以知道我們只要獲取到每個崗位的這一串數(shù)字腻菇,我們就可以爬取每個崗位詳情頁面。
通過F12查看,我們可以看到xhr請求中https://www.lagou.com/jobs/positionAjax.json?px=default&city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false的響應消息里的參數positionId的值為詳情頁面url的那串數字,如下圖
首先我們通過分析可以看到這是個post請求且form的參數為first、pn、kd,通過不同崗位列表頁面的請求,我們可以看到first的取值邏輯是pn為1的時候,first為true,當pn不為1的時候,first的取值為false(其中pn為崗位列表的頁數),還有kd為一個固定值(這裡是python)
到這裡,具體的邏輯已經很清楚了,具體的代碼實現如下:
def get_positionId(pn):
    """Fetch one page of Lagou's position-list Ajax API and return the
    positionId of every job posting on that page.

    pn -- 1-based page number of the job-listing results.
    Returns a list of positionIds, or None when the request fails.
    Relies on the module-level `kd` (search keyword) and get_page().
    """
    url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Referer': 'https://www.lagou.com/jobs/list_Python?px=default&city=%E5%8C%97%E4%BA%AC'
    }
    # Lagou expects first=true only for the first results page.
    first = 'true' if pn == 1 else 'false'
    data = {
        'first': first,
        'pn': pn,
        'kd': kd  # taken from the module-level variable so other job titles work too
    }
    page = get_page(url, headers, data)
    if page is None:  # fix: identity comparison instead of `== None`
        return None
    max_pageNum = get_pageSize(page)
    result = page['content']['positionResult']['result']
    # One positionId per job posting on this page.
    return [result[num]['positionId'] for num in range(max_pageNum)]
在獲取到每個崗位的positionId後,我們就可以根據獲取到的positionId進行拼接得到每個崗位詳情頁面的url,然後爬取這些url,來獲取每個崗位的關鍵詞(這裡還有一個比較坑人的地方就是通過爬取來的網頁內容和通過定位得到的內容竟然是不一樣的,害得我糾結了好久),分析該網頁如下圖:具體的實現如下:
# Fetch a single job-detail page and return its description text.
def get_content(positionId):
    """Download the detail page for one job posting and return the text
    of its job-description block (<dd class="job_bt">)."""
    detail_url = 'https://www.lagou.com/jobs/%s.html' % positionId
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Referer': 'https://www.lagou.com/jobs/list_Python?px=default&city=%E5%8C%97%E4%BA%AC'
    }
    html = get_page(detail_url, request_headers, data=0)
    document = Bs(html, 'html.parser')
    # get_text() strips the markup, leaving only the visible description.
    return document.find('dd', class_='job_bt').get_text()
接下來就是對獲取到的崗位描述進行過濾處理,來獲取英文關鍵詞,實現如下:
# Process a job description and extract its English keywords.
def get_keyword(content):
    """Return every run of ASCII letters in *content* as a list of
    strings (technology names such as "Python", "Django", ...)."""
    return re.findall('[a-zA-Z]+', content)
然后莫湘,在通過collections中的Counter模塊獲取到這些英文關鍵詞中的top50娜膘,實現(xiàn)如下:
# Filter/deduplicate the raw keyword list and rank the top-50 keywords.
def parser_keyword(keyword_list):
    """Case-fold the keywords and return the 50 most frequent ones as
    (keyword, count) pairs, most common first.

    keyword_list -- iterable of raw keyword strings.
    The caller's list is left unmodified (the original mutated it in place).
    """
    lowered = [word.lower() for word in keyword_list]
    return Counter(lowered).most_common(50)
最后把top50的關鍵詞保存到Excel中军洼,并且生成分析圖,實現(xiàn)如下:
#數(shù)據(jù)保存到Excel中避乏,并且生成報表甘桑。
def save_excel(keyword_top):
row = 1
col = 0
workbook = xlsxwriter.Workbook('lagou.xlsx')
worksheet = workbook.add_worksheet('lagou')
worksheet.write(0, col, u'關鍵詞')
worksheet.write(0, col+1, u'頻次')
for name, num in keyword_top:
worksheet.write(row, col, name)
worksheet.write(row, col+1, num)
row += 1
chart = workbook.add_chart({'type': 'area'})
chart.add_series({
'categories': 'lagou!$A$2:$A$51',
'values': 'lagou!$B$2:$B$51'
})
chart.set_title({'name': u'關鍵詞排名'})
chart.set_x_axis({'name': u'關鍵詞'})
chart.set_y_axis({'name': u'頻次(/次)'})
worksheet.insert_chart('C2', chart, {'x_offset':15, 'y_offset':10})
workbook.close()
結果
???具體生成的分析圖如下:
如果對您有點幫助的話,麻煩您給點個贊,謝謝。
最后附上全部的代碼:
# -*-coding: utf-8 -*-
import urllib2
import urllib
import re
from bs4 import BeautifulSoup as Bs
import json
import time
import sys
from collections import Counter
import xlsxwriter
# Job keyword entered by the user (e.g. "python"); sent as the "kd" form field.
kd = raw_input('請輸入關鍵字:')
# Fetch page content: GET when data == 0, otherwise POST and decode JSON.
def get_page(url, headers, data):
    """Fetch *url* via urllib2.

    data == 0  -> plain GET; returns the raw response body (str).
    otherwise  -> POST *data* as a urlencoded form; returns the decoded
                  JSON dict when the API reports success, else None.
    Exits the program when the GET request fails.
    """
    if data == 0:
        try:
            request = urllib2.Request(url, headers=headers)
            resp = urllib2.urlopen(request)
            return resp.read()
        except urllib2.URLError as e:  # `as` form works on 2.6+ and 3.x
            # Bug fix: original called hasattr('reason') without the object.
            if hasattr(e, 'reason'):
                print("爬取失敗", e.reason)
            sys.exit(1)
    else:
        try:
            data = urllib.urlencode(data).encode('utf-8')
            request = urllib2.Request(url, data=data, headers=headers)
            resp = urllib2.urlopen(request)
            page = json.loads(resp.read())
            if page['success']:  # fix: no `== True` comparison
                return page
            # API refused the request (e.g. anti-crawl); report and give up.
            print(page['msg'])
            return None
        except urllib2.URLError as e:
            print("爬取失敗", e.reason)
# Read the per-page result count out of an Ajax response.
def get_pageSize(page):
    """Return content.pageSize — used by get_positionId as the number of
    entries in this page's result list."""
    return page['content']['pageSize']
# Retrieve the job-requirements text for one position.
def get_content(positionId):
    """Fetch https://www.lagou.com/jobs/<positionId>.html and return the
    plain text of the <dd class="job_bt"> description block."""
    job_url = 'https://www.lagou.com/jobs/%s.html' % positionId
    job_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Referer': 'https://www.lagou.com/jobs/list_Python?px=default&city=%E5%8C%97%E4%BA%AC'
    }
    markup = get_page(job_url, job_headers, data=0)
    parsed = Bs(markup, 'html.parser')
    description = parsed.find('dd', class_='job_bt')
    return description.get_text()
# Process the job description and pull out its English keywords.
def get_keyword(content):
    """Return all ASCII-letter runs found in *content* as a list."""
    letters_only = re.compile('[a-zA-Z]+')
    return letters_only.findall(content)
# Collect the positionId of every job posting on list page `pn`.
def get_positionId(pn):
    """Query Lagou's position-list Ajax API for page *pn* and return the
    positionIds of all jobs on it (list), or None when the request fails.

    Relies on the module-level `kd` search keyword and get_page().
    """
    url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Referer': 'https://www.lagou.com/jobs/list_Python?px=default&city=%E5%8C%97%E4%BA%AC'
    }
    # The site expects first=true only on the first results page.
    first = 'true' if pn == 1 else 'false'
    data = {
        'first': first,
        'pn': pn,
        'kd': kd
    }
    page = get_page(url, headers, data)
    if page is None:  # fix: identity comparison instead of `== None`
        return None
    max_pageNum = get_pageSize(page)
    result = page['content']['positionResult']['result']
    # One positionId per posting on this page.
    return [result[num]['positionId'] for num in range(max_pageNum)]
# Filter/deduplicate the keyword list and rank the top-50 keywords.
def parser_keyword(keyword_list):
    """Lower-case all keywords and return the 50 most frequent as
    (keyword, count) pairs in descending frequency order.

    keyword_list -- iterable of raw keyword strings; it is NOT mutated
    (the original lower-cased the caller's list in place).
    """
    lowered = [word.lower() for word in keyword_list]
    return Counter(lowered).most_common(50)
#數(shù)據(jù)保存到Excel中糟红,并且生成報表蚯舱。
def save_excel(keyword_top):
row = 1
col = 0
workbook = xlsxwriter.Workbook('lagou.xlsx')
worksheet = workbook.add_worksheet('lagou')
worksheet.write(0, col, u'關鍵詞')
worksheet.write(0, col+1, u'頻次')
for name, num in keyword_top:
worksheet.write(row, col, name)
worksheet.write(row, col+1, num)
row += 1
chart = workbook.add_chart({'type': 'area'})
chart.add_series({
'categories': 'lagou!$A$2:$A$51',
'values': 'lagou!$B$2:$B$51'
})
chart.set_title({'name': u'關鍵詞排名'})
chart.set_x_axis({'name': u'關鍵詞'})
chart.set_y_axis({'name': u'頻次(/次)'})
worksheet.insert_chart('C2', chart, {'x_offset':15, 'y_offset':10})
workbook.close()
#執(zhí)行程序
def run():
#獲取30頁的數(shù)據(jù)
keyword_list = []
for pn in range(1, 2):
positionId_list= get_positionId(pn)
if positionId_list == None:
break
for positionId in positionId_list:
content = get_content(positionId)
keyword = get_keyword(content)
keyword_list += keyword
time.sleep(60)
keyword_top = parser_keyword(keyword_list)
save_excel(keyword_top)
# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    run()