Starting from complete-beginner level, this series walks you through the beauty of Python web scraping. Follow the WeChat official account 機(jī)智出品 (jizhjchupin) for free source code and hands-on tutorials.
1. Environment:
Windows 7 + Python 3.6
2. Results:
3. Code walkthrough:
step1: Required libraries
from bs4 import BeautifulSoup as bs   # HTML parsing
import re                             # regular expressions for matching page text
import time                           # timestamps and sleeping between pages
import requests                       # HTTP requests
import xlwt                           # writing .xls workbooks
import xlrd                           # reading .xls workbooks back
from xlutils.copy import copy         # copy an xlrd workbook into a writable xlwt one
from random import choice             # random User-Agent selection
step2: Create the Excel workbook
Here we use the third-party library xlwt to create the Excel workbook and write the header row.
def createxls(keyword):
    wb = xlwt.Workbook(encoding = 'ascii')
    time9 = time.strftime("%Y-%m-%d", time.localtime())
    ws = wb.add_sheet(time9+'智聯(lián)招聘')  # create the worksheet
    ws.write(0, 0, '職位名稱')   # job title
    ws.write(0, 1, '公司名稱')   # company name
    ws.write(0, 2, '職位月薪')   # monthly salary
    ws.write(0, 3, '工作地點')   # work location
    ws.write(0, 4, '發(fā)布日期')   # posting date
    ws.write(0, 5, '地點')       # location (from the detail line)
    ws.write(0, 6, '公司性質(zhì)')   # company type
    ws.write(0, 7, '公司規(guī)模')   # company size
    ws.write(0, 8, '學(xué)歷')       # education requirement
    ws.write(0, 9, '崗位職責(zé)')   # job description
    wb.save(keyword+'職位信息.xls')  # save the workbook
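As an aside, the ten ws.write calls could be collapsed into a loop over a column list; a minimal, equivalent sketch of the same createxls:

import time
import xlwt

def createxls(keyword):
    wb = xlwt.Workbook(encoding='ascii')
    today = time.strftime("%Y-%m-%d", time.localtime())
    ws = wb.add_sheet(today + '智聯(lián)招聘')
    # Same ten headers as above, written in one loop instead of ten calls.
    columns = ['職位名稱', '公司名稱', '職位月薪', '工作地點', '發(fā)布日期',
               '地點', '公司性質(zhì)', '公司規(guī)模', '學(xué)歷', '崗位職責(zé)']
    for col, title in enumerate(columns):
        ws.write(0, col, title)
    wb.save(keyword + '職位信息.xls')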
step3: Customize the crawler's User-Agent
To avoid being blocked, you often need to set and rotate the User-Agent when scraping different sites. Here we collect a few common User-Agent strings and use random's choice to pick one at random for each visit.
def useragent():
    USER_AGENTS = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    ]
    return choice(USER_AGENTS)  # pick one string at random for this request
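To confirm the rotation actually works, you can echo the header back from a test endpoint; a quick sketch, assuming network access to httpbin.org and the useragent() function above:

import requests

# Each call should report back the randomly chosen User-Agent we sent.
for _ in range(3):
    ua = useragent()
    r = requests.get('https://httpbin.org/user-agent',
                     headers={'User-Agent': ua}, timeout=10)
    print(r.json()['user-agent'] == ua)  # expect True each time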
step4: Build the listing-page URLs
After looking at a few Zhaopin search URLs, we find that the job keyword sits in the kw= parameter (e.g. kw=會計) and the page number in p= (e.g. p=1), which makes things easy: by varying these two parameters we can generate the URL of every results page for a given job. Zhaopin only lets you view the first 90 pages, so we crawl those 90; at 60 listings per page that is up to 5,400 records, which is plenty.
And if you want listings for the whole site, you only need to tweak the code to feed in a keyword for every occupation (see the batch sketch at the end of the article).
def geturllist(keyword):
    listurl = []
    for page in range(1, 91):  # Zhaopin only exposes the first 90 result pages
        listurl.append('http://sou.zhaopin.com/jobs/searchresult.ashx?jl=選擇地區(qū)&kw='
                       + keyword + '&isadv=0&sg=91f598e913974f4687a7bfb86b54c91d&p=' + str(page))
    return listurl
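Plain string concatenation works, but it leaves the Chinese keyword unencoded in the URL; a sketch of the same 90 URLs built with urllib.parse.urlencode, which percent-encodes every parameter (the helper name geturllist_encoded is ours, not part of the original code):

from urllib.parse import urlencode

def geturllist_encoded(keyword):
    # Same query parameters as above, but percent-encoded for safety.
    base = 'http://sou.zhaopin.com/jobs/searchresult.ashx?'
    urls = []
    for page in range(1, 91):
        params = {'jl': '選擇地區(qū)', 'kw': keyword, 'isadv': 0,
                  'sg': '91f598e913974f4687a7bfb86b54c91d', 'p': page}
        urls.append(base + urlencode(params))
    return urls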
step5: Fetch the page
def openurl(url):
    print('Opening page:\n'+str(url))
    try:
        headers = {'User-Agent': useragent()}  # rotate the User-Agent on every request
        r = requests.get(url, headers = headers, timeout = 10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as e:
        print('Error:', e)
        time3 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        content = time3+' '+str(e)
        logpath = '51joblog.txt'  # error log file
        with open(logpath, 'a') as f:
            f.write(content+'\n')
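Note that openurl returns None when the request fails. If you would rather retry a couple of times before giving up on a page, a minimal sketch wrapping the function above (the retry count and delay are arbitrary choices):

def openurl_retry(url, retries=3, delay=5):
    # Try the request a few times before treating the page as lost.
    for attempt in range(retries):
        html = openurl(url)
        if html is not None:
            return html
        print('Retry %d/%d after %d seconds' % (attempt + 1, retries, delay))
        time.sleep(delay)
    return None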
step6: Extract the data and write it to Excel
We use bs4 to pull the fields we need out of each page and append them to the Excel file. To avoid being blocked, we pause with time.sleep after every page (a 3-second countdown).
def writexls(html,k,temp,keyword):
    time3 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('Scraping page '+str(k+1)+' '+time3)
    soup = bs(html, 'lxml')
    # Each selector matches one column of the results table.
    name = soup.findAll('a',href=re.compile(r'^http://jobs.zhaopin.com/'))
    company = soup.findAll('td',{'class':'gsmc'})
    money = soup.findAll('td',{'class':'zwyx'})
    address = soup.findAll('td',{'class':'gzdd'})
    fadate = soup.findAll('td',{'class':'gxsj'})
    detail = soup.findAll('span',string=re.compile(r'地點:'))
    detail2 = soup.findAll('span',string=re.compile(r'公司性質(zhì):'))
    detail3 = soup.findAll('span',string=re.compile(r'公司規(guī)模:'))
    detail4 = soup.findAll('span',string=re.compile(r'學(xué)歷:'))
    detail0 = soup.findAll('li',{'class':'newlist_deatil_last'})
    try:
        file = keyword+'職位信息.xls'
        rb = xlrd.open_workbook(file, formatting_info=True)
        wb = copy(rb)  # xlrd workbooks are read-only; copy into a writable xlwt one
        ws = wb.get_sheet(0)
        i = 0
        j = 1 + temp  # continue below the last row written on the previous page
        while i < 100:  # a page holds 60 listings; the IndexError below ends the loop
            ws.write(j,0,name[i].get_text())
            ws.write(j,1,company[i].string)
            ws.write(j,2,money[i].string)
            ws.write(j,3,address[i].string)
            ws.write(j,4,fadate[i].get_text())
            ws.write(j,5,detail[i].string[3:])   # strip the '地點:' prefix
            ws.write(j,6,detail2[i].string[5:])  # strip the '公司性質(zhì):' prefix
            ws.write(j,7,detail3[i].string[5:])  # strip the '公司規(guī)模:' prefix
            ws.write(j,8,detail4[i].string[3:])  # strip the '學(xué)歷:' prefix
            ws.write(j,9,detail0[i].get_text())
            i = i + 1
            temp = j
            print('Wrote record '+str(j))
            j = j + 1
        wb.save(keyword+'職位信息.xls')
    except IndexError:
        # Running off the end of the result lists means the page is done.
        wb.save(keyword+'職位信息.xls')
        print(str(j-1)+' records so far')
    time3 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('File written: '+time3)
    print('Sleeping for 3 seconds')
    print('3')
    time.sleep(1)
    print('2')
    time.sleep(1)
    print('1')
    time.sleep(1)
    print('Done sleeping')
    return temp
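After a run, it is worth sanity-checking the workbook with xlrd; a small sketch (the checkxls helper is ours) that prints how many data rows made it in:

import xlrd

def checkxls(keyword):
    # Open the file the crawler saved and count the rows below the header.
    rb = xlrd.open_workbook(keyword + '職位信息.xls')
    sheet = rb.sheet_by_index(0)
    print('Rows written (excluding header):', sheet.nrows - 1)
    if sheet.nrows > 1:
        print(sheet.row_values(1))  # first data row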
step7: The main program
With all the building blocks in place, we put the main program together; here is the overall flow.
while k < 90:
We loop 90 times, one iteration per results page, matching the 90-page viewing cap.
def main():
    keyword = input('Enter a job title: ')
    time1 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    time0 = time.time()
    print('Crawler started: '+time1)
    createxls(keyword)
    listurl = geturllist(keyword)
    k = 0
    temp = 0
    while k < 90:
        url = listurl[k]
        html = openurl(url)
        temp = writexls(html,k,temp,keyword)
        k = k + 1
    time3 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('Crawl finished: '+time3)
    file = keyword+'職位信息.xls'
    print('File saved as: '+file)
    time00 = time.time()
    print('Elapsed: '+str(time00 - time0)+' seconds')

main()
input('Press Enter to exit')
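As mentioned in step4, covering more of the site is just a matter of feeding in more keywords; a minimal batch-driver sketch reusing the functions above (the keyword list is only illustrative):

def batch(keywords):
    # Run the same crawl once per keyword; each keyword gets its own .xls file.
    for keyword in keywords:
        createxls(keyword)
        temp = 0
        for k, url in enumerate(geturllist(keyword)):
            html = openurl(url)
            if html is None:   # skip pages that failed to download
                continue
            temp = writexls(html, k, temp, keyword)

batch(['會計', 'Python', '銷售'])  # illustrative keyword list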
Here is a set of Python job listings scraped earlier with this crawler:
Source code: http://pan.baidu.com/s/1eS7nz5C
Follow the WeChat official account 機(jī)智出品 and reply 19 to get the extraction password.