```python
# -*- coding:utf-8 -*-
import urllib.request
import re
import xlwt  # used to create the Excel workbook and write the data


def get_content(page):
    """Fetch the raw HTML of one 51job search-result page."""
    url = 'http://search.51job.com/list/000000,000000,0000,00,9,99,python,2,' + str(page) + '.html'
    a = urllib.request.urlopen(url)   # open the URL
    html = a.read().decode('gbk')     # read the source and decode it from gbk to unicode
    return html


def get(html):
    """Extract (job title, company, location, salary, date) tuples from the page HTML."""
    reg = re.compile(r'class="t1 ">.*? <a target="_blank" title="(.*?)".*? <span class="t2"><a target="_blank" title="(.*?)".*?<span class="t3">(.*?)</span>.*?<span class="t4">(.*?)</span>.*? <span class="t5">(.*?)</span>', re.S)  # re.S lets . match newlines
    items = re.findall(reg, html)
    return items


def excel_write(items, index):
    """Write the scraped items into the Excel sheet, starting at row `index`."""
    for item in items:  # one job posting per item
        for i in range(0, 5):
            # print(item[i])
            ws.write(index, i, item[i])  # row, column, data
        print(index)
        index += 1


newTable = "test.xls"                 # output file name
wb = xlwt.Workbook(encoding='utf-8')  # create the Excel file and declare its encoding
ws = wb.add_sheet('sheet1')           # create the sheet
headData = ['招聘職位', '公司', '地址', '薪資', '日期']  # header row: job title, company, location, salary, date
for colnum in range(0, 5):
    ws.write(0, colnum, headData[colnum], xlwt.easyxf('font: bold on'))  # row, column
for each in range(1, 10):             # pages 1 through 9
    index = (each - 1) * 50 + 1       # each result page lists 50 jobs, so offset the row index by 50 per page
    excel_write(get(get_content(each)), index)
wb.save(newTable)
```
The regular expression used to extract the job information from 51job (前程无忧) comes from http://www.cnblogs.com/Beyond-Ricky/p/6771028.html.
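As a quick sanity check of that pattern, here is a minimal sketch of how it pulls out the five fields. The HTML fragment below is hypothetical, written only to be consistent with the regex itself, not copied from a real 51job page.

```python
import re

# Hypothetical fragment shaped to match the pattern (not real 51job markup).
sample = '''
<p class="t1 ">
 <a target="_blank" title="Python工程师" href="#">Python工程师</a></p>
 <span class="t2"><a target="_blank" title="某科技有限公司" href="#">某科技有限公司</a></span>
<span class="t3">上海</span><span class="t4">1-1.5万/月</span> <span class="t5">05-20</span>
'''

reg = re.compile(r'class="t1 ">.*? <a target="_blank" title="(.*?)".*? <span class="t2">'
                 r'<a target="_blank" title="(.*?)".*?<span class="t3">(.*?)</span>.*?'
                 r'<span class="t4">(.*?)</span>.*? <span class="t5">(.*?)</span>', re.S)
print(re.findall(reg, sample))
# [('Python工程师', '某科技有限公司', '上海', '1-1.5万/月', '05-20')]
```

Each capture group corresponds to one column written into the spreadsheet, which is why `excel_write` can simply loop over `item[0]` through `item[4]`.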
The rough outline of the crawler project is now settled: scrape the job requirements for data-analysis and risk-control positions from Shixiseng (实习僧), analyze the word frequencies, and generate a word cloud that reflects the counts. In addition, pick NetEase Cloud (网易云) / Zhihu / Weibo to practice simulated login and dynamic page operations.
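For the word-frequency and word-cloud step, a minimal sketch could look like the following. It assumes the `jieba` and `wordcloud` packages are installed; the input file `job_requirements.txt`, the font path, and the output file name are placeholders, not part of the original project.

```python
# Sketch: segment scraped job-requirement text, count word frequencies,
# and render a word cloud. Assumes `pip install jieba wordcloud`;
# file and font paths below are hypothetical placeholders.
from collections import Counter

import jieba
from wordcloud import WordCloud

with open('job_requirements.txt', encoding='utf-8') as f:  # hypothetical dump of scraped text
    text = f.read()

words = [w for w in jieba.lcut(text) if len(w) > 1]  # drop single-character tokens and punctuation
freq = Counter(words)
print(freq.most_common(20))                          # top 20 words with their counts

wc = WordCloud(font_path='simhei.ttf',               # a CJK-capable font is required for Chinese text
               width=800, height=600,
               background_color='white')
wc.generate_from_frequencies(freq)
wc.to_file('wordcloud.png')
```

Printing `most_common` first is a cheap way to check that the segmentation is reasonable before spending time rendering the image.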