本章將從Python案例講起：使用bs4做一個簡單的爬蟲案例，更多內容請參考：Python學習指南
案例:使用BeautifulSoup的爬蟲
我們以騰訊社招頁面來做演示：http://hr.tencent.com/position.php?&start=10#a
騰訊校招
使用BeautifulSoup4解析器，將招聘網頁上的職位名稱、職位類別、招聘人數、工作地點、時間，以及每個職位詳情的點擊鏈接存儲出來。
#-*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
import urllib
import json #使用json格式存儲
def tencent():
    """Scrape one page of Tencent job listings and save it as JSON.

    Fetches ``position.php?&start=10#a`` under ``http://hr.tencent.com/``,
    parses the HTML with BeautifulSoup (``lxml`` parser) and builds one dict
    per ``<tr class="even">`` / ``<tr class="odd">`` row (all "even" rows
    first, then all "odd" rows).  Each dict has the keys ``name``,
    ``datailLink``, ``catalog``, ``recruitNumber``, ``workLocation`` and
    ``publishTime``.  The resulting list is written to ``tencent.json`` in
    the current working directory as UTF-8 JSON with non-ASCII characters
    left unescaped.

    Rows that have no link or fewer than five ``<td>`` cells are skipped.

    Returns:
        None.  The output is the file written to disk (the selected rows
        are also printed to stdout).

    Raises:
        urllib2.URLError: propagated from ``urllib2.urlopen`` when the page
            cannot be fetched.
    """
    url = "http://hr.tencent.com/"
    request = urllib2.Request(url + "position.php?&start=10#a")
    response = urllib2.urlopen(request)
    try:
        resHtml = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()

    html = BeautifulSoup(resHtml, 'lxml')
    # CSS selectors: rows carry either the "even" or the "odd" class.
    result = html.select('tr[class="even"]')
    result += html.select('tr[class="odd"]')
    print(result)

    items = []
    for site in result:
        # Query each row once instead of re-running the selector per field.
        links = site.select('td a')
        cells = site.select('td')
        if not links or len(cells) < 5:
            # Incomplete row: skip it rather than fail with IndexError.
            continue
        items.append({
            'name': links[0].get_text(),
            # NOTE: the key spelling 'datailLink' (sic) is kept unchanged so
            # that existing readers of tencent.json keep working.
            'datailLink': url + links[0].attrs['href'],
            'catalog': cells[1].get_text(),
            'recruitNumber': cells[2].get_text(),
            # Previously extracted but never stored; now part of the record.
            'workLocation': cells[3].get_text(),
            'publishTime': cells[4].get_text(),
        })

    # Keep non-ASCII characters as-is (no \uXXXX escapes) and encode once,
    # at the I/O boundary, as UTF-8.
    line = json.dumps(items, ensure_ascii=False)
    # Open the file only once the data is ready, so a failed fetch/parse does
    # not leave a truncated file behind; 'wb' because encoded bytes are
    # written, and ``with`` guarantees the handle is closed.
    with open('tencent.json', 'wb') as output:
        output.write(line.encode('utf-8'))
# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    tencent()