This crawler mainly uses the requests, json, and bs4 (BeautifulSoup) modules. It is far from perfect, so suggestions and corrections are very welcome! :)
Source: https://github.com/jingsupo/python-spider/blob/master/day04/04tencent_hr.py
# -*- coding:utf-8 -*-
import requests, json, time
from bs4 import BeautifulSoup


class tencent_hr(object):
    def __init__(self):
        self.base_url = "http://hr.tencent.com/position.php?"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        self.item_list = []
        self.page = 0

    # Send the request
    def send_request(self, url, params={}):
        time.sleep(2)  # be polite: pause between requests
        try:
            response = requests.get(url, params=params, headers=self.headers)
            return response.content
        except Exception as e:
            print(e)
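    # For reference (standard requests behaviour, not specific to this site):
    # the params dict is URL-encoded into the query string, so
    #   requests.get("http://hr.tencent.com/position.php",
    #                params={"keywords": "python", "start": 0})
    # fetches http://hr.tencent.com/position.php?keywords=python&start=0
    # (response.url shows the final URL that was requested).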
    # Parse the response
    def parse_data(self, data):
        # Initialize the parser
        bs = BeautifulSoup(data, 'lxml')

        # Grab the job rows (they alternate between the .even and .odd classes) - returns a list
        data_list = bs.select('.even, .odd')

        # Pull the fields out of each row
        for data in data_list:
            data_dict = {}
            data_dict['work_name'] = data.select('td a')[0].get_text()
            data_dict['work_type'] = data.select('td')[1].get_text()
            data_dict['work_count'] = data.select('td')[2].get_text()
            data_dict['work_place'] = data.select('td')[3].get_text()
            data_dict['work_time'] = data.select('td')[4].get_text()

            # Append each row's dict to the result list
            self.item_list.append(data_dict)

        # Decide whether this is the last page; criterion: whether the next link carries the noactive class
        # First locate the next-page tag
        next_label = bs.select('#next')
        # Read its class attribute - returns a list of class names (or None if absent)
        judge = next_label[0].get('class')

        return judge
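    # Note on the last-page check (illustrative markup, assuming the page
    # follows the pattern this scraper relies on): on intermediate pages the
    # next link has no class attribute,
    #   <a href="position.php?..." id="next">...</a>
    # so .get('class') returns None (falsy), while on the last page it becomes
    #   <a href="javascript:;" class="noactive" id="next">...</a>
    # and .get('class') returns ['noactive'], which is truthy and ends the loop.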
    # Write the results to a file
    def write_file(self):
        # Serialize the list to a JSON string; ensure_ascii=False keeps Chinese text readable
        data_str = json.dumps(self.item_list, ensure_ascii=False)

        with open('04tencent_hr.json', 'w', encoding='utf-8') as f:
            f.write(data_str)
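    # Why ensure_ascii=False: with the default setting, json.dumps escapes
    # non-ASCII text, e.g. json.dumps({"work_place": "深圳"}) yields
    # '{"work_place": "\u6df1\u5733"}', whereas ensure_ascii=False keeps the
    # readable characters (which is why the file is opened with utf-8).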
    # Drive the crawl
    def run(self):
        while True:
            # Build the query parameters; start is a row offset, 10 rows per page
            params = {
                "keywords": "python",
                "tid": "0",
                "lid": "2156",
                "start": self.page,
            }

            # Send the request
            data = self.send_request(self.base_url, params=params)
            if data is None:  # the request failed; stop rather than crash in parse_data
                break

            # Parse the response
            judge = self.parse_data(data)

            self.page += 10
            print(self.page)

            # On the last page the next link gains the noactive class, so break out of the loop
            if judge:
                break

        self.write_file()

if __name__ == '__main__':
    spider = tencent_hr()
    spider.run()
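To sanity-check a finished run, the dump can be loaded straight back with the json module. A minimal sketch, assuming the crawl completed and 04tencent_hr.json was written to the working directory:

# -*- coding:utf-8 -*-
import json

with open('04tencent_hr.json', encoding='utf-8') as f:
    items = json.load(f)

print(len(items))  # total number of positions collected
if items:
    print(items[0]['work_name'])  # title of the first position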