# Web scraping with BeautifulSoup
# Scrape job postings using requests + BeautifulSoup
import requests
from bs4 import BeautifulSoup
import json
# Scrape Tencent HR job listings page by page and save them as JSON.
#
# The script prompts for a search keyword and an inclusive page range,
# requests each result page, extracts one record per listing row, and finally
# writes every collected record to ./tencent.json.

# Sent with every request so the site sees a desktop-browser User-Agent.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.3239.132 Safari/537.36'
    ),
}
base_url = 'https://hr.tencent.com/position.php'

keywords = input('輸入職位:')
begin_page = int(input('起始頁:'))
end_page = int(input('結束頁:'))

# One dict per job posting, accumulated across all requested pages.
job_list = []

for page in range(begin_page, end_page + 1):
    params = {
        'keywords': keywords,
        # The script advances the 'start' offset by 10 per page.
        'start': (page - 1) * 10,
    }
    print('%s爬取中...' % page)

    # A timeout keeps an unresponsive server from hanging the script forever.
    response1 = requests.get(
        url=base_url, params=params, headers=headers, timeout=10
    )
    content = response1.content

    # Debug aid: uncomment to keep a copy of each raw page on disk.
    # with open('./tencent-%s.html' % page, 'wb') as file:
    #     file.write(content)

    content = content.decode('utf-8')

    # Data extraction: listing rows are <tr> elements whose class is
    # "even" or "odd".
    bs = BeautifulSoup(content, 'lxml')
    # Equivalent CSS selector form:
    # tr_list = bs.select('tr[class="odd"],tr[class="even"]')
    tr_list = bs.find_all(name='tr', attrs={'class': ['even', 'odd']})

    for tr in tr_list:
        link = tr.a
        cells = tr.find_all('td')  # look the cells up once per row
        # Skip rows that lack the link or the five cells read below, instead
        # of crashing with AttributeError / IndexError.
        if link is None or len(cells) < 5:
            continue
        job_list.append({
            'job_name': link.text.strip(),
            'job_href': link['href'],
            'job_type': cells[1].text.strip(),
            'job_person': cells[2].text.strip(),
            'job_address': cells[3].text.strip(),
            'job_time': cells[4].text.strip(),
        })

# Serialize to JSON. ensure_ascii defaults to True, which escapes non-ASCII
# characters; pass ensure_ascii=False so Chinese text is written as-is.
# json.dumps(job_list, ensure_ascii=False) followed by a manual write of the
# resulting string is an equivalent alternative to json.dump.
# The with-block guarantees the file is flushed and closed.
with open('./tencent.json', 'w', encoding='utf-8') as json_file:
    json.dump(job_list, json_file, ensure_ascii=False)