背景描述:根據給定excel中的日企名稱,在招聘網站上搜索所有崗位信息,並最終生成excel表格。
使用到的框架:scrapy
數據庫:mysql
難點:部分信息需要爬取二級頁面及分頁,暫未實現,待更新。
原始表格:
需要生成的列表項:
單位名稱、崗位名稱、學歷要求、工作年限要求、工作地點、崗位年薪、招聘人數、到崗時間、任職要求
爬取部分代碼實現:
import array
import time
import scrapy
import json
from qcwy.items import QcwyItem
from openpyxl import load_workbook
from openpyxl import Workbook
from urllib.request import urlopen
from urllib.request import Request
from urllib import parse
from lxml import etree
import sys
from bs4 import BeautifulSoup
from openpyxl import Workbook
import xlwt
import requests
import json
from bs4 import BeautifulSoup
from xlrd import open_workbook
import time
from xlutils.copy import copy
class QcwycrawlerSpider(scrapy.Spider):
    """Search 51job for every company name listed in an Excel sheet.

    On construction the spider reads column 1 of the configured worksheet and
    builds one search URL per non-empty cell.  ``parse`` then decodes the JSON
    payload embedded in each result page and yields one ``QcwyItem`` per job
    posting, or a single placeholder item when the search returned nothing.

    ``excel_path``, ``sheet_name`` and ``last_row`` may be overridden with
    Scrapy spider arguments (``-a excel_path=...``); the defaults reproduce
    the previously hard-coded values.
    """

    name = 'qcwyCrawler'
    # allowed_domains = ['www.xxx.com']
    # URLs in start_urls are requested automatically by Scrapy.  The class
    # attribute is kept for compatibility only; each instance builds its own
    # list in __init__ so that nothing is shared between instances.
    start_urls = []

    # Workbook holding the company names, one per row in column 1.
    excel_path = 'D:/code/xxx.xlsx'
    sheet_name = 'Sheet1'
    # Last worksheet row (1-based, inclusive) that is inspected.
    last_row = 105

    # Search URL; ``{keyword}`` is replaced by the company name.
    search_url_template = (
        'https://search.51job.com/list/000000,000000,0000,00,9,99,'
        '{keyword},1,1.html?lang=c&postchannel=0000&workyear=99&cotype=99'
        '&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0'
        '&line=&welfare='
    )

    # Number of leading characters dropped from the script text before JSON
    # decoding.  NOTE(review): presumably the length of a JavaScript
    # assignment prefix in front of the JSON object -- confirm on a live page.
    json_offset = 29

    # Placeholder values written when a field has no data.
    _NONE_TEXT = '無'
    _NO_REQUIREMENT_TEXT = '無要求'

    def __init__(self, **kwargs):
        """Read the company names from Excel and build the start URLs.

        Args:
            **kwargs: Spider arguments, forwarded to ``scrapy.Spider``.
        """
        super().__init__(**kwargs)
        # Rebind on the instance instead of appending to the class-level
        # list, which would accumulate URLs across instantiations.
        self.start_urls = []

        workbook = load_workbook(self.excel_path)
        sheet = workbook[self.sheet_name]
        # int() because spider arguments given on the command line are strings.
        for row_index in range(1, int(self.last_row) + 1):
            company = sheet.cell(row_index, 1).value
            if company is None:
                continue
            self.start_urls.append(
                self.search_url_template.format(keyword=company)
            )

    def parse(self, response):
        """Parse one search-result page using XPath and JSON.

        Args:
            response: The Scrapy response for a search URL.

        Yields:
            QcwyItem: One item per entry of ``engine_search_result``; when
            that list is empty, one placeholder item whose ``company`` is
            the page's ``searched_condition``.
        """
        script_text = response.xpath(
            '/html/body/script[2]/text()'
        ).extract_first()
        if script_text is None:
            # Without this guard the slice below raises TypeError.
            self.logger.warning(
                'No search-result script found in %s', response.url
            )
            return

        data = json.loads(script_text[self.json_offset:])
        rows = data['engine_search_result']
        if not rows:
            yield self._placeholder_item(data['searched_condition'])
            return

        for row in rows:
            # A fresh item per row: reusing one object would make every
            # yielded item alias the same, last-written data.
            yield self._item_from_row(row)

    @classmethod
    def _item_from_row(cls, row):
        """Build a ``QcwyItem`` from one ``engine_search_result`` entry."""
        item = QcwyItem()
        item['company'] = row['company_name']
        item['job_name'] = row['job_name']
        item['salary'] = row['providesalary_text']
        item['welfare'] = row['jobwelf']
        item['workarea'] = row['workarea_text']

        attributes = row['attribute_text']
        if len(attributes) > 3:
            experience, education, num = attributes[1:4]
        elif len(attributes) == 3:
            # Three entries are read as: [1] experience, [2] headcount,
            # with no education requirement listed.
            experience, num = attributes[1], attributes[2]
            education = cls._NO_REQUIREMENT_TEXT
        else:
            # Too few entries to read any of the three fields; fill them
            # explicitly rather than leaving them unset.
            experience = education = num = cls._NONE_TEXT
        item['experience'] = experience
        item['education'] = education
        item['num'] = num
        return item

    @classmethod
    def _placeholder_item(cls, company):
        """Build the item emitted when a search has no results."""
        item = QcwyItem()
        item['company'] = company
        item['job_name'] = cls._NONE_TEXT
        item['salary'] = cls._NONE_TEXT
        item['welfare'] = cls._NONE_TEXT
        item['workarea'] = cls._NONE_TEXT
        item['education'] = cls._NONE_TEXT
        item['num'] = '0'
        item['experience'] = cls._NONE_TEXT
        return item