The previous post, python crawler: Lagou job listings, mentioned redoing the task with the Scrapy framework. That version is now basically done, and you can add more positions and cities yourself.
The approach is the same as the requests + BeautifulSoup version in that post; the difference is that last time the results were written to a CSV file, whereas this time they are stored in a MySQL database. Without further ado, here is the code.
1. items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class LagouzpItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    city = scrapy.Field()
    company_name = scrapy.Field()
    size = scrapy.Field()
    edu = scrapy.Field()
    financeStage = scrapy.Field()
    firstType = scrapy.Field()
    industryField = scrapy.Field()
    name = scrapy.Field()
    salary = scrapy.Field()
    secondType = scrapy.Field()
    workYear = scrapy.Field()
    time = scrapy.Field()
2. pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import MySQLdb
import csv
class LagouzpPipeline(object):
    # write to a CSV file
    # def __init__(self):
    #     with open("data.csv", "ab+") as self.files:
    #         self.write = csv.writer(self.files)
    #         self.write.writerow(
    #             ['职位名称', '公司名称', '城市', '公司规模', '公司类型', '月薪', '行业领域', 'firstType', 'secondType', '工作经历', '学历', '发布时间'])
    #
    # def process_item(self, item, spider):
    #     with open("data.csv", "ab+") as self.files:
    #         self.write = csv.writer(self.files)
    #         self.line = [item['name'], item['city'], item['company_name'], item['size'], item['financeStage'], item['salary'], item['industryField'], item['firstType'], item['secondType'], item['workYear'], item['edu'], item['time']]
    #         self.write.writerow(self.line)
    #     return item

    # write to MySQL
    def __init__(self):
        self.conn = MySQLdb.connect(user='root', passwd='123456', db='lagou', host='localhost', charset='utf8',
                                    use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        self.cursor.execute(
            "insert into jobinfo(name,city,company,size,type,salary,field,firsttype,secondtype,workyear,edu,time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            (item['name'], item['city'], item['company_name'], item['size'], item['financeStage'], item['salary'], item['industryField'], item['firstType'], item['secondType'], item['workYear'], item['edu'], item['time'],))
        self.conn.commit()
        return item
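The insert above assumes a jobinfo table already exists in the lagou database. The original post does not show the schema, so here is a minimal sketch matching the twelve columns used by the pipeline; all column types and lengths are assumptions.
# -*- coding: utf-8 -*-
# One-off helper: create the jobinfo table the pipeline writes to.
# Column types and lengths below are assumptions, adjust as needed.
import MySQLdb

conn = MySQLdb.connect(user='root', passwd='123456', db='lagou', host='localhost', charset='utf8')
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS jobinfo (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(100),
        city VARCHAR(50),
        company VARCHAR(150),
        size VARCHAR(50),
        type VARCHAR(50),
        salary VARCHAR(50),
        field VARCHAR(100),
        firsttype VARCHAR(100),
        secondtype VARCHAR(100),
        workyear VARCHAR(50),
        edu VARCHAR(50),
        time VARCHAR(50)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()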
Both approaches, writing to a file and writing to the database, are shown here for reference.
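As the generated comment above notes, the pipeline only runs if it is enabled in settings.py. A minimal sketch, assuming the Scrapy project module is named lagouzp (swap in your actual project name):
# settings.py, the module name "lagouzp" is an assumption
ITEM_PIPELINES = {
    'lagouzp.pipelines.LagouzpPipeline': 300,
}
# Lagou tends to block rapid-fire requests, so a small delay is usually worth adding
DOWNLOAD_DELAY = 1
ROBOTSTXT_OBEY = False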
3. lg_spider.py
# -*- coding:utf-8 -*-
import scrapy
from ..items import LagouzpItem
import requests
from bs4 import BeautifulSoup
import json
class Spider(scrapy.Spider):
    name = 'lagou'
    cookies = {
        'user_trace_token': '20170314211704-f55f18938db84cfeae95d1efec6d585e',
        'LGUID': '20170314211706-859943f0-08b8-11e7-93e0-5254005c3644',
        'JSESSIONID': 'AA1DE67564F4C20F86F89F3572B706A1',
        'PRE_UTM': '',
        'PRE_HOST': 'www.baidu.com',
        'PRE_SITE': 'https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DuQkzN6ld65B8UHLJeaN2RVwWb3jiAl6AkSQSZRkXpRC%26wd%3D%26eqid%3Df6aa96cc0000dd5e0000000258ff3f34',
        'PRE_LAND': 'https%3A%2F%2Fwww.lagou.com%2F',
        'index_location_city': '%E5%85%A8%E5%9B%BD',
        'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1491116405,1491116452,1493122880,1493122898',
        'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1493123186',
        '_ga': 'GA1.2.1412866745.1489497427',
        'LGSID': '20170425202132-b7ea71dc-29b1-11e7-bc70-525400f775ce',
        'LGRID': '20170425202620-6394f6bd-29b2-11e7-bc72-525400f775ce',
        'TG-TRACK-CODE': 'search_code',
        'SEARCH_ID': '63e7755cfbbf40559a5dac6a35e5f49f'
    }
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"}

    def start_requests(self):
        kd = ['python工程师', 'python数据分析']
        city = ['北京', '上海', '深圳', '广州', '杭州', '成都', '南京', '武汉', '西安', '厦门', '长沙', '苏州', '天津']
        urls_kd = ['https://www.lagou.com/jobs/list_{}?px=default&city='.format(one) for one in kd]
        for urls in urls_kd:
            urls_city = [urls + one for one in city]
            for url in urls_city:
                # fetch the HTML list page first to find out how many result pages there are
                response = requests.get(url, headers=self.headers, cookies=self.cookies)
                location = url.split('&')[-1].split('=')[1]
                key = url.split('/')[-1].split('?')[0].split('_')[1]
                soup = BeautifulSoup(response.text, 'lxml')
                pages = soup.find('span', {'class': 'span totalNum'}).get_text()
                for i in range(1, int(pages) + 1):
                    # the job data itself comes from the positionAjax.json endpoint, posted page by page
                    url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city={}&needAddtionalResult=false'.format(location)
                    formdata = {
                        'first': 'true',
                        'pn': str(i),
                        'kd': key
                    }
                    print u'Fetching jobs for keyword {}, city {}, page {}'.format(key, location, i)
                    yield scrapy.FormRequest(url, formdata=formdata, cookies=self.cookies, callback=self.parse)
    def parse(self, response):
        data = json.loads(response.text)
        content = data['content']
        positionResult = content['positionResult']
        for one in positionResult['result']:
            # create a fresh item for every position instead of reusing one instance
            item = LagouzpItem()
            try:
                item['city'] = one['city']
            except:
                item['city'] = u''
            try:
                item['company_name'] = one['companyFullName']
            except:
                item['company_name'] = u''
            try:
                item['size'] = one['companySize']
            except:
                item['size'] = u''
            try:
                item['edu'] = one['education']
            except:
                item['edu'] = u''
            try:
                item['financeStage'] = one['financeStage']
            except:
                item['financeStage'] = u''
            try:
                item['firstType'] = one['firstType']
            except:
                item['firstType'] = u''
            try:
                item['industryField'] = one['industryField']
            except:
                item['industryField'] = u''
            try:
                item['name'] = one['positionName']
            except:
                item['name'] = u''
            try:
                item['salary'] = one['salary']
            except:
                item['salary'] = u''
            try:
                item['secondType'] = one['secondType']
            except:
                item['secondType'] = u''
            try:
                item['workYear'] = one['workYear']
            except:
                item['workYear'] = u''
            try:
                item['time'] = one['createTime'].split(' ')[0]
            except:
                item['time'] = u''
            yield item
scrapy.FormRequest() is used here to submit the form data directly. Out of caution every field was wrapped in try...except; in hindsight that was probably unnecessary.
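If you do drop the try...except blocks, a shorter equivalent is to read each JSON key with dict.get() and an empty-string default. This is only a sketch of that alternative, not the code the results below were produced with:
    def parse(self, response):
        data = json.loads(response.text)
        for one in data['content']['positionResult']['result']:
            item = LagouzpItem()
            item['city'] = one.get('city', u'')
            item['company_name'] = one.get('companyFullName', u'')
            item['size'] = one.get('companySize', u'')
            item['edu'] = one.get('education', u'')
            item['financeStage'] = one.get('financeStage', u'')
            item['firstType'] = one.get('firstType', u'')
            item['industryField'] = one.get('industryField', u'')
            item['name'] = one.get('positionName', u'')
            item['salary'] = one.get('salary', u'')
            item['secondType'] = one.get('secondType', u'')
            item['workYear'] = one.get('workYear', u'')
            # "or u''" also covers the case where createTime is present but null
            item['time'] = (one.get('createTime') or u'').split(' ')[0]
            yield item
Either way, run the spider from the project root with scrapy crawl lagou (the name defined on the class).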
4. Results
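A quick way to check what landed in MySQL, a small sketch reusing the pipeline's connection parameters:
# -*- coding: utf-8 -*-
# Sanity check: count the rows scraped into the jobinfo table.
import MySQLdb

conn = MySQLdb.connect(user='root', passwd='123456', db='lagou', host='localhost', charset='utf8')
cursor = conn.cursor()
cursor.execute("select count(*) from jobinfo")
print cursor.fetchone()[0]
conn.close()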
Summary
Which approach you use to implement this doesn't matter much; what matters is that the idea is clear and correct. One more thing to dig into later: wrap the functions from the earlier post, python crawler: Lagou job listings, in a class to make the code more structured, and try adding multithreading.