Web Scraping Learning Notes
Fetching job listings from 58.com (58同城)
Why write a crawler
We write crawlers to pull the key information out of web pages so that we can analyze it. We are in the age of data, and data is a valuable resource; crawlers help us obtain that resource.
Goal of this article
There are many crawling tools available today, most of them Python-based. As a beginner, I suggest not leaning too heavily on ready-made frameworks, because you learn little about what happens underneath; with Scrapy, for example, it is hard to see what it is calling internally. This article therefore uses urllib2 + BeautifulSoup + MySQL to fetch job listings from 58同城. The most important step is analyzing the page source to locate the information we need.
Fetching the page source
url = "http://hz.58.com/tech/" + "pn"+str(start)+"/"
request = urllib2.Request(url=url,headers=headers)
response = urllib2.urlopen(request,timeout=60)
html = response.read().decode('utf-8')
soup = BeautifulSoup(html,'lxml')
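This snippet assumes that headers (a browser User-Agent) and start (the listing page number) are already defined; they are set near the top of the full script at the end of this article, roughly like this:

# coding:utf8
import urllib2
from bs4 import BeautifulSoup

# a browser User-Agent so 58 serves the normal desktop page
headers = {}
headers["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"
start = 1   # first listing page to fetch (the full script resumes from 33)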
Parsing the 58 listing page
for item in all_dl:
    # each <dl> in the listing is one job posting; follow its link to the detail page
    job = item.find('dt').find('a')
    info = getdatas.getInfo(job['href'])
    if info != 0:
        count += insertmysql.insertMysql(info)
        print "records stored so far: %d" % count
        time.sleep(5)
start = start + 1
Each item in this loop is one job posting; we then follow its link, the second-level URL, to fetch the detailed information.
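For reference, the all_dl that the loop iterates over is the list of job rows in the listing container; in the full script at the end it is built like this, inside the while loop over pages:

# every job row on the listing page is a <dl> inside the div with id="infolist"
all_dl = soup.find('div', id='infolist').findAll('dl')
if len(all_dl) == 0:
    break   # an empty listing means we have walked past the last page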
The detail page (second-level URL)
In this part we again fetch the page source first and then use BeautifulSoup to match the key fields; BeautifulSoup's usage is documented on its official site.
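As a quick illustration of the calls used below (find by tag and class, then get_text), here is a minimal, self-contained example on a made-up HTML fragment, not 58's real markup:

from bs4 import BeautifulSoup

# a made-up fragment, only to show find()/get_text()
fragment = '<div class="pos_base_info"><span class="pos_salary">8000-12000</span></div>'
doc = BeautifulSoup(fragment, 'lxml')
print doc.find('span', class_='pos_salary').get_text()   # -> 8000-12000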
def getInfo(url):
    headers = {}
    headers["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"
    try:
        # fetch the detail page (proxy support is left commented out)
        # proxies = {'http': proxy_ip}
        request = urllib2.Request(url=url, headers=headers)
        # request.set_proxy(proxy_ip, 'http')
        response = urllib2.urlopen(request)
        html = response.read().decode('utf-8')
        # html = requests.get(url, headers=headers, proxies=proxies)
        html = BeautifulSoup(html, 'lxml')
        info = {}
        # a UUID string serves as the primary key
        info['id'] = str(uuid.uuid4())
        # job title
        info['title'] = html.find('div', class_='item_con pos_info').find('span', class_='pos_name').get_text()
        # salary range, split into min and max
        temp = html.find('div', class_='pos_base_info').find('span', class_='pos_salary').get_text()
        info['salary_min'] = int(re.findall(r"(\d+)\-", temp)[0])
        info['salary_max'] = int(re.findall(r"\-(\d+)", temp)[0])
        # company name and company scale (number of employees)
        info['company'] = html.find('div', class_='item_con company_baseInfo').find('p', class_='comp_baseInfo_title').find('a', class_='baseInfo_link').get_text()
        temp = html.find('div', class_='item_con company_baseInfo').find('p', class_='comp_baseInfo_scale').get_text()
        info['scale_min'] = int(re.findall(r"(\d+)\-", temp)[0])
        info['scale_max'] = int(re.findall(r"\-(\d+)", temp)[0])
        # work address
        info['address'] = html.find('div', class_='item_con work_adress').find('p', class_='detail_adress').get_text()
        return info
    except Exception, e:
        # pages that do not follow this layout end up here; signal failure with 0
        return 0
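The two regular expressions split a range string such as "8000-12000" into its minimum and maximum. A quick check (the exact salary text on 58 is an assumption here; a trailing unit suffix would simply be ignored by the regex):

import re

temp = "8000-12000"   # assumed shape of the pos_salary text
salary_min = int(re.findall(r"(\d+)\-", temp)[0])   # -> 8000
salary_max = int(re.findall(r"\-(\d+)", temp)[0])   # -> 12000
print salary_min, salary_max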
I use a UUID as the primary key and scrape the main fields of each posting: salary, company scale, company address, and so on. Note that some job pages on 58 do not follow this standard layout, so if you want more complete data you will need to handle those page variants separately.
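One way to handle those variants (a sketch of my own, not part of the original script) is to wrap every lookup so that a missing element yields None instead of raising, and then decide per field how to fall back:

def safe_text(soup, tag, cls):
    # return the element's text, or None if this page does not have it
    node = soup.find(tag, class_=cls)
    return node.get_text().strip() if node is not None else None

# example: fall back gracefully when a posting has no salary block
# salary = safe_text(html, 'span', 'pos_salary') or 'unknown'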
Storing the data in MySQL
The database used here is MySQL; connecting to it from Python is straightforward:
# connect to the local MySQL instance; the database is named 58city
db = MySQLdb.connect(host='localhost', user='root', passwd='123', db='58city', port=3306, charset='utf8')
cursor = db.cursor()
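The insert below assumes a jobs table roughly like the following; the exact column types are my assumption, since the original post does not show the schema. It can be created once, right after connecting:

cursor.execute('''
    CREATE TABLE IF NOT EXISTS jobs (
        id         VARCHAR(36) PRIMARY KEY,  -- UUID string
        title      VARCHAR(255),
        salary_min INT,
        salary_max INT,
        company    VARCHAR(255),
        scale_min  INT,
        scale_max  INT,
        address    VARCHAR(255)
    ) DEFAULT CHARSET=utf8
''')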
Then write the fields into MySQL:
cursor.execute(
    'insert into jobs(id,title,salary_min,salary_max,company,scale_min,scale_max,address) values(%s,%s,%s,%s,%s,%s,%s,%s)',
    (id, title, salary_min, salary_max, company, scale_min, scale_max, address))
db.commit()
# close the cursor before the connection
cursor.close()
db.close()
我們?cè)趯懘a的時(shí)候會(huì)肯定會(huì)有bug擦秽,所以使用try catch 的方法最好。
except Exception, e:
    print "database error: %s" % e
    return 0
Anti-scraping measures
We can route requests through proxy IPs so that our own address does not get banned, and add sleep intervals so that we do not crawl too fast and get noticed by the site.
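The full script below keeps the proxy lines commented out. As a sketch of how they would be enabled (getip is the helper module imported by the script; its source is not shown here, and the 'host:port' format of its return value is my assumption):

import random
import time
import urllib2
import getip   # the author's helper module (source not shown in this post)

headers = {"User-Agent": "Mozilla/5.0"}
url = "http://hz.58.com/tech/pn1/"

proxy_list = getip.get_ips()            # assumed to return strings like '1.2.3.4:8080'
proxy_ip = random.choice(proxy_list)
request = urllib2.Request(url=url, headers=headers)
request.set_proxy(proxy_ip, 'http')     # route this request through the chosen proxy
response = urllib2.urlopen(request, timeout=60)

# sleep a random few seconds between requests so the crawl is less conspicuous
time.sleep(random.uniform(3, 8))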
Here is the full source code.
# coding:utf8
# main.py -- walk the tech listing pages on hz.58.com and store every posting
import random
import urllib2
import time
from bs4 import BeautifulSoup
import getdatas
import insertmysql
import requests
import getip

ISOTIMEFORMAT = '%Y-%m-%d %X'
headers = {}
headers["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"

# start crawling
print "********** START **********"
print time.strftime(ISOTIMEFORMAT, time.localtime())
try:
    start = 33      # listing page to resume from
    count = 0       # number of postings stored so far
    # proxy_list = getip.get_ips()
    while True:
        try:
            # proxy_ip = random.choice(proxy_list)
            # proxies = {'http': proxy_ip}
            url = "http://hz.58.com/tech/" + "pn" + str(start) + "/"
            request = urllib2.Request(url=url, headers=headers)
            # request.set_proxy(proxy_ip, 'http')
            response = urllib2.urlopen(request, timeout=60)
            html = response.read().decode('utf-8')
            # html = requests.get(url, headers=headers, proxies=proxies)
            soup = BeautifulSoup(html, 'lxml')
            all_dl = soup.find('div', id='infolist').findAll('dl')
            if len(all_dl) == 0:
                break       # ran past the last listing page
            for item in all_dl:
                job = item.find('dt').find('a')
                info = getdatas.getInfo(job['href'])
                if info != 0:
                    count += insertmysql.insertMysql(info)
                    print "records stored so far: %d" % count
                    time.sleep(5)
            start = start + 1
            print start
            time.sleep(5)
        except Exception, e:
            print "error while crawling this page: %s" % e
except Exception, e:
    print "fatal error: %s" % e
# coding:utf8
# getdatas.py -- fetch one job detail page and extract the fields we want
import urllib2
import urllib
import json
import time
import re
import random
import uuid
import requests
from bs4 import BeautifulSoup


def getInfo(url):
    headers = {}
    headers["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"
    try:
        # proxies = {'http': proxy_ip}
        request = urllib2.Request(url=url, headers=headers)
        # request.set_proxy(proxy_ip, 'http')
        response = urllib2.urlopen(request)
        html = response.read().decode('utf-8')
        # html = requests.get(url, headers=headers, proxies=proxies)
        html = BeautifulSoup(html, 'lxml')
        info = {}
        info['id'] = str(uuid.uuid4())
        info['title'] = html.find('div', class_='item_con pos_info').find('span', class_='pos_name').get_text()
        temp = html.find('div', class_='pos_base_info').find('span', class_='pos_salary').get_text()
        info['salary_min'] = int(re.findall(r"(\d+)\-", temp)[0])
        info['salary_max'] = int(re.findall(r"\-(\d+)", temp)[0])
        info['company'] = html.find('div', class_='item_con company_baseInfo').find('p', class_='comp_baseInfo_title').find('a', class_='baseInfo_link').get_text()
        temp = html.find('div', class_='item_con company_baseInfo').find('p', class_='comp_baseInfo_scale').get_text()
        info['scale_min'] = int(re.findall(r"(\d+)\-", temp)[0])
        info['scale_max'] = int(re.findall(r"\-(\d+)", temp)[0])
        info['address'] = html.find('div', class_='item_con work_adress').find('p', class_='detail_adress').get_text()
        return info
    except Exception, e:
        # any page that does not match this layout is skipped
        return 0
# -*- coding:utf-8 -*-
# insertmysql.py -- write one posting into the jobs table
import MySQLdb
import MySQLdb.cursors
import getCity


def insertMysql(info):
    if info == None:
        print "there is no information"
        return 0
    else:
        try:
            db = MySQLdb.connect(host='localhost', user='root', passwd='123', db='58city', port=3306, charset='utf8')
            cursor = db.cursor()
            id = info['id']
            title = info['title']
            salary_min = info['salary_min']
            salary_max = info['salary_max']
            company = info['company']
            scale_min = info['scale_min']
            scale_max = info['scale_max']
            address = info['address']
            cursor.execute(
                'insert into jobs(id,title,salary_min,salary_max,company,scale_min,scale_max,address) values(%s,%s,%s,%s,%s,%s,%s,%s)',
                (id, title, salary_min, salary_max, company, scale_min, scale_max, address))
            db.commit()
            # close the cursor before the connection
            cursor.close()
            db.close()
            return 1
        except Exception, e:
            print "database error: %s" % e
            return 0