1. First, create a class that fetches proxy IPs; here it is named ProxyPool.
class ProxyPool:
    def get_soup(self, url):
        pass

    def get_youdaili(self):
        pass
The ProxyPool class has two methods:
- get_soup(self, url)
Besides self, this method takes a url parameter (a web address) and returns a BeautifulSoup object.
def get_soup(self, url):
    resp = requests.get(url)
    if resp.status_code == 200:
        resp.encoding = "utf-8"
        soup = BeautifulSoup(resp.text, "lxml")
        return soup
    # implicitly returns None on any non-200 response
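Assuming the class above is in scope, a minimal usage sketch (any page that answers with HTTP 200 will do):
pp = ProxyPool()
soup = pp.get_soup("http://www.youdaili.net/Daili/")
if soup is not None:  # get_soup falls through to None on a non-200 response
    print(soup.title.get_text())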
- get_youdaili(self)
This method takes no extra parameters; the address it works from, the Youdaili listing page, is hard-coded inside the method. Calling it adds the IPs published on that site to the database one by one (a short demonstration of the regex and the pagination URLs follows the code).
def get_youdaili(self):
    soup = self.get_soup('http://www.youdaili.net/Daili/')
    a_tag = soup.select('div.newslist_body > ul > li > a')
    # three groups per match: (full match, ip:port, protocol)
    ip_re = re.compile(r'((\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\:\d{2,5})@([a-zA-Z0-9]{4,7}))')
    for a in a_tag:
        url = a.get('href')
        soup = self.get_soup(url)
        ips = ip_re.findall(soup.text)
        page_tag = soup.select('ul.pagelist > li > a')  # is there more than one page?
        if page_tag:
            page = int(re.search(r'\d+', page_tag[0].get_text()).group())
        else:
            page = 1
        if page >= 2:  # keep crawling the remaining pages
            for page_num in range(2, page + 1):
                soup_sub = self.get_soup(url[:-5] + "_" + str(page_num) + ".html")
                ips += ip_re.findall(soup_sub.text)
        if ips:
            for ip in ips:
                try:
                    proxy_pool.insert_one({
                        'ip_port': ip[1],
                        'protocol': ip[2].lower(),
                        'update_time': int(time.time())
                    })
                except pymongo.errors.DuplicateKeyError:
                    pass
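To make the parsing concrete, here is what the regex and the URL slicing do, run on made-up sample data (the article URL below is hypothetical):
import re

ip_re = re.compile(r'((\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\:\d{2,5})@([a-zA-Z0-9]{4,7}))')
sample = "1.255.53.81:80@HTTP 222.88.147.12:8080@HTTPS"
print(ip_re.findall(sample))
# [('1.255.53.81:80@HTTP', '1.255.53.81:80', 'HTTP'),
#  ('222.88.147.12:8080@HTTPS', '222.88.147.12:8080', 'HTTPS')]
# the second element is the ip:port and the third the protocol,
# matching the insert_one call above

# url[:-5] strips the trailing ".html", so page 2 of a hypothetical article
# "http://www.youdaili.net/Daili/guonei/36744.html" is fetched from
# "http://www.youdaili.net/Daili/guonei/36744_2.html"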
- Setting up the proxy database and the proxy_pool collection
client = pymongo.MongoClient("localhost", 27017)
proxy = client['proxy']
proxy_pool = proxy['proxy_pool']
proxy_pool.create_index('ip_port', unique=True)  # inserting a duplicate ip now raises an error
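After a crawl and a round of checking, a document in proxy_pool looks like this (the values are illustrative):
example_doc = {
    'ip_port': '1.255.53.81:80',  # unique-indexed: inserting it twice raises DuplicateKeyError
    'protocol': 'http',
    'update_time': 1467456000,    # unix timestamp of the last successful crawl or check
    'speed': 0.85,                # response time in seconds; None after a failed check
}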
2. Next, create a class that checks proxy quality; here it is named ProxyCheck.
class ProxyCheck:
    def __init__(self):
        # query for all stored ips; doing this per instance lets each round see fresh data
        self.ip_port_all = [(i['ip_port'], i['protocol']) for i in proxy_pool.find()]

    def remove_ip(self, ip_port):
        pass

    def get_status(self, ip_port, protocol):
        pass

    def check(self):
        pass
This class has three methods:
- remove_ip(self, ip_port)
This method takes an ip_port argument (e.g. 1.255.53.81:80). It sets the matching record's speed field to None, then checks whether the record is more than a week old and deletes it if so.
def remove_ip(self, ip_port):  # called when a proxy fails to respond: null its speed, and drop it if it is over a week old
    ip_data = proxy_pool.find_one({'ip_port': ip_port})  # the stored document for this proxy
    proxy_pool.update_one({'ip_port': ip_port}, {'$set': {'speed': None}})
    if int(time.time()) - ip_data['update_time'] > 604800:  # time.time() counts seconds since the 1970 epoch
        proxy_pool.delete_one({'ip_port': ip_port})
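The magic number is just one week expressed in seconds; naming it makes the intent clearer:
ONE_WEEK = 7 * 24 * 60 * 60  # = 604800 seconds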
- get_status(self, ip_port, protocol)
This method takes two arguments, e.g. ('1.255.53.81:80', 'http'), and tries to fetch an ordinary page through that proxy. On success it writes the response time into the speed field and refreshes update_time; otherwise it calls remove_ip() to discard the record.
def get_status(self, ip_port, protocol):
    url = "http://fz.58.com/"
    proxies = {"http": protocol + "://" + ip_port}  # the test URL is plain http, so only this key is consulted
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
    }
    time1 = time.perf_counter()  # high-resolution timer, suited to measuring elapsed time
    try:  # requests through a proxy fail easily
        resp = requests.get(url, headers=headers, proxies=proxies, timeout=6)
    except Exception as ex:
        print(ex)
        return self.remove_ip(ip_port)
    time2 = time.perf_counter()
    time_result = time2 - time1  # response time in seconds
    if resp.status_code == 200:
        print(ip_port)
        proxy_pool.update_one({"ip_port": ip_port},
                              {'$set': {'speed': time_result, 'update_time': int(time.time())}})
    else:
        self.remove_ip(ip_port)
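Assuming the classes above are defined, a single proxy can be probed directly (the address here is made up):
pc = ProxyCheck()
pc.get_status('1.255.53.81:80', 'http')  # prints the ip_port and stores its speed on success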
- check()
Runs the checks across multiple threads.
def check(self):
    pool = Pool(20)
    for i in self.ip_port_all:
        if i[1] == 'http':  # only http proxies are probed, since the test URL is http
            pool.apply_async(self.get_status, args=i)
    pool.close()
    pool.join()
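Despite the import path, multiprocessing.dummy.Pool is a thread pool that merely mirrors the multiprocessing API, which suits this I/O-bound job. A standalone illustration of the apply_async / close / join pattern used above:
from multiprocessing.dummy import Pool  # threads, not processes

def square(n):
    return n * n

pool = Pool(4)
results = [pool.apply_async(square, args=(i,)) for i in range(5)]
pool.close()   # no more tasks may be submitted
pool.join()    # block until every worker finishes
print([r.get() for r in results])  # [0, 1, 4, 9, 16]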
3. The if __name__ == "__main__" startup section:
if __name__ == "__main__":
    if len(sys.argv) > 1:  # the first CLI argument, if given, is the interval in seconds between runs
        time_sleep = int(sys.argv[1])
    else:
        time_sleep = 60 * 60
    while True:
        pp = ProxyPool()
        pp.get_youdaili()
        pc = ProxyCheck()
        pc.check()
        time.sleep(time_sleep)
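Assuming the script is saved as proxy_pool.py (any filename works), `python proxy_pool.py 1800` would re-crawl and re-check every 30 minutes, while running it with no argument falls back to the one-hour default.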
The complete script:
# coding:utf-8
# public proxies are a limited resource, so please don't abuse them
import re
import requests
import time
import pymongo
import sys
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool

client = pymongo.MongoClient("localhost", 27017)
proxy = client['proxy']
proxy_pool = proxy['proxy_pool']
proxy_pool.create_index('ip_port', unique=True)  # inserting a duplicate ip now raises an error
class ProxyPool:  # fetches proxy ips
    def get_soup(self, url):
        resp = requests.get(url)
        if resp.status_code == 200:
            resp.encoding = "utf-8"
            soup = BeautifulSoup(resp.text, "lxml")
            return soup
        # implicitly returns None on any non-200 response

    def get_youdaili(self):
        soup = self.get_soup("http://www.youdaili.net/Daili/")
        a_tag = soup.select("div.newslist_body > ul > li > a")
        # three groups per match: (full match, ip:port, protocol)
        ip_re = re.compile(r'((\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\:\d{2,5})@([a-zA-Z0-9]{4,7}))')
        for a in a_tag:
            url = a.get('href')
            soup = self.get_soup(url)
            ips = ip_re.findall(soup.text)
            page_tag = soup.select("ul.pagelist > li > a")  # is there more than one page?
            if page_tag:
                page = int(re.search(r"\d+", page_tag[0].get_text()).group())
            else:
                page = 1
            if page >= 2:  # keep crawling the remaining pages
                for page_num in range(2, page + 1):
                    soup_sub = self.get_soup(url[:-5] + "_" + str(page_num) + ".html")
                    ips += ip_re.findall(soup_sub.text)
            if ips:
                for ip in ips:
                    try:  # the unique index rejects duplicate ips, so insert_one may raise; hence the try
                        proxy_pool.insert_one({
                            'ip_port': ip[1],
                            'protocol': ip[2].lower(),       # http or https
                            'update_time': int(time.time())  # timestamp at crawl time
                        })
                    except pymongo.errors.DuplicateKeyError:
                        pass
            print(url)  # progress log
class ProxyCheck:
    def __init__(self):
        # query for all stored ips; doing this per instance lets each round see fresh data
        self.ip_port_all = [(i['ip_port'], i['protocol']) for i in proxy_pool.find()]

    def remove_ip(self, ip_port):  # called when a proxy fails to respond: null its speed, and drop it if it is over a week old
        ip_data = proxy_pool.find_one({'ip_port': ip_port})
        proxy_pool.update_one({'ip_port': ip_port}, {'$set': {'speed': None}})
        if int(time.time()) - ip_data['update_time'] > 604800:  # 604800 s = one week
            proxy_pool.delete_one({'ip_port': ip_port})

    def get_status(self, ip_port, protocol):
        url = "http://fz.58.com/"
        proxies = {"http": protocol + "://" + ip_port}
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
        }
        time1 = time.perf_counter()
        try:  # requests through a proxy fail easily
            resp = requests.get(url, headers=headers, proxies=proxies, timeout=6)
        except Exception as ex:
            print(ex)
            return self.remove_ip(ip_port)
        time2 = time.perf_counter()
        time_result = time2 - time1  # response time in seconds
        if resp.status_code == 200:
            print(ip_port)
            proxy_pool.update_one({"ip_port": ip_port},
                                  {'$set': {'speed': time_result, 'update_time': int(time.time())}})
        else:
            self.remove_ip(ip_port)

    def check(self):  # check the proxies with a pool of 20 threads
        pool = Pool(20)
        for i in self.ip_port_all:
            if i[1] == 'http':
                pool.apply_async(self.get_status, args=i)
        pool.close()
        pool.join()
if __name__ == "__main__":
    if len(sys.argv) > 1:  # the first CLI argument, if given, is the interval in seconds between runs
        time_sleep = int(sys.argv[1])
    else:
        time_sleep = 60 * 60
    while True:
        pp = ProxyPool()
        pp.get_youdaili()
        pc = ProxyCheck()
        pc.check()
        time.sleep(time_sleep)
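The script only maintains the pool; as a hedged sketch of how a consumer might pull the fastest verified proxy back out of MongoDB (collection and field names as stored above, the target URL is a placeholder):
import pymongo
import requests

client = pymongo.MongoClient("localhost", 27017)
proxy_pool = client['proxy']['proxy_pool']

# fastest http proxy that passed its last check (speed is None after a failure)
best = proxy_pool.find_one({'protocol': 'http', 'speed': {'$ne': None}},
                           sort=[('speed', pymongo.ASCENDING)])
if best:
    proxies = {'http': 'http://' + best['ip_port']}
    resp = requests.get('http://fz.58.com/', proxies=proxies, timeout=6)
    print(resp.status_code)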