At first I tried to learn from Cui (崔大神)'s Flask + Redis dynamic proxy pool, but the asynchronous checking and requests, plus the metaclass programming (I stared at it for a whole day and still didn't fully understand it), meant that even copying it verbatim kept throwing errors. So today I wrote my own proxy pool that provides the same functionality. The main idea: start a main process, then spawn two child processes, one that periodically tests the IPs and one that periodically checks how many IPs are in the pool. If the count meets the requirement, nothing is crawled; if not, new proxies are fetched. Remember to also start the Tornado app so the proxies can be fetched from a browser. For testing the proxies I used a thread pool, since I didn't feel confident writing it with coroutines. The GitHub address is https://github.com/xiaobeibei26/dynamic_ip_pool
Once you've downloaded the code, just run the run file and forget about it: the proxy pool starts and refreshes itself, and you can pull proxies from it in another script or in the browser.
Let's first look at the code for the two key child processes.
from ip_pool.database import RedisConnect  # database connection
import requests
import re
from ip_pool.ip_request import FreeProxyGetter
from ip_pool.Thread_pool import ThreadPool
import time
from multiprocessing import Process

check_time = 50      # how often (seconds) to re-check that stored IPs are still valid
count_time = 50      # how often (seconds) to check whether there are enough IPs
lower_num_ip = 30    # minimum number of IPs to keep in the pool
max_num_ip = 70      # maximum number of IPs to keep in the pool
class Test_ip(object):  # tests whether proxies are usable
    def __init__(self):
        self.url = 'http://ip.chinaz.com/getip.aspx'  # URL used for testing
        self._conn = RedisConnect()
        self._raw_proxies = None
        self.Thread_pool = ThreadPool(10)

    def set_raw_proxies(self, proxies):  # receive the proxies to be tested
        self._raw_proxies = proxies

    def _test(self, proxy):  # test a single proxy
        if isinstance(proxy, bytes):
            proxy = proxy.decode('utf-8')
        real_proxy = {'http': 'http://{}'.format(proxy),
                      'https': 'http://{}'.format(proxy)}
        print('Testing', proxy)
        try:
            html = requests.get(self.url, proxies=real_proxy, timeout=1)
            status_number = re.findall(r'\d\d\d', str(html))[0]  # extract the status code from the response repr
            re_ip = re.findall(r'\{ip', html.text)  # some nasty proxies return 200 but the body is actually a bad request; filter those out
            if status_number == str(200):
                if re_ip:
                    # the proxy works, so put it back into the pool
                    self._conn.put(proxy)
                    print('Response status:', html, proxy, 'proxy is valid, address:', html.text)
        except Exception as e:
            print('Dropping invalid proxy', proxy)

    def Thread_test_ip(self, proxies):  # takes a list of proxies
        for proxy in proxies:
            self.Thread_pool.run(func=self._test, args=proxy)  # test with the thread pool
        print('Threads used in this round:', len(self.Thread_pool.generate_list))
class Get_ip(object):
    def __init__(self, max_ip_count=max_num_ip):  # the maximum pool size defaults to the value configured above
        self._conn = RedisConnect()
        self.max_ip_count = max_ip_count
        self.crawl = FreeProxyGetter()
        self.Test = Test_ip()
        self.Thread_pool = ThreadPool(10)  # thread pool for testing, capped at 10 threads

    def is_ip_enough(self):
        if self._conn.ip_count >= self.max_ip_count:  # if the pool already holds the maximum number of IPs, return False
            return False
        return True

    def catch_ip(self):
        while self.is_ip_enough():
            print('Not enough proxies, crawling for more')
            for callback_lable in range(self.crawl.__CrawlFuncCount__):  # this attribute is added by the metaclass
                callback = self.crawl.__CrawlFunc__[callback_lable]
                raw_proxies = self.crawl.get_raw_proxies(callback)  # the freshly crawled proxies
                if raw_proxies:
                    self.Test.Thread_test_ip(raw_proxies)
                else:
                    print('No proxies from this source')
class schedule(object):
    @staticmethod
    def check_ip(cycle_time=check_time):
        conn = RedisConnect()
        tester = Test_ip()
        while True:
            print('IP checker started')
            count = int(0.5 * conn.ip_count)
            if count == 0:
                time.sleep(cycle_time)
                continue
            raw_proxies = conn.get_to_test(count)
            tester.Thread_test_ip(raw_proxies)  # pass the list in
            time.sleep(cycle_time)

    @staticmethod
    def catch_ip(cycle_time=count_time, max_ip=max_num_ip, min_ip=lower_num_ip):  # how often to check whether there are enough IPs
        conn = RedisConnect()
        Ip_catch = Get_ip()
        while True:
            if conn.ip_count < min_ip:  # below the minimum, start crawling
                Ip_catch.catch_ip()
            else:
                print('Enough proxies for now, no need to crawl')
            time.sleep(cycle_time)

    def run(self):
        print('Proxy pool started')
        check_process = Process(target=schedule.check_ip)
        catch_process = Process(target=schedule.catch_ip)
        check_process.start()
        catch_process.start()
這么長(zhǎng)的代碼佳遂,我也懶得再看了,其主要功能就是同事運(yùn)行兩個(gè)進(jìn)程撒顿,一個(gè)檢查IP數(shù)量讶迁,一個(gè)測(cè)試質(zhì)量
The database operations are also straightforward:
import redis

class RedisConnect(object):
    def __init__(self):
        self.db = redis.Redis(host='localhost', port=6379)

    def get_to_test(self, count=1):  # take out proxies to be checked
        proxy = self.db.lrange('proxies', 0, count - 1)
        self.db.ltrim('proxies', count, -1)
        return proxy

    def put(self, proxy):  # push one proxy into the pool
        self.db.rpush('proxies', proxy)

    def pop(self):  # take one proxy out of the pool and remove it
        try:
            return self.db.rpop('proxies')
        except Exception as e:
            return 'No proxy available'

    @property
    def ip_count(self):  # number of proxies in the pool
        return self.db.llen("proxies")
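Used on its own, the wrapper behaves roughly like this (assuming a local Redis on the default port):

conn = RedisConnect()
conn.put('127.0.0.1:8080')   # store a proxy
print(conn.ip_count)         # -> 1
print(conn.pop())            # -> b'127.0.0.1:8080' (redis returns bytes)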
What's worth a special mention here is the metaclass programming. Below is the proxy-crawling code:
from .html_request import MyRequest
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup

class ProxyMetaclass(type):
    """
    Adds the __CrawlFunc__ and __CrawlFuncCount__ attributes,
    which hold the crawler methods and how many of them there are.
    """
    def __new__(cls, name, bases, attrs):  # these parameters are fixed by the metaclass protocol
        count = 0
        attrs['__CrawlFunc__'] = []
        for k, v in attrs.items():
            if 'crawl_' in k:  # collect every method whose name contains 'crawl_', which makes it easy to add new proxy sources later
                attrs['__CrawlFunc__'].append(k)
                count += 1
        attrs['__CrawlFuncCount__'] = count
        return type.__new__(cls, name, bases, attrs)
class FreeProxyGetter(object, metaclass=ProxyMetaclass):
    def get_raw_proxies(self, callback):
        proxies = []
        print('Callback', callback)
        for proxy in eval("self.{}()".format(callback)):  # eval is used here to call the method named by the string
            print('Getting', proxy, 'from', callback)
            proxies.append(proxy)
        return proxies
    def crawl_daili66(self, page_count=4):
        start_url = 'http://www.66ip.cn/{}.html'
        urls = [start_url.format(page) for page in range(1, page_count + 1)]
        for url in urls:
            print('Crawling', url)
            try:
                html = MyRequest.get(url, 3)
                if html:
                    doc = pq(html)
                    trs = doc('.containerbox table tr:gt(0)').items()
                    for tr in trs:
                        ip = tr.find('td:nth-child(1)').text()
                        port = tr.find('td:nth-child(2)').text()
                        yield ':'.join([ip, port])
            except Exception as e:
                print('Crawling failed', url)
    def crawl_proxy360(self):
        start_url = 'http://www.proxy360.cn/Region/China'
        print('Crawling', start_url)
        try:
            html = MyRequest.get(start_url, 3)
            if html:
                doc = pq(html)
                lines = doc('div[name="list_proxy_ip"]').items()
                for line in lines:
                    ip = line.find('.tbBottomLine:nth-child(1)').text()
                    port = line.find('.tbBottomLine:nth-child(2)').text()
                    yield ':'.join([ip, port])
        except:
            print('Crawling failed', start_url)
    def crawl_goubanjia(self):
        start_url = 'http://www.goubanjia.com/free/gngn/index.shtml'
        try:
            html = MyRequest.get(start_url, 3)
            if html:
                doc = pq(html)
                tds = doc('td.ip').items()
                for td in tds:
                    td.find('p').remove()
                    yield td.text().replace(' ', '')
        except:
            print('Crawling failed', start_url)
    def crawl_haoip(self):
        start_url = 'http://haoip.cc/tiqu.htm'
        try:
            html = MyRequest.get(start_url, 3)
            if html:
                doc = pq(html)
                results = doc('.row .col-xs-12').html().split('<br/>')
                for result in results:
                    if result: yield result.strip()
        except:
            print('Crawling failed', start_url)
    def crawl_xici(self):  # crawl xicidaili.com
        start_url = 'http://www.xicidaili.com/nn/1'
        try:
            data = MyRequest.get(start_url, 3)
            all_data = BeautifulSoup(data, 'lxml')
            all_ip = all_data.find_all('tr', class_='odd')
            for i in all_ip:
                ip = i.find_all('td')[1].get_text()    # IP
                port = i.find_all('td')[2].get_text()  # port
                proxy = (ip + ':' + port).strip()      # combine into a proxy string
                if proxy:
                    yield proxy
        except:
            print('Crawling failed', start_url)
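MyRequest from html_request isn't listed in this post either. Judging from the calls above it fetches a page with a retry count and returns the HTML text (or nothing on failure); a rough stand-in under that assumption could be:

import requests

class MyRequest(object):
    @staticmethod
    def get(url, retries=3):
        headers = {'User-Agent': 'Mozilla/5.0'}
        for _ in range(retries):  # retry up to `retries` times
            try:
                resp = requests.get(url, headers=headers, timeout=5)
                if resp.status_code == 200:
                    return resp.text
            except requests.RequestException:
                continue
        return None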
Crawlers for several proxy sites are wrapped up here. What the metaclass does is make every class created from it carry the attributes defined in its custom __new__ method; here those attributes record which crawler methods the class defines and how many there are. To add another site later, you just write a new crawl_ method and nothing else needs to change. Here are two screenshots of the results: when proxies run low the pool crawls automatically, and it also re-tests the stored proxies on a schedule, which is exactly what a dynamic proxy pool should do.
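To see the auto-registration in action: any method whose name contains crawl_ shows up in __CrawlFunc__ automatically, so a new source is one method away.

getter = FreeProxyGetter()
print(getter.__CrawlFuncCount__)   # 5 with the sources above
print(getter.__CrawlFunc__)        # e.g. ['crawl_daili66', 'crawl_proxy360', ...]

# adding another source later is just another generator method on FreeProxyGetter:
#     def crawl_newsite(self):
#         ...
#         yield 'ip:port'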
Opened in the browser, one page shows how many proxies are currently available, and the other returns a proxy, one per request.
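The web layer isn't shown above either. As a sketch (handler names, routes, and the import path for schedule are my own guesses, check the repo for the real ones), two Tornado handlers on top of RedisConnect are enough to serve the count and one proxy per request, and the run file only has to start them alongside schedule().run():

import tornado.ioloop
import tornado.web
from ip_pool.database import RedisConnect
from ip_pool.scheduler import schedule   # adjust to wherever the schedule class lives in the repo

class CountHandler(tornado.web.RequestHandler):
    def get(self):
        self.write(str(RedisConnect().ip_count))   # how many proxies are available

class GetHandler(tornado.web.RequestHandler):
    def get(self):
        proxy = RedisConnect().pop()               # hand out one proxy per request
        self.write(proxy or 'no proxy available')

def start_web(port=8000):
    app = tornado.web.Application([
        (r'/count', CountHandler),
        (r'/get', GetHandler),
    ])
    app.listen(port)
    tornado.ioloop.IOLoop.current().start()

if __name__ == '__main__':
    schedule().run()   # start the two child processes
    start_web()        # then serve proxies in the browser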