Notes on maintaining a free proxy pool. It consists of four modules:
Getter module: pulls the latest free proxies from the various free-proxy sites, downloads the pages, and parses them
Storage module: saves the proxies collected by the getter module into a redis database
Checker module: tests whether each proxy in the redis database is still usable and assigns it a weight accordingly
Scheduler module: ties the getter, storage, and checker modules together and wraps them up (see the sketch below)
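To make the division of labor concrete, here is a minimal sketch of how the scheduler module might wire the other modules together with multiprocessing. The Scheduler class, its method names, and the intervals are hypothetical placeholders, not the actual implementation:

from multiprocessing import Process
import time

class Scheduler(object):
    """Hypothetical sketch: run the getter and the checker on fixed intervals."""

    def schedule_getter(self, interval=300):
        getter = CrawlerGetter()      # the getter module, defined below
        while True:
            getter.run()              # crawl all sites and store into redis
            time.sleep(interval)

    def schedule_checker(self, interval=60):
        while True:
            # the checker module (covered separately) would revalidate
            # proxies here and adjust their weights
            time.sleep(interval)

    def run(self):
        Process(target=self.schedule_getter).start()
        Process(target=self.schedule_checker).start()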
Key topics covered:
- metaclasses
- working with redis from Python (the redis library)
- the requests library
- the pyquery library
- basic use of the aiohttp asynchronous HTTP framework
- multithreading and multiprocessing
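Every parser below leans on pyquery's jQuery-style selectors such as :gt(0) and td:nth-child(2). A tiny self-contained example of the extraction pattern used throughout (the HTML snippet is made up for illustration):

from pyquery import PyQuery as pq

html = """
<table>
  <tr><th>IP</th><th>Port</th></tr>
  <tr><td>1.2.3.4</td><td>8080</td></tr>
</table>
"""
doc = pq(html)
# tr:gt(0) skips the header row; the first two cells hold ip and port
for row in doc("tr:gt(0)").items():
    ip = row("td:first-child").text()
    port = row("td:nth-child(2)").text()
    print(":".join([ip, port]))   # -> 1.2.3.4:8080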
Getter module
# -*- coding: utf-8 -*-
"""
__author__ = 'bingo'
__date__ = '2019/9/7'
"""
import random
import asyncio
import requests
import time
import redis
import aiohttp
from pyquery import PyQuery as pq
from redis import ResponseError
from requests import RequestException
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Process
from flask import Flask

# The getter below only needs requests, pyquery and ThreadPoolExecutor;
# the remaining imports serve the other modules kept in the same file.
# Getter module
class ProxyMeta(type):
    def __new__(cls, name, bases, attrs):
        crawl_count = 0
        attrs["__CrawlFunc__"] = []
        # collect every method of the getter class whose name starts with
        # "crawl_": these are the per-site proxy crawlers
        for k, v in attrs.items():
            if k.startswith("crawl_"):
                func = "self.{}()".format(k)
                attrs["__CrawlFunc__"].append(func)
                crawl_count += 1
        # record how many crawler methods the getter class defines
        attrs["__CrawlFuncCount__"] = crawl_count
        return type.__new__(cls, name, bases, attrs)
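# ProxyMeta runs once, when the CrawlerGetter class body below is evaluated:
# every crawl_* method is registered (as the string "self.crawl_xxx()") in
# __CrawlFunc__, and __CrawlFuncCount__ records the total. Result:
#
#   getter = CrawlerGetter()
#   getter.__CrawlFunc__       # ['self.crawl_66daili()', 'self.crawl_iphai()', ...]
#   getter.__CrawlFuncCount__  # 7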
class CrawlerGetter(object, metaclass=ProxyMeta):
    def __init__(self):
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0'}
        self.headers = headers
        self.proxy_count = 0
        # redis client from the storage module (defined in its own section)
        self.db_client = ProxyRedisClient()
    def get_page(self, url, encoding):
        try:
            res = requests.get(url, headers=self.headers, timeout=2.5)
            if res.status_code == 200:
                res.encoding = encoding
                return res.text
            return None
        except RequestException:
            # catch all requests errors (timeouts, connection resets, ...)
            # so one bad site cannot kill the whole crawl
            return None
    def crawl_66daili(self):
        """
        66ip.cn free proxies
        :return:
        """
        i = 0
        url = "http://www.66ip.cn/{page}.html"
        for page in range(1, 11):
            html = self.get_page(url.format(page=page), 'gbk')
            if html:
                p = pq(html)
                doc = p(".containerbox table tr:gt(0)")
                for item in doc.items():
                    proxy_ip = item("td:first-child").text()
                    proxy_port = item("td:nth-child(2)").text()
                    if proxy_ip and proxy_port:
                        proxy = ":".join([proxy_ip, proxy_port])
                        i += 1
                        print("[66ip %s]: %s" % (i, proxy))
                        self.proxy_count += 1
                        yield proxy
            else:
                print("[66ip] failed to fetch page %s" % page)
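    # The remaining crawl_ methods follow the same pattern: fetch a listing
    # page with get_page, pick the ip/port cells out of the table with
    # pyquery, and yield "ip:port" strings one by one.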
    def crawl_iphai(self):
        """
        iphai.com free proxies
        :return:
        """
        i = 0
        urls = ["http://www.iphai.com/free/ng", "http://www.iphai.com/free/wg"]
        for url in urls:
            html = self.get_page(url, 'utf8')
            if html:
                p = pq(html)
                doc = p(".table-responsive table tr:gt(0)")
                for item in doc.items():
                    proxy_ip = item("td:first-child").text()
                    proxy_port = item("td:nth-child(2)").text()
                    if proxy_ip and proxy_port:
                        proxy = ":".join([proxy_ip, proxy_port])
                        i += 1
                        print("[iphai %s]: %s" % (i, proxy))
                        self.proxy_count += 1
                        yield proxy
            else:
                print("[iphai] failed to fetch %s" % url)
    def crawl_qiyun(self):
        """
        qydaili.com (Qiyun) free proxies
        :return:
        """
        i = 0
        url = "http://www.qydaili.com/free/?action=china&page={page}"
        for page in range(1, 11):
            html = self.get_page(url.format(page=page), "utf8")
            if html:
                p = pq(html)
                doc = p(".table tbody tr")
                for item in doc.items():
                    proxy_ip = item("td:first-child").text()
                    proxy_port = item("td:nth-child(2)").text()
                    if proxy_ip and proxy_port:
                        proxy = ":".join([proxy_ip, proxy_port])
                        i += 1
                        print("[qiyun %s]: %s" % (i, proxy))
                        self.proxy_count += 1
                        yield proxy
            else:
                print("[qiyun] failed to fetch page %s" % page)
    def crawl_89daili(self):
        """
        89ip.cn free proxies
        :return:
        """
        i = 0
        url = "http://www.89ip.cn/index_{page}.html"
        for page in range(1, 21):
            html = self.get_page(url.format(page=page), "utf8")
            if html:
                p = pq(html)
                doc = p(".layui-table tbody tr")
                for item in doc.items():
                    proxy_ip = item("td:first-child").text()
                    proxy_port = item("td:nth-child(2)").text()
                    if proxy_ip and proxy_port:
                        proxy = ":".join([proxy_ip, proxy_port])
                        i += 1
                        print("[89ip %s]: %s" % (i, proxy))
                        self.proxy_count += 1
                        yield proxy
            else:
                print("[89ip] failed to fetch page %s" % page)
    def crawl_kuaidaili(self):
        """
        kuaidaili.com free proxies
        :return:
        """
        i = 0
        url = "https://www.kuaidaili.com/free/inha/{page}/"
        for page in range(1, 11):
            html = self.get_page(url.format(page=page), "utf8")
            if html:
                p = pq(html)
                doc = p("table tbody tr")
                for item in doc.items():
                    proxy_ip = item("td:first-child").text()
                    proxy_port = item("td:nth-child(2)").text()
                    if proxy_ip and proxy_port:
                        proxy = ":".join([proxy_ip, proxy_port])
                        i += 1
                        print("[kuaidaili %s]: %s" % (i, proxy))
                        self.proxy_count += 1
                        yield proxy
            else:
                print("[kuaidaili] failed to fetch page %s" % page)
    def crawl_yundaili(self):
        """
        ip3366.net (Yun) free proxies
        :return:
        """
        i = 0
        url = "http://www.ip3366.net/free/?stype=1&page={page}"
        for page in range(1, 8):
            html = self.get_page(url.format(page=page), "gb2312")
            if html:
                p = pq(html)
                doc = p("table tbody tr")
                for item in doc.items():
                    proxy_ip = item("td:first-child").text()
                    proxy_port = item("td:nth-child(2)").text()
                    if proxy_ip and proxy_port:
                        proxy = ":".join([proxy_ip, proxy_port])
                        i += 1
                        print("[yundaili %s]: %s" % (i, proxy))
                        self.proxy_count += 1
                        yield proxy
            else:
                print("[yundaili] failed to fetch page %s" % page)
    def crawl_xicidaili(self):
        """
        xicidaili.com free proxies
        :return:
        """
        i = 0
        url = "https://www.xicidaili.com/nn/{page}"
        for page in range(1, 6):
            html = self.get_page(url.format(page=page), "utf8")
            if html:
                p = pq(html)
                doc = p(".proxies table tr:gt(0)")
                for item in doc.items():
                    proxy_ip = item("td:nth-child(2)").text()
                    proxy_port = item("td:nth-child(3)").text()
                    if proxy_ip and proxy_port:
                        proxy = ":".join([proxy_ip, proxy_port])
                        i += 1
                        print("[xicidaili %s]: %s" % (i, proxy))
                        self.proxy_count += 1
                        yield proxy
            else:
                print("[xicidaili] failed to fetch page %s" % page)
    def run(self):
        """
        Collect the generator from each site's crawler and store the proxies
        they yield into the redis database, one thread per site.
        :return:
        """
        crawl_funcs_list = []
        try:
            executor = ThreadPoolExecutor(max_workers=10)
            # __CrawlFunc__ holds strings like "self.crawl_66daili()";
            # eval-ing one calls the generator function and returns its generator
            for crawl_func_name in self.__CrawlFunc__:
                crawl_funcs_list.append(eval(crawl_func_name))
            for crawl_func in crawl_funcs_list:
                executor.submit(self.to_redis_db, crawl_func)
            executor.shutdown()
        except Exception as e:
            print("ERROR:", e)
    def to_redis_db(self, generator):
        """
        Consume a generator that yields proxy addresses and store each proxy
        in the redis proxy pool.
        :param generator:
        :return:
        """
        for proxy in generator:
            self.db_client.add(proxy)
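The ProxyRedisClient used above belongs to the storage module, which is covered separately. For completeness, a minimal sketch of the interface the getter relies on, assuming redis-py and a sorted set whose score doubles as the checker's weight; the key name 'proxies' and the initial score are assumptions, not the actual implementation:

import redis

class ProxyRedisClient(object):
    """Minimal sketch: proxies live in a redis sorted set, score = weight."""

    def __init__(self, host='localhost', port=6379, key='proxies'):
        self.db = redis.StrictRedis(host=host, port=port, decode_responses=True)
        self.key = key

    def add(self, proxy, score=10):
        # insert only unseen proxies; the checker module adjusts scores later
        if self.db.zscore(self.key, proxy) is None:
            return self.db.zadd(self.key, {proxy: score})

With that in place, filling the pool once is a two-liner:

crawler = CrawlerGetter()
crawler.run()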