# test_proxy.py
import requests
from lxml.etree import HTML
import pandas as pd
from fake_useragent import UserAgent
from threading import Thread
from configparser import ConfigParser
import logging
from datetime import date
from urllib.parse import urlparse
from pprint import pprint
log = logging.getLogger(__name__)
log.setLevel(logging.ERROR)
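# The ConfigParser calls below expect a base.ini next to this script. A minimal
# sketch of its layout (the [test_urls] section and key names come from the
# lookups in Verifier.__init__; the URLs are only assumed placeholders, any
# plain-HTTP and any HTTPS endpoint will do):
#
#   [test_urls]
#   test_url_http = http://httpbin.org/ip
#   test_url_https = https://httpbin.org/ip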
class Verifier:
    def __init__(self):
        self.ua = UserAgent()
        self.headers = {"user-agent": self.ua.ie}
        self.conf = ConfigParser()
        self.conf.read("base.ini")
        self.test_url_http = self.conf["test_urls"]["test_url_http"]
        self.test_url_https = self.conf["test_urls"]["test_url_https"]
        self.df = self.gen_dataframe()
        # Build "protocol,ip:port" strings from the scraped columns
        # (column 4 = protocol type, column 0 = IP, column 1 = port).
        self.df["proxy"] = self.df[4].str.lower() + "," + self.df[0] + ":" + self.df[1]
        # Turn each string into the {"http": "ip:port"} / {"https": "ip:port"} shape
        # that requests' proxies argument expects, then split by protocol.
        self.proxies = [{key: value} for key, value in [i.split(",") for i in self.df["proxy"].tolist()]]
        self.proxies_http = [d for d in self.proxies if list(d.keys())[0] == "http"]
        self.proxies_https = [d for d in self.proxies if list(d.keys())[0] == "https"]
        self.verified_proxies_http = set()
        self.verified_proxies_https = set()
    def gen_dataframe(self):
        # Scrape the proxy table from xicidaili and return it as a DataFrame
        # with positional integer columns.
        r = requests.get("https://www.xicidaili.com/", headers=self.headers)
        html = HTML(r.content)
        trs = html.xpath("//tr")
        rows = []
        for tr in trs:
            data = [i.strip() for i in tr.xpath("./td/text()") if i.strip()]
            if data:
                rows.append(data)
        # DataFrame.append was removed in pandas 2.0, so collect the rows first.
        return pd.DataFrame(rows)
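    # Note: the positional columns used in __init__ (0 = IP, 1 = port, 4 = protocol
    # type) reflect xicidaili's table layout and will break if the site changes.
    # A quick way to sanity-check the scraped frame:
    #
    #   v = Verifier()        # scrapes xicidaili on construction
    #   print(v.df.head())    # confirm columns 0, 1 and 4 hold IP, port and type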
    def _verify(self, test_url, proxy={}):
        # Try one request through the given proxy; a completed request within the
        # timeout counts as a working proxy and is added to the matching result set.
        # (The response body itself is not needed, so it is not kept.)
        try:
            if urlparse(test_url).scheme == "http" and list(proxy.keys())[0] == "http":
                log.debug("test...%s" % proxy)
                requests.get(test_url, proxies=proxy, headers=self.headers, timeout=20)
                self.verified_proxies_http.add(proxy["http"])
                log.debug("%s success!" % proxy)
            elif urlparse(test_url).scheme == "https" and list(proxy.keys())[0] == "https":
                log.debug("test...%s" % proxy)
                requests.get(test_url, proxies=proxy, headers=self.headers, timeout=20, verify=False)
                self.verified_proxies_https.add(proxy["https"])
                log.debug("%s success!" % proxy)
            else:
                pass  # test_url scheme and proxy type do not match; skip
        except Exception as e:
            log.debug(e)
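    # verify=False above skips TLS certificate checks (presumably because free
    # proxies often break the certificate chain), so requests/urllib3 will emit an
    # InsecureRequestWarning per HTTPS request. If the noise matters, it can be
    # silenced at module level, e.g.:
    #
    #   import urllib3
    #   urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)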
    def verify_all(self):
        # Check every HTTP proxy, then every HTTPS proxy, one thread per proxy.
        threads = [Thread(target=self._verify, args=(self.test_url_http, proxy)) for proxy in self.proxies_http]
        print("* Testing HTTP proxies...")
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        print("* Done.\nTest site: %s" % self.test_url_http)
        threads = [Thread(target=self._verify, args=(self.test_url_https, proxy)) for proxy in self.proxies_https]
        print("* Testing HTTPS proxies...")
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        print("* Done.\nTest site: %s" % self.test_url_https)
        print("Tested {} HTTP proxies ({} working) and {} HTTPS proxies ({} working) this run.".format(
            len(self.proxies_http), len(self.verified_proxies_http),
            len(self.proxies_https), len(self.verified_proxies_https)))
if __name__ == "__main__":
    verifier = Verifier()
    verifier.verify_all()
    pprint(verifier.verified_proxies_http)
    pprint(verifier.verified_proxies_https)
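# A minimal sketch of consuming the results afterwards (the target URL is only an
# assumed example; each verified entry is a bare "ip:port" string, matching the
# proxies mapping used in _verify):
#
#   if verifier.verified_proxies_https:
#       proxy = {"https": next(iter(verifier.verified_proxies_https))}
#       r = requests.get("https://httpbin.org/ip", proxies=proxy, timeout=20, verify=False)
#       print(r.text)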