3.1 Install Selenium and learn the basics
1. Installation
pip install selenium
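Note that webdriver.Chrome() also needs a browser driver: Selenium 4.6+ downloads a matching one automatically via Selenium Manager, while older versions require chromedriver to be on your PATH. A quick check that the package imported:

import selenium
print(selenium.__version__)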
2. Learning to use it
The example below drives Chrome to log in to 163 Mail: it opens the login page, switches into the login iframe, fills in the credentials, and clicks the login button.
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
url = 'http://mail.163.com/'
browser.get(url)
time.sleep(3)
browser.maximize_window()
time.sleep(5)

# The login form sits inside an iframe; switch into it before locating fields.
browser.switch_to.frame(0)

# Selenium 4 removed find_element_by_name/find_element_by_id;
# find_element(By...., ...) works in both Selenium 3.x and 4.x.
email = browser.find_element(By.NAME, 'email')
email.send_keys('athena_liyu@163.com')
password = browser.find_element(By.NAME, 'password')
password.send_keys('******')

login_em = browser.find_element(By.ID, 'dologin')
login_em.click()
time.sleep(10)
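The fixed time.sleep calls above waste time and break if the page loads slowly. A minimal sketch of the same flow using Selenium's explicit waits (WebDriverWait plus expected_conditions, both part of the standard selenium package), reusing the locators from the code above:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get('http://mail.163.com/')
wait = WebDriverWait(browser, 10)  # poll for up to 10 seconds

# Wait until the login iframe exists, then switch into it.
wait.until(EC.frame_to_be_available_and_switch_to_it(0))

# Wait until the email field is actually present before typing.
email = wait.until(EC.presence_of_element_located((By.NAME, 'email')))
email.send_keys('athena_liyu@163.com')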
3.2 Learn about IPs
1) Why do IPs get banned?
To defend against scraping, websites use anti-crawler mechanisms: a large number of similar requests from the same IP address gets that IP blocked, and access is only restored after some time.
2) How to deal with IP bans (the sketch below this list combines all three tactics)
Modify the request headers to mimic a browser (instead of letting the code hit the site directly)
Use proxy IPs and rotate them
Set a time interval between requests
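A minimal sketch combining all three tactics with requests; the proxy addresses and URLs below are placeholders for illustration only:

import random
import time
import requests

# Placeholder proxies; in practice, fill this from a proxy pool like the one built below.
PROXIES = ['http://1.2.3.4:8080', 'http://5.6.7.8:3128']

# A browser-like User-Agent so the request does not look like bare Python code.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/72.0.3626.119 Safari/537.36'}

for url in ['https://example.com/page1', 'https://example.com/page2']:
    proxy = random.choice(PROXIES)          # rotate proxies
    r = requests.get(url, headers=HEADERS,
                     proxies={'http': proxy, 'https': proxy},
                     timeout=10)
    print(url, r.status_code)
    time.sleep(random.uniform(1, 3))        # space out requests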
3) Scrape 西刺代理 (xicidaili) and build your own proxy pool
from bs4 import BeautifulSoup
import requests

def open_proxy_url(url):
    # Send a browser-like User-Agent so the proxy site does not reject the request.
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
    headers = {'User-Agent': user_agent}
    try:
        r = requests.get(url, headers=headers, timeout=20)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        print('Cannot access ' + url)

def get_proxy_ip(response):
    proxy_ip_list = []
    soup = BeautifulSoup(response, 'html.parser')
    proxy_ips = soup.select('.odd')  # table rows carrying the class "odd"
    for proxy_ip in proxy_ips:
        ip = proxy_ip.select('td')[1].text
        port = proxy_ip.select('td')[2].text
        protocol = proxy_ip.select('td')[5].text
        if protocol in ('HTTP', 'HTTPS'):
            proxy_ip_list.append(f'{protocol}://{ip}:{port}')
    return proxy_ip_list

if __name__ == '__main__':
    proxy_url = 'https://www.xicidaili.com/'
    text = open_proxy_url(proxy_url)
    # Cache the page locally so it can be re-parsed without hitting the site again.
    proxy_ip_filename = 'proxy_ip.txt'
    with open(proxy_ip_filename, 'w') as f:
        f.write(text)
    with open(proxy_ip_filename, 'r') as f:
        text = f.read()
    proxy_ip_list = get_proxy_ip(text)
    print(proxy_ip_list)
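Note that soup.select('.odd') only matches table rows carrying the class "odd"; xicidaili stripes its table so only every other row has that class, meaning roughly half the proxies are missed. The improved version below walks every row of the #ip_list table instead, using the cell count to skip the header row: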
def get_proxy_ip(response):
    proxy_ip_list = []
    soup = BeautifulSoup(response, 'html.parser')
    # Walk every row of the proxy table, not just the ".odd" ones.
    proxy_ips = soup.find(id='ip_list').find_all('tr')
    for proxy_ip in proxy_ips:
        # Data rows have at least 8 cells; this skips the header row.
        if len(proxy_ip.select('td')) >= 8:
            ip = proxy_ip.select('td')[1].text
            port = proxy_ip.select('td')[2].text
            protocol = proxy_ip.select('td')[5].text
            if protocol in ('HTTP', 'HTTPS', 'http', 'https'):
                proxy_ip_list.append(f'{protocol}://{ip}:{port}')
    return proxy_ip_list
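Free proxies go stale quickly, so it pays to validate each address before trusting the pool. A minimal sketch, assuming http://httpbin.org/ip as the test endpoint (any page that echoes your outgoing IP would do); check_proxy is a hypothetical helper name, not part of any library:

import requests

def check_proxy(proxy, test_url='http://httpbin.org/ip', timeout=5):
    # Hypothetical helper: route one request through the proxy
    # and keep the proxy only if the request succeeds.
    proxy = proxy.lower()  # normalize 'HTTP://...' to 'http://...'
    try:
        r = requests.get(test_url,
                         proxies={'http': proxy, 'https': proxy},
                         timeout=timeout)
        return r.status_code == 200
    except requests.RequestException:
        return False

working_pool = [p for p in proxy_ip_list if check_proxy(p)]
print(f'{len(working_pool)} of {len(proxy_ip_list)} proxies are usable')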