In work such as interface testing, many pages require authorization to access. The fields that carry this authorization usually live in the request header, in the form of a cookie, token, or similar. You therefore often send a request to one URL first to obtain the credential, then use it to construct the header for the subsequent interface requests. Below is an example of constructing a header in a web scraper.
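The pattern in a minimal sketch: first request the credential, then carry it in the header of every later call. Every endpoint, field name, and the 'token' key below is a hypothetical placeholder, not the API of the site scraped later.

import requests

# Hypothetical login endpoint and credential fields -- placeholders only.
login_resp = requests.post('http://example.com/api/login',
                           data={'user': 'tester', 'password': 'secret'})
token = login_resp.json()['token']  # assumes the service answers with JSON {"token": "..."}

# Reuse the credential in the header of each subsequent interface request.
headers = {'Authorization': 'Bearer ' + token}
resp = requests.get('http://example.com/api/profile', headers=headers)
print(resp.status_code)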
# coding=utf-8
# http://699pic.com/download/getDownloadUrl -- the interface that returns the download URL; it takes a pid, e.g. pid=500472407
# The response contains the picture link:
#   URL=http://down.699pic.com/photo/50047/2407.jpg?_upt=63305cd11514965673&_upd=500472407.jpg
# _upd is the picture id; split into two parts it fills the photo path. _upt is generated in real time and is only valid for a limited period.
# How do we obtain _upd?
# http://699pic.com/sousuo-61847-0-1-0-0-0.html  -- page 1 (how the keyword maps to 61847 can be found in the page source)
# http://699pic.com/sousuo-61847-0-2-0-0-0.html  -- page 2
# http://699pic.com/sousuo-61847-0-3-0-0-0.html  -- page 3
# To fetch more pages only that page number changes, and it can be found in the HTML.
# _upd itself can be found in the HTML of the page elements; it only needs to be concatenated.
# The problem therefore reduces to: how is the mapping from keyword to its five-digit number generated?
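# A worked restatement of the notes above, using only the example values already
# shown: pid 500472407 maps to the photo path photo/50047/2407.jpg (first five
# digits / remaining four), and page n for keyword number 61847 is
# http://699pic.com/sousuo-61847-0-<n>-0-0-0.html. The code below suggests the
# url field only comes back when a valid login cookie is sent, which is why the
# Cookie header is constructed in get_download_url.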
import requests
import time
import multiprocessing  # process pool for parallel downloads
from bs4 import BeautifulSoup  # parses HTML into a tree, making it easy to search and split
import sys
import io
from urllib import request  # used to send the logged-in request carrying the cookie
import json
import os
import random

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')  # switch stdout's default encoding to utf-8

class SpiderForPicture(object):
    author = 'Blokks'

    def __init__(self, keyword):
        self.keyword = keyword

    def saving_folder_making(self):
        folder_path = 'F:\\test_auto\\spider\\pictures\\' + self.keyword
        if not os.path.exists(folder_path):
            os.mkdir(folder_path)
            print('Created a directory named %s to store the pictures' % self.keyword)
        return folder_path
    def get_page_count(self):
        try:
            keyword = self.keyword
            url = 'http://699pic.com/tupian/' + keyword + '.html'
            html = requests.get(url)
            content = html.content.decode('utf-8')
            re_1 = BeautifulSoup(content, "lxml")
            re_2 = re_1.find_all(name='div', attrs={'class': 'pager-linkPage'})
            re_3 = re_2[0].find_all(name='a')
            list_ = []
            list_result = []
            result_dict = {}
            for item in re_3:
                ls = item.get('href').split('-')
                list_.append(ls)
                list_result.append(int(ls[3]))
            page_count = str(max(list_result))
            key_number = str(list_[0][1])
            result_dict[key_number] = page_count
            return result_dict  # maps the numeric keyword id to the page count
        except Exception:
            print('No pictures found for this search keyword...')
            exit(1)
    def get_pic_id(self):
        pic_id_list = []
        kw_dict = self.get_page_count()
        list_ = []
        for i in kw_dict:
            list_.append(i)
            list_.append(kw_dict[i])
        page_count = list_[1]
        print('Keyword %s matched %s pages of results' % (self.keyword, page_count))
        key_number = list_[0]
        for num in range(1, int(page_count) + 1):
            url = 'http://699pic.com/sousuo-' + key_number + '-0-' + str(num) + '-0-0-0.html'
            html = requests.get(url)
            content = html.content.decode('utf-8')
            re_1 = BeautifulSoup(content, "lxml")
            re_2 = re_1.find_all(name='div', attrs={'class': 'list'})
            for item in re_2:
                pic_id_list.append(item.get('data-id'))
        # keep only well-formed ids; building a new list avoids the
        # skipped-element bug of calling remove() while iterating
        pic_id_list = [i for i in pic_id_list if i and len(str(i)) >= 9]
        return pic_id_list
    def get_download_url(self):
        pic_id_list = self.get_pic_id()
        url_pool = []
        for pic_id in pic_id_list:
            url = 'http://699pic.com/download/getDownloadUrl?pid=' + pic_id
            # cookie string copied verbatim from a logged-in browser session
            cookie_str = r'2017endalert=1; uniqid=5a4c7bd11a363; bargain_popup=1; uv_cookie=c610bdc8d6965b2e7abec5d93' \
                         r'd07ad59; is_click_activity=1; from_data=YTo1OntzOjQ6Imhvc3QiO3M6MTA6IjY5OXBpYy5jb20iO3M6Mzoi' \
                         r'c2VtIjtiOjA7czoxMDoic291cmNlZnJvbSI7aTowO3M6NDoid29yZCI7TjtzOjM6ImtpZCI7aTowO30%3D; isVip=0; ' \
                         r'isPay=0; is_qy_vip=1; is_join_2017_end_18454014=0; isSearch=0; s_token=03e987b8c9b7912d89e77b' \
                         r'b7fd9b62e8; PHPSESSID=kt1v9k8sid51kg0ej6e127cvkvgmpc7q; Qs_lvt_135734=1513923395%2C1513923542' \
                         r'%2C1514961873%2C1515026629%2C1515031146; mediav=%7B%22eid%22%3A%22278616%22%2C%22ep%22%3A' \
                         r'%22%22%2C%22vid%22%3A%22%5EySs)9Ku%25D%3A*qX%24(Pe%3FD%22%2C%22ctn%22%3A%22%22%7D; ' \
                         r'Hm_lvt_1154154465e0978ab181e2fd9a9b9057=1515026630,1515026702,1515031028,1515031147; ' \
                         r'Hm_lvt_ddcd8445645e86f06e172516cac60b6a=1515026629,1515026702,1515031028,1515031147; ' \
                         r'recentlysearch=YTo0OntpOjA7YToyOntzOjI6Imt3IjtzOjc6ImRpYW5uYW8iO3M6NjoicGlueWluIjtzOjY6IjMx' \
                         r'MTExMCI7fWk6MTthOjI6e3M6Mjoia3ciO3M6Njoi55S16ISRIjtzOjY6InBpbnlpbiI7czo3OiJkaWFubmFvIjt9aTo' \
                         r'yO2E6Mjp7czoyOiJrdyI7czoxMjoi5pm66IO95a625bGFIjtzOjY6InBpbnlpbiI7czoxMjoiemhpbmVuZ2ppYWp1Ij' \
                         r't9aTozO2E6Mjp7czoyOiJrdyI7czo2OiLlpKfmtbciO3M6NjoicGlueWluIjtzOjU6ImRhaGFpIjt9fQ%3D%3D; ' \
                         r'search_Kw=%22diannao%22; is_join_2017_end_533435=0; Qs_pv_135734=144824772440290620%2C38906' \
                         r'64247893633500%2C3737559667568741000%2C2243149228815513300%2C1985644855545767200; ' \
                         r'Hm_lpvt_1154154465e0978ab181e2fd9a9b9057=1515034556; Hm_lpvt_ddcd8445645e86f06e172516cac60' \
                         r'b6a=1515034556; redirect=http%3A%2F%2F699pic.com%2Ftupian-500472175.html; session_data=YTo1' \
                         r'OntzOjM6InVpZCI7czo2OiI1MzM0MzUiO3M6NToidG9rZW4iO3M6MzI6ImZkZDIyZWY5NDJlMjY3NjViYTdhMGE2NmY' \
                         r'4NzVmMTE3IjtzOjM6InV1dCI7czozMjoiMWM0Y2E4ZDZmMDRhYTdhYmJiNTNkNTkwZmI4MGJiMWMiO3M6NDoiZGF0YS' \
                         r'I7YToxOntzOjg6InVzZXJuYW1lIjtzOjEyOiLku5nlpbPlprnlprkiO31zOjY6ImV4dGltZSI7aToxNTE1NjM5MzgzO' \
                         r'30%3D; uid=533435; username=%E4%BB%99%E5%A5%B3%E5%A6%B9%E5%A6%B9; head_pic=http%3A%2F%2' \
                         r'Fq.qlogo.cn%2Fqqapp%2F101268598%2FD2C2DF0668D1C9B957ADD345B9B7A420%2F40; login_user=1'
            req = request.Request(url)
            req.add_header('Cookie', cookie_str)
            req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36')
            resp = request.urlopen(req)
            result_ = resp.read().decode('utf-8')
            result_dict = json.loads(result_)
            if 'url' not in result_dict:
                print('cookies failed o(╥﹏╥)o -- the login cookie has expired')
                exit(1)
            download_url = result_dict['url']
            url_pool.append(download_url)
        return url_pool
    def download_picture(self, url):
        file_name = self.keyword + str(random.randint(100000, 999999)) + '.jpg'
        folder_path = self.saving_folder_making()
        file_path = folder_path + '\\' + file_name
        resp = requests.get(url)
        content = resp.content
        with open(file_path, 'wb') as f:
            f.write(content)

def main():
    start_time = time.time()
    keyword = input('Enter the keyword to search for (pinyin): ')
    spider = SpiderForPicture(keyword)
    url_pool = spider.get_download_url()
    middle_time = time.time()  # taken after the URLs are resolved, so the timing is meaningful
    time_cost = middle_time - start_time
    print('Download URLs resolved -- took %s -- starting download....' % time_cost)
    p = multiprocessing.Pool(processes=4)
    p.map(spider.download_picture, url_pool)
    p.close()
    p.join()
    end_time = time.time()
    time_used = end_time - start_time
    print('All downloads finished, took %s' % time_used)


if __name__ == '__main__':
    main()
As you can see, the cookie in the code above is extremely long; in this example, add_header is used to build the request header that the subsequent requests need.
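The urllib.request + add_header approach works, but the same header construction can be written more compactly with requests, which the script already imports. A minimal sketch of that alternative (the cookie value is a placeholder to be copied from a logged-in browser session; the pid is the example id from the notes at the top):

import requests

# Placeholder: paste the real cookie string captured from browser devtools here.
cookie_str = '<cookie copied from a logged-in session>'

headers = {
    'Cookie': cookie_str,
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
}

# Same interface call that get_download_url makes, expressed with requests.
resp = requests.get('http://699pic.com/download/getDownloadUrl',
                    params={'pid': '500472407'}, headers=headers)
print(resp.json().get('url'))

With requests the cookie could also be passed as a dict via the cookies= argument, or kept in a requests.Session so every later call reuses it automatically.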