今日內(nèi)容
python爬蟲
1.爬取梨視頻頁(yè)面全部視頻:
'''
Scrape Pear Video (pearvideo.com):
Request url:
    https://www.pearvideo.com/
Request method:
    GET
Request headers:
    user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36
'''
import requests
import re  # regex module

# 1. Send a request to the Pear Video home page and get the response data.
# The notes above say the request needs a browser user-agent header, but the
# code never sent one — fixed here.
response = requests.get(
    url='https://www.pearvideo.com/',
    headers={
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
    },
)
print(response.status_code)
print(response.text)

# re.findall('pattern', 'text to parse', flags)
# re.S: lets '.' match newlines too, so the whole document is scanned
# .  matches any character at the current position
# *? matches as few repetitions as possible (non-greedy)
'''
<a href="video_1543373"
<a href="video_(.*?)"   # captures 1543373
'''
# 2. Extract every video detail-page ID from the home page.
res = re.findall('<a href="video_(.*?)"', response.text, re.S)
print(res)

for m_id in res:
    # Build the detail-page url from the captured ID.
    detail_url = 'https://www.pearvideo.com/video_' + m_id
    print(detail_url)
import requests
import re  # regex module
# uuid.uuid4() generates a (practically) globally unique random string
import uuid

# The three steps of a crawler:
# 1. Send the request
def get_page(url):
    """Fetch *url* with a plain GET and return the Response object."""
    return requests.get(url)
# 2. Parse the data
# Parse the home page and collect the detail-page url for every video ID.
def parse_index(text):
    """Return the list of video detail-page urls found in home-page HTML.

    *text* is the raw HTML.  Every ``<a href="video_<id>"`` anchor yields
    one ``https://www.pearvideo.com/video_<id>`` entry, in document order;
    an empty list is returned when the page contains no video anchors.
    """
    ids = re.findall('<a href="video_(.*?)"', text, re.S)
    # Build each detail-page url from its captured ID (comprehension
    # replaces the original manual append loop).
    return ['https://www.pearvideo.com/video_' + m_id for m_id in ids]
# Parse a detail page and pull out the real video url.
def parse_detail(text):
    """Return the mp4 url embedded in a detail page.

    The page stores the stream address as ``srcUrl="..."`` (a plain
    ``<video ... src="...">`` pattern would also work, but srcUrl is what
    the markup actually contains).  Raises IndexError when the page has
    no srcUrl attribute at all.
    """
    matches = re.findall('srcUrl="(.*?)"', text, re.S)
    return matches[0]
# 3. Save the data
def save_movie(movie_url):
    """Download *movie_url* and write it to a uniquely-named .mp4 file.

    Streams the response body in chunks so a large video is never held
    in memory all at once (the previous version buffered the whole body
    via ``response.content``).
    """
    # stream=True defers the body download until iter_content() is read.
    response = requests.get(movie_url, stream=True)
    # uuid4 gives a practically unique local filename.
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
if __name__ == '__main__':
    # 1. Request the home page.
    home = get_page(url='https://www.pearvideo.com/')
    # 2. Parse it into detail-page urls.
    for detail_url in parse_index(home.text):
        # 3. Request each detail page.
        detail = get_page(url=detail_url)
        print(detail.text)
        # 4. Extract the video url from the detail page.
        video_url = parse_detail(detail.text)
        print(video_url)
        # 5. Download and save the video.
        save_movie(video_url)
2.高性能爬蟲
import requests
import re  # regex module
# uuid.uuid4() generates a (practically) globally unique random string
import uuid
# Thread-pool module
from concurrent.futures import ThreadPoolExecutor

# Cap the pool at 50 worker threads.
pool = ThreadPoolExecutor(50)
# The three steps of a crawler:
# 1. Send the request
def get_page(url):
    """GET *url* and return the Response; logs when the task starts."""
    print(f'開始異步任務(wù): {url}')
    return requests.get(url)
# 2. Parse the data
# Callback: parse the home page and schedule a fetch of every detail page.
def parse_index(res):
    """Done-callback for the home-page fetch.

    Pulls the Response out of the finished future, extracts every video
    ID, and submits one async fetch per detail page, chained into
    ``parse_detail``.
    """
    html = res.result().text
    # Extract all video IDs from the home page.
    for video_id in re.findall('<a href="video_(.*?)"', html, re.S):
        # Build the detail-page url from the ID.
        detail_url = 'https://www.pearvideo.com/video_' + video_id
        # Hand the detail page to get_page; parse_detail consumes the result.
        pool.submit(get_page, detail_url).add_done_callback(parse_detail)
# Callback: parse a detail page and schedule the video download.
def parse_detail(res):
    """Done-callback for a detail-page fetch.

    Extracts the mp4 url (the ``srcUrl="..."`` attribute) and submits
    the download, chained into ``save_movie``.
    """
    page = res.result().text
    movie_url = re.findall('srcUrl="(.*?)"', page, re.S)[0]
    # Async fetch of the video itself; save_movie consumes the result.
    pool.submit(get_page, movie_url).add_done_callback(save_movie)
# 3. Save the data
def save_movie(res):
    """Done-callback for a video fetch: write the body to a random .mp4."""
    movie_res = res.result()
    # uuid4 gives a practically unique local filename.
    filename = f'{uuid.uuid4()}.mp4'
    with open(filename, 'wb') as f:
        f.write(movie_res.content)
        print(f'視頻下載結(jié)束: {movie_res.url}')
        f.flush()
if __name__ == '__main__':
    # Kick off the pipeline: fetch the home page asynchronously and let
    # parse_index consume the finished future.
    pool.submit(get_page, 'https://www.pearvideo.com/').add_done_callback(parse_index)
3.requests詳細(xì)使用
'''
GET request walk-through
'''
'''
User-Agent
# Inspecting zhihu.com/explore in the browser shows:
Request url:
    https://www.zhihu.com/explore
Request method:
    GET
Request headers:
    user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36
    cookies
'''
# Plain request to zhihu (rejected without a browser user-agent):
# import requests
# response = requests.get(url='https://www.zhihu.com/explore')
# print(response.status_code)  # 400
# print(response.text)         # error page is returned

# Request zhihu again, this time carrying a user-agent header:
import requests
# Request-headers dict
# headers = {
#     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
# }
# Pass the user-agent into the get request:
# response = requests.get(url='https://www.zhihu.com/explore', headers=headers)
# print(response.status_code)  # 200
# # print(response.text)
# with open('zhihu.html', 'w', encoding='utf-8') as f:
#     f.write(response.text)

'''
params request parameters
Searching Baidu for 安徽工程大學(xué) paginates like this:
https://www.baidu.com/s?wd=安徽工程大學(xué)&pn=10
https://www.baidu.com/s?wd=安徽工程大學(xué)&pn=20
'''
from urllib.parse import urlencode
# url = 'https://www.baidu.com/s?wd=%E8%94%A1%E5%BE%90%E5%9D%A4'
# url = 'https://www.baidu.com/s?' + urlencode({"wd": "蔡徐坤"})
url = 'https://www.baidu.com/s?'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
# print(url)
# requests url-encodes the params dict onto the url for us:
# response = requests.get(url, headers=headers, params={"wd": "安徽工程大學(xué)"})
response = requests.get(url, headers=headers, params={"wd": "安徽工程大學(xué)", "pn": "20"})
# print(response.text)
with open('gongcheng2.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
'''
Sending cookies
Carry a logged-in session cookie to pass GitHub's auth check.
Request url:
    https://github.com/settings/emails
Request method:
    GET
Request headers:
    User-Agent
    Cookie: has_recent_activity=1; _ga=GA1.2.1416117396.1560496852; _gat=1; tz=Asia%2FShanghai; _octo=GH1.1.1728573677.1560496856; _device_id=1cb66c9a9599576a3b46df2455810999; user_session=1V8n9QfKpbgB-DhS4A7l3Tb3jryARZZ02NDdut3J2hy-8scm; __Host-user_session_same_site=1V8n9QfKpbgB-DhS4A7l3Tb3jryARZZ02NDdut3J2hy-8scm; logged_in=yes; dotcom_user=TankJam; _gh_sess=ZS83eUYyVkpCWUZab21lN29aRHJTUzgvWjRjc2NCL1ZaMHRsdGdJeVFQM20zRDdPblJ1cnZPRFJjclZKNkcrNXVKbTRmZ3pzZzRxRFExcUozQWV4ZG9kOUQzZzMwMzA2RGx5V2dSaTMwaEZ2ZDlHQ0NzTTBtdGtlT2tVajg0c0hYRk5IOU5FelYxanY4T1UvVS9uV0YzWmF0a083MVVYVGlOSy9Edkt0aXhQTmpYRnVqdFAwSFZHVHZQL0ZyQyt0ZjROajZBclY4WmlGQnNBNTJpeEttb3RjVG1mM0JESFhJRXF5M2IwSlpHb1Mzekc5M0d3OFVIdGpJaHg3azk2aStEcUhPaGpEd2RyMDN3K2pETmZQQ1FtNGNzYnVNckR4aWtibkxBRC8vaGM9LS1zTXlDSmFnQkFkWjFjanJxNlhCdnRRPT0%3D--04f6f3172b5d01244670fc8980c2591d83864f60
'''
import requests

# Request url
url = 'https://github.com/settings/emails'
# Request headers.  (The original notes left this dict unterminated and the
# cookies dict syntactically invalid; both are fixed here.)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
    # Alternatively, paste the raw Cookie string into this dict under a
    # 'cookie' key instead of using the cookies= argument below.
}
# github_res = requests.get(url, headers=headers)

# The values below are placeholders — copy the real cookie values from your
# own logged-in browser session.
cookies = {
    'has_recent_activity': 'ccc',
    '_ga': 'ddd',
    '_gat': '1',
    'tz': 'xxx',
    '_octo': 'yyy',
    '_device_id': 'zzz',
}
github_res = requests.get(url, headers=headers, cookies=cookies)
print('xxx' in github_res.text)  # replace 'xxx' with your username
4.爬取豆瓣Top250電影信息
'''
Top page:
    https://movie.douban.com/top250
    GET
    User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36
re pattern:
    # detail-page url, image link, title, rating, number of raters
    <div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人評(píng)價(jià)
'''
import requests
import re

url = 'https://movie.douban.com/top250'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}

# 1. Request the douban Top-250 page and get the response data.
response = requests.get(url, headers=headers)
# print(response.text)

# 2. Extract every movie's fields with one regex pass:
#    detail-page url, image url, title, rating, number of raters.
movie_content_list = re.findall(
    # regex pattern
    '<div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人評(píng)價(jià)',
    # text to parse
    response.text,
    # match mode
    re.S)

# 3. Write one formatted line per movie.  Open the file once, outside the
#    loop — the old code re-opened it in append mode on every iteration.
with open('douban.txt', 'a', encoding='utf-8') as f:
    for movie_content in movie_content_list:
        # Unpack one movie's captured groups.
        detail_url, movie_jpg, name, point, num = movie_content
        data = f'電影名稱:{name}, 詳情頁(yè)url:{detail_url}, 圖片url:{movie_jpg}, 評(píng)分: {point}, 評(píng)價(jià)人數(shù): {num} \n'
        print(data)
        f.write(data)