Requests 是一個(gè) Python 的外部模塊, 需要手動(dòng)安裝. 使用 pip 安裝就好了.
# Demo: build a Baidu search URL from query params and open it in a browser.
import requests
import webbrowser  # used to open the result in the default browser

# Run the identical search twice: first over HTTPS, then over HTTP.
for base_url in ('https://www.baidu.com/s', 'http://www.baidu.com/s'):
    query = {"wd": "itswl.github"}  # the search keyword
    resp = requests.get(base_url, params=query)
    print(resp.url)  # GET request: params are encoded into the URL
    webbrowser.open(resp.url)
# https://www.baidu.com/s?wd=itswl.github
def get():
    """Send a GET request with query params and print the final URL and body."""
    print('\nget')
    search = {"wd": "itswl.github"}
    resp = requests.get('https://www.baidu.com/s', params=search)
    print(resp.url)
    print(resp.text)
# get()
def post_name():
    """POST a first/last-name form and print the server's echo.

    Form page: http://pythonscraping.com/pages/files/form.html
    """
    print('\npost name')
    form = {'firstname': 'laii', 'lastname': 'weii'}
    resp = requests.post('http://pythonscraping.com/files/processing.php', data=form)
    print(resp.text)
post_name()
def post_image():
    """Upload ./image.png as a multipart form file and print the response.

    Form page: http://pythonscraping.com/files/form2.html

    Fix: the original passed an ``open()`` handle that was never closed,
    leaking the file descriptor. The file is now opened in a ``with`` block
    so it is closed even if the request raises.
    """
    print('\npost image')
    with open('./image.png', 'rb') as img:
        file = {'uploadFile': img}
        r = requests.post('http://pythonscraping.com/files/processing2.php', files=file)
    print(r.text)
def post_login():
    """Log in via a POST form, then reuse the returned cookies manually.

    Login page: http://pythonscraping.com/pages/cookies/login.html
    """
    print('\npost login')
    credentials = {'username': 'Morvan', 'password': 'password'}
    login_resp = requests.post('http://pythonscraping.com/pages/cookies/welcome.php', data=credentials)
    print(login_resp.cookies.get_dict())
    # Pass the login cookies along explicitly for the profile page.
    # http://pythonscraping.com/pages/cookies/profile.php
    profile_resp = requests.get('http://pythonscraping.com/pages/cookies/profile.php', cookies=login_resp.cookies)
    print(profile_resp.text)
def session_login():  # the Session object carries cookies automatically
    """Same flow as post_login(), but let requests.Session manage cookies.

    Login page: http://pythonscraping.com/pages/cookies/login.html
    """
    print('\nsession login')
    sess = requests.Session()
    credentials = {'username': 'Morvan', 'password': 'password'}
    resp = sess.post('http://pythonscraping.com/pages/cookies/welcome.php', data=credentials)
    print(resp.cookies.get_dict())
    # No explicit cookies= needed: the session re-sends them itself.
    print(sess.get("http://pythonscraping.com/pages/cookies/profile.php").text)
# Run each POST demo in order.
for demo in (post_name, post_image, post_login, session_login):
    demo()
下載文件
import os
# Ensure the download target directory exists (no error if already present).
os.makedirs('./img/', exist_ok=True)
# Sample image fetched by the three download demos below.
IMAGE_URL = "https://morvanzhou.github.io/static/img/description/learning_step_flowchart.png"
def urllib_download():
    """Save IMAGE_URL to disk in a single call via the stdlib urllib."""
    from urllib.request import urlretrieve
    # Downloads the whole document at once.
    urlretrieve(IMAGE_URL, './img/image1.png')
def request_download():
    """Download IMAGE_URL with requests and write the full body to disk."""
    import requests
    resp = requests.get(IMAGE_URL)
    with open('./img/image2.png', 'wb') as out:
        # r.content holds the whole document in memory.
        out.write(resp.content)
def chunk_download(chunk_size=32):
    """Stream IMAGE_URL to disk chunk by chunk instead of all at once.

    Args:
        chunk_size: bytes per chunk. Default 32 preserves the original demo
            behavior; a realistic download would use e.g. 8192.

    Fix: the streamed response is now closed via ``with`` so the underlying
    connection is released even if writing fails (a ``stream=True`` response
    holds its connection open until consumed or closed).
    """
    import requests
    with requests.get(IMAGE_URL, stream=True) as r:  # lazy body loading
        with open('./img/image3.png', 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
# Run the three download variants, reporting progress after each.
for download, message in (
    (urllib_download, 'download image1'),
    (request_download, 'download image2'),
    (chunk_download, 'download image3'),
):
    download()
    print(message)
一個(gè)小練習(xí)，抓取美女吧圖片
# coding=utf-8
import requests
from lxml import etree
import os
import re
class TieBa(object):
    """Scrape images from a Baidu Tieba forum.

    Workflow: fetch a forum index page, collect each thread's title and URL,
    visit every thread, and save its first image (``.jpg`` only) to ./image/.

    Fixes vs. original:
      * ``parse_page`` no longer raises IndexError on the last page when the
        next-page link is absent — it returns ``None`` as intended.
      * ``run`` no longer calls ``get_data(None)`` after the final page.
      * ``save_data`` catches the specific ``IndexError`` (no image in the
        thread) instead of a blanket ``Exception``.
    """

    def __init__(self, word):
        # word: forum keyword, e.g. '美女'.
        self.url = 'https://tieba.baidu.com/f?kw={}'.format(word)
        # Old IE User-Agent: Baidu serves this UA simpler, parseable markup.
        self.headers = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0; TUCOWS) '
        }

    def get_data(self, url):
        """GET ``url`` with the spoofed User-Agent and return raw bytes."""
        response = requests.get(url, headers=self.headers)
        return response.content

    def parse_page(self, data):
        """Parse one forum index page.

        Returns:
            (detail_list, next_url) — a list of ``{'title', 'url'}`` dicts
            for each thread, and the absolute URL of the next index page
            (``None`` on the last page).
        """
        html = etree.HTML(data)
        # Title anchor of every thread on this page.
        node_list = html.xpath('//*[@id="thread_list"]/li/div/div[2]/div[1]/div[1]/a')
        detail_list = []
        for node in node_list:
            detail_list.append({
                'title': node.xpath('./text()')[0],
                'url': 'https://tieba.baidu.com' + node.xpath('./@href')[0],
            })
        # Next-page link; absent on the last page (the original indexed [0]
        # unconditionally and crashed there with IndexError).
        next_nodes = html.xpath('//*[@id="frs_list_pager"]/a[contains(text(), "下一頁(yè)")]/@href')
        next_url = 'http:' + next_nodes[0] if next_nodes and next_nodes[0] else None
        return detail_list, next_url

    def parse_detail(self, detail_list):
        """Return just the thread URLs from a list of thread dicts."""
        return [detail['url'] for detail in detail_list]

    def save_data(self, url):
        """Fetch one thread page and save its first image if it is a .jpg."""
        data = self.get_data(url)
        html = etree.HTML(data)
        try:
            # First <img> inside a post body.
            image_url = html.xpath('//*[contains(@id,"post_content")]/img[1]/@src')[0]
        except IndexError:
            return  # thread contains no post image
        print(image_url)
        # Only save URLs ending in .jpg.
        if re.match(r'.*\.jpg$', image_url):
            image_data = self.get_data(image_url)
            filename = 'image/' + image_url.split('/')[-1]
            with open(filename, 'wb') as f:
                f.write(image_data)

    def run(self):
        """Crawl every index page, saving the first image of each thread."""
        # Make sure the output folder exists.
        if not os.path.exists('image'):
            os.mkdir('image')
        next_url = self.url
        data = self.get_data(next_url)
        # Dump the first page so the markup can be inspected offline.
        with open('tieba.json', 'wb') as f:
            f.write(data)
        while next_url:
            # Thread titles/urls of the current page, plus the next page url.
            detail_list, next_url = self.parse_page(data)
            for url in self.parse_detail(detail_list):
                self.save_data(url)
            # Fix: fetch the next page only if one exists (the original
            # called get_data(None) after the last page).
            if next_url:
                data = self.get_data(next_url)
if __name__ == '__main__':
    # Crawl the '美女' forum.
    TieBa('美女').run()