1.爬蟲基礎
1.1獲取網址
# Section 1 walkthrough: send a GET request and inspect the response object.
# Imported here because this snippet runs before the file's later import lines.
import requests

url = 'https://www.baidu.com'
# requests applies no timeout by default, so set one to avoid hanging forever.
response = requests.get(url, timeout=10)

# 1.2 Response body decoded to str.
print(response.text)

# 1.3 Raw response body as bytes.
print(response.content)

# 1.4 Response headers.
print(response.headers)

# 1.5 HTTP status code.
print(response.status_code)

# 1.6 Send a browser-like User-Agent *request* header.
# The article reports that a request without such a header returned 400:
#     resp = requests.get('https://www.zhihu.com/')
#     print(resp.status_code)
# Define the request headers as a dict.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
# Actually pass the headers; the original built the dict but never sent it.
resp = requests.get('https://pvp.qq.com/', headers=headers, timeout=10)
print(resp.status_code)
# The article reports 200 for this request.
2.靜態網頁爬蟲
2.1導入lxml庫
# Section 2 walkthrough: parse a local HTML file with lxml and query it via XPath.
from lxml import html

# 2.2 Open and read the local HTML file.
with open('./index.html', 'r', encoding='utf-8') as f:
    html_data = f.read()
print(html_data)

# 2.3 Parse the markup into an element that supports XPath queries.
selector = html.fromstring(html_data)
# Append text() to the path to get a tag's text content.
h1 = selector.xpath('/html/body/h1/text()')
print(h1[0])

# 2.4 A leading // starts the match from any position in the document.
# Pattern: //tag1[@attr="value"]/tag2[@attr="value"].../text()
a = selector.xpath('//div[@id="container"]/a/text()')
print(a)
3.動態網頁爬蟲(當當網和電影網)
3.1導入庫
import requests
from lxml import html
import pandas as pd
from matplotlib import pyplot as plt
3.2設置請求頭和url
在瀏覽器中按F12,點擊Network,刷新頁面,在下方的Name列表中任選一項,在右側的信息中查看User-Agent
def spider_dangdang(isbn):
    """Scrape Dangdang search results for a book and report the cheapest offers.

    Downloads the search page for *isbn*, collects title, link, price and
    seller for every listing, sorts the listings by ascending price, shows a
    horizontal bar chart of up to ten cheapest offers and writes every
    listing to ``dangdang.csv`` in the current working directory.

    Relies on the module-level names ``requests``, ``html`` (lxml), ``pd``
    and ``plt``.

    Args:
        isbn: Search keyword interpolated into the query string; the article
            passes an ISBN such as ``'9787115428028'``.

    Returns:
        None. Results are printed, plotted and saved to disk.
    """
    # 3.2 Browser-like User-Agent request header and target URL.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
    # One dict per listing: title, link, price, store.
    book_list = []
    url = 'http://search.dangdang.com/?key={}&act=input'.format(isbn)

    # 3.3 Fetch the page and take the body as str.
    # requests applies no timeout by default, so set one to avoid hanging.
    resp = requests.get(url, headers=headers, timeout=10)
    html_data = resp.text

    # 3.4 Select one <li> per listing in the search results.
    selector = html.fromstring(html_data)
    ul_list = selector.xpath('//div[@id="search_nature_rg"]/ul/li')
    print('共有{}家店鋪售賣此書'.format(len(ul_list)))

    # 3.5 Extract the wanted fields from every listing.
    for li in ul_list:
        titles = li.xpath('./a/@title')
        links = li.xpath('a/@href')
        prices = li.xpath('./p[@class="price"]/span[@class="search_now_price"]/text()')
        if not (titles and links and prices):
            # Incomplete listing: skip it instead of failing on an empty result.
            continue
        sellers = li.xpath('./p[@class="search_shangjia"]/a/text()')
        book_list.append({
            'title': titles[0].strip(),
            'link': links[0],
            # Strip the currency sign before converting to a number.
            'price': float(prices[0].replace('¥', '')),
            # Fallback label when the listing names no seller. The pasted
            # source had pinyin artefacts inside this literal; restored here.
            'store': sellers[0] if sellers else '當當自營',
        })

    # Cheapest first.
    book_list.sort(key=lambda book: book['price'])

    # 3.6 Horizontal bar chart of up to ten cheapest offers. Slicing copes
    # with fewer than ten listings, where indexing 0..9 raised IndexError.
    cheapest = book_list[:10]
    x = [book['store'] for book in cheapest]
    y = [book['price'] for book in cheapest]
    plt.barh(x, y)
    plt.show()

    # 3.7 Save every listing to a CSV file.
    df = pd.DataFrame(book_list)
    df.to_csv('dangdang.csv')
#以上步驟均是在函數spider_dangdang中執行
3.8調用函數
# Number of the book to look up: 9787115428028
spider_dangdang('9787115428028')
4.對豆瓣網爬蟲
#電影名,上映日期,類型,上映國家,想看人數
#根據想看人數進行排序
#繪製即將上映電影國家的占比圖
#繪製top5最想看的電影
#請求遠程端站點
import requests
from lxml import html
import pandas as pd
from matplotlib import pyplot as plt
# Use SimHei as the sans-serif font; presumably so the Chinese chart labels
# render correctly (assumes the font is installed; confirm).
plt.rcParams["font.sans-serif"] = ['SimHei']
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False
# Country -> number of upcoming films; refreshed on every spider_douban() call.
counts = {}


def spider_douban():
    """Scrape Douban's upcoming-films page for Chongqing, then chart and save it.

    For every film the title, release date, genre, country and "want to see"
    count are printed and collected. The films are sorted by descending
    "want to see" count. A pie chart shows the share of up to four countries
    (in first-seen order of the sorted list), a bar chart shows up to five
    most-wanted films, and all films are written to ``douban.csv`` in the
    current working directory.

    Relies on the module-level names ``requests``, ``html`` (lxml), ``pd``,
    ``plt`` and the module-level dict ``counts``, which is cleared and
    refilled on each call.

    Returns:
        None. Results are printed, plotted and saved to disk.
    """
    movie_list = []
    url = 'https://movie.douban.com/cinema/later/chongqing/'
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
    # requests applies no timeout by default, so set one to avoid hanging.
    resp = requests.get(url, headers=headers, timeout=10)
    html_data = resp.text

    # Select one element per upcoming film.
    selector = html.fromstring(html_data)
    ul_list = selector.xpath('//div[@id="showing-soon"]/div/div')
    print('您好,共有{}部電影即將在重慶上映'.format(len(ul_list)))

    for li in ul_list:
        titles = li.xpath('./h3/a/text()')
        # Text of the <li> entries; read as date, genre, country in that order.
        details = li.xpath('./ul/li/text()')
        wishes = li.xpath('./ul/li/span/text()')
        if not titles or len(details) < 3 or not wishes:
            # Incomplete entry: skip it instead of failing on a missing index.
            continue

        title = titles[0].strip()
        print(title)
        date, genre, country = details[0], details[1], details[2]
        print(date)
        print(genre)
        print(country)
        num = wishes[0]
        print(num)
        # Drop the textual suffix so the count can be parsed as an integer.
        num = int(num.replace('人想看', ''))

        movie_list.append({
            'title': title,
            'date': date,
            'type': genre,
            'country': country,
            'num': num,
        })

    # Most-wanted films first.
    movie_list.sort(key=lambda movie: movie['num'], reverse=True)
    for movie in movie_list:
        print(movie)

    # Tally films per country. Clearing first keeps a second call from adding
    # to the previous run's totals.
    counts.clear()
    for movie in movie_list:
        country = movie['country']
        # Country strings of length 0 or 1 are ignored, as in the original.
        if len(country) <= 1:
            continue
        counts[country] = counts.get(country, 0) + 1
    items = list(counts.items())
    print(items)

    # Pie chart of up to four countries. Slicing copes with fewer than four,
    # where indexing 0..3 raised IndexError.
    movie_name = []
    people = []
    for role, count in items[:4]:
        print(role, count)
        movie_name.append(role)
        people.append(count)
    if people:
        # Pull the first wedge out; size the list to the actual wedge count.
        explode = [0.1] + [0] * (len(people) - 1)
        plt.pie(people, explode=explode, labels=movie_name, shadow=True, autopct='%1.1f%%')
        plt.axis('equal')  # keep the pie circular rather than elliptical
        plt.show()

    # Bar chart of up to five most-wanted films.
    top5_movie = movie_list[:5]
    print(top5_movie)
    x = [movie['title'] for movie in top5_movie]
    print(x)
    y = [movie['num'] for movie in top5_movie]
    print(y)
    plt.bar(x, y)
    plt.show()

    # Save every film to a CSV file.
    df = pd.DataFrame(movie_list)
    df.to_csv('douban.csv')
# Run the Douban scraper defined above.
spider_douban()
5.電影網爬蟲
import requests
from lxml import html
import pandas as pd
from matplotlib import pyplot as plt
# Use SimHei as the sans-serif font; presumably so the Chinese chart labels
# render correctly (assumes the font is installed; confirm).
plt.rcParams["font.sans-serif"] = ['SimHei']
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False
def spider_film():
    """Scrape Douban's upcoming-films page for Chongqing and chart the top five.

    For every film the title, release date, genre, country and "want to see"
    count are printed and collected. The films are then sorted by descending
    "want to see" count and up to five most-wanted films are shown in a
    horizontal bar chart.

    Relies on the module-level names ``requests``, ``html`` (lxml) and ``plt``.

    Returns:
        None. Results are printed and plotted.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
    film_list = []
    url = 'https://movie.douban.com/cinema/later/chongqing/'

    # Fetch the page and take the body as str.
    # requests applies no timeout by default, so set one to avoid hanging.
    resp = requests.get(url, headers=headers, timeout=10)
    html_data = resp.text

    # Select one element per upcoming film.
    selector = html.fromstring(html_data)
    ul_list = selector.xpath('//div[@id="showing-soon"]/div')
    print('您好,共有{}部電影'.format(len(ul_list)))

    for div in ul_list:
        titles = div.xpath('./div/h3/a/text()')
        # Text of the <li> entries; read as date, genre, country in that order.
        details = div.xpath('./div/ul/li/text()')
        wishes = div.xpath('./div/ul/li[@class="dt last"]/span/text()')
        if not titles or len(details) < 3 or not wishes:
            # Incomplete entry: skip it instead of failing on a missing index.
            continue

        title = titles[0]
        print(title)
        date, style, state = details[0], details[1], details[2]
        print(date)
        print(style)
        print(state)
        # Drop the textual suffix so the count can be parsed as an integer.
        want_people = int(wishes[0].replace('人想看', ''))
        print(want_people)

        film_list.append({
            'title': title,
            'date': date,
            'style': style,
            'state': state,
            'want_people': want_people,
        })

    # Most-wanted first. The original sorted ascending, so its "top 5" was
    # actually the five least-wanted films.
    film_list.sort(key=lambda film: film['want_people'], reverse=True)

    # Slicing copes with fewer than five films, where indexing 0..4 raised
    # IndexError.
    top5_film = film_list[:5]
    x = [film['title'] for film in top5_film]
    y = [film['want_people'] for film in top5_film]
    plt.barh(x, y)
    plt.show()
# Call the function.
spider_film()