爬蟲
- 需要導入requests、lxml包(繪圖另需matplotlib)
-
思路:
1.url:獲取站點地址
2.headers: 設置請求表頭(如User-Agent),模擬瀏覽器訪問
3.用requests.get(url, headers=headers)接收網頁後臺服務器響應的內容
4.html.fromstring()提取目標站信息
5.使用xpath語法獲取想要的數據
獲取數據的路徑表達式
比如:.xpath('//div[@id="container"]/a/text()')選取id為container的div中a標簽轉為文本格式
6.遍歷出想要統計的數據,進行排序和繪制成圖形展示等操作
實例:從https://movie.douban.com/cinema/later/chongqing/中獲取八月份即將上映的電影名稱,得出上映時間,類型,想看人數,上映電影國家,并將上映國家和最想看的電影top5分別用餅狀圖和柱狀圖展示出來
import requests
from lxml import html
from matplotlib import pyplot as plt
# Configure matplotlib once, before any figure is drawn:
#   - use the SimHei font for sans-serif text (the chart labels are Chinese);
#   - turn off the Unicode minus sign so negative tick labels use a plain
#     hyphen (presumably because SimHei lacks that glyph -- TODO confirm).
plt.rcParams.update({
    "font.sans-serif": ['SimHei'],
    'axes.unicode_minus': False,
})
def _parse_movie(div):
    """Turn one listing ``<div>`` element into a movie record.

    Args:
        div: an lxml element for a single entry under ``#showing-soon``.

    Returns:
        A dict with the keys ``name``, ``time``, ``type``, ``country`` and
        ``person`` (the "want to see" count as an int), or ``None`` when the
        entry has no title or fewer than three ``<li>`` text fields.
    """
    titles = div.xpath('./div/h3/a/text()')
    fields = div.xpath('./div/ul/li/text()')
    if not titles or len(fields) < 3:
        # Incomplete entry: skip it instead of crashing on a missing index.
        return None

    wish_texts = div.xpath('./div/ul/li[@class="dt last"]/span/text()')
    try:
        # The span text has the form "<number>人想看"; keep only the number.
        person = int(wish_texts[0].replace('人想看', '').strip())
    except (IndexError, ValueError):
        # Missing or unexpectedly formatted counter: treat as zero interest.
        person = 0

    return {
        'name': titles[0],
        'time': fields[0],
        'type': fields[1],
        'country': fields[2],
        'person': person,
    }


def _plot_top_movies(movies):
    """Draw a horizontal bar chart of want-to-see counts for ``movies``."""
    names = [movie['name'] for movie in movies]
    print(names)
    wishes = [movie['person'] for movie in movies]
    print(wishes)
    plt.barh(names, wishes)
    plt.show()


def _plot_country_share(movies):
    """Draw a pie chart of how many of ``movies`` come from each country."""
    tally = {}
    for movie in movies:
        tally[movie['country']] = tally.get(movie['country'], 0) + 1
    print(tally.values())
    print(tally.keys())

    counts = list(tally.values())
    labels = list(tally.keys())
    # plt.pie requires len(explode) == len(counts), so size it from the data:
    # pull the first wedge out by 0.1 and leave every other wedge in place.
    explode = [0.1] + [0] * (len(counts) - 1)
    plt.pie(counts, explode=explode, shadow=True, labels=labels, autopct='%1.1f%%')
    plt.legend(loc=2)
    plt.axis('equal')
    plt.show()


def spider_movie(url='https://movie.douban.com/cinema/later/chongqing/', top_n=5):
    """Scrape the upcoming-movies page, print the records and plot two charts.

    The page is fetched with a browser-like User-Agent, every entry under
    ``#showing-soon`` is parsed, the records are sorted by want-to-see count
    (descending) and printed, then a bar chart of the top ``top_n`` movies and
    a pie chart of the country distribution are shown.

    Args:
        url: listing page to scrape; defaults to the Chongqing page.
        top_n: maximum number of movies in the bar chart (fewer are plotted
            when the page lists fewer).

    Returns:
        None. Results are printed and displayed as matplotlib figures.

    Raises:
        requests.RequestException: on network failure, timeout, or a non-2xx
            HTTP status.
    """
    # Request header: the original author notes the request is rejected
    # when no User-Agent is sent.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/75.0.3770.142 Safari/537.36"}
    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an error page instead of parsing it as an empty listing.
    resp.raise_for_status()

    # Parse the HTML and select one <div> per upcoming movie.
    selector = html.fromstring(resp.text)
    div_list = selector.xpath('//div[@id="showing-soon"]/div')
    print('您好,共有{}部電影上映'.format(len(div_list)))

    movie_list = []
    for div in div_list:
        movie = _parse_movie(div)
        if movie is None:
            continue
        print(movie['name'])
        print(movie['time'])     # release date
        print(movie['type'])     # genre
        print(movie['country'])  # production country
        print(movie['person'])   # want-to-see count
        movie_list.append(movie)

    # Most anticipated first.
    movie_list.sort(key=lambda movie: movie['person'], reverse=True)
    for movie in movie_list:
        print(movie)

    if not movie_list:
        # Nothing to plot.
        return

    # Slicing (rather than indexing 0..top_n-1) tolerates short listings.
    _plot_top_movies(movie_list[:max(top_n, 0)])
    _plot_country_share(movie_list)
# Run the scraper at module level: fetches the page, prints the records and
# opens both charts (this also happens if the file is imported).
spider_movie()
爬出數據展示:
數據字典列表
即將上映電影TOP5
即將上映電影國家