爬蟲學習
# -*- coding: utf-8 -*-
# @Time : 2019/7/31 11:28
# @Author : Eric Lee
# @Email : li.yan_li@neusoft.com
# @File : spider_dangdang.py
# @Software: PyCharm
import requests
from lxml import html
def spider_dangdang(isbn):
    """Search dangdang.com for ``isbn`` and print every matching listing.

    Prints the number of result entries found, then for each entry its
    title, purchase link, price (as a float) and seller name. An entry
    without a seller link is reported under the site's own-store label.
    Entries missing a title, link or parsable price are skipped, so fewer
    entries may be printed than the reported count.

    Args:
        isbn: ISBN (or any other keyword) sent as the ``key`` query parameter.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
    # Let requests build the query string so the keyword is URL-encoded,
    # and bound the wait so a stalled connection cannot hang forever.
    resp = requests.get(
        'http://search.dangdang.com/',
        params={'key': isbn, 'act': 'input'},
        headers=headers,
        timeout=10,
    )
    resp.raise_for_status()

    # Parse the page and select one <li> per search result.
    selector = html.fromstring(resp.text)
    ul_list = selector.xpath('//div[@id="search_nature_rg"]/ul/li')
    print('您好,共有{}家店鋪售賣此圖書'.format(len(ul_list)))

    for li in ul_list:
        titles = li.xpath('./a/@title')
        links = li.xpath('./a/@href')
        prices = li.xpath('./p[@class="price"]/span[@class="search_now_price"]/text()')
        # Skip entries that lack any of the required fields instead of
        # crashing on an empty XPath result.
        if not (titles and links and prices):
            continue

        # Accept both the full-width and the half-width yen sign.
        price_text = prices[0].replace('¥', '').replace('¥', '')
        try:
            price = float(price_text)
        except ValueError:
            continue

        # No seller link means the book is sold by the site's own store.
        sellers = li.xpath('./p[@class="search_shangjia"]/a/text()')
        store = sellers[0] if sellers else '當當自營'

        print(titles[0].strip())
        print(links[0])
        print(price)
        print(store)
XPath
XPath 節點
節點
在 XPath 中,有七種類型的節點:元素、屬性、文本、命名空間、處理指令、注釋以及文檔(根)節點。XML 文檔是被作為節點樹來對待的。樹的根被稱為文檔節點或者根節點。
請看下面這個 XML 文檔:
<?xml version="1.0" encoding="UTF-8"?>
<bookstore>
<book>
<title lang="en">Harry Potter</title>
<author>J K. Rowling</author>
<year>2005</year>
<price>29.99</price>
</book>
</bookstore>
上面的 XML 文檔中的節點例子:
<bookstore> (文檔節點)
<author>J K. Rowling</author> (元素節點)
lang="en" (屬性節點)
基本值(或稱原子值,Atomic value)
基本值是無父或無子的節點。
基本值的例子:
J K. Rowling
"en"
項目(Item)
項目是基本值或者節點。
節點關係
父(Parent)
每個元素以及屬性都有一個父。
在下面的例子中,book 元素是 title、author、year 以及 price 元素的父:
<book>
<title>Harry Potter</title>
<author>J K. Rowling</author>
<year>2005</year>
<price>29.99</price>
</book>
子(Children)
元素節點可有零個、一個或多個子。
在下面的例子中,title、author、year 以及 price 元素都是 book 元素的子:
<book>
<title>Harry Potter</title>
<author>J K. Rowling</author>
<year>2005</year>
<price>29.99</price>
</book>
同胞(Sibling)
擁有相同的父的節點
在下面的例子中,title、author、year 以及 price 元素都是同胞:
<book>
<title>Harry Potter</title>
<author>J K. Rowling</author>
<year>2005</year>
<price>29.99</price>
</book>
先輩(Ancestor)
某節點的父、父的父,等等。
在下面的例子中,title 元素的先輩是 book 元素和 bookstore 元素:
<bookstore>
<book>
<title>Harry Potter</title>
<author>J K. Rowling</author>
<year>2005</year>
<price>29.99</price>
</book>
</bookstore>
后代(Descendant)
某個節點的子,子的子,等等。
在下面的例子中,bookstore 的后代是 book、title、author、year 以及 price 元素:
<bookstore>
<book>
<title>Harry Potter</title>
<author>J K. Rowling</author>
<year>2005</year>
<price>29.99</price>
</book>
</bookstore>
選取節點
電影top5
import requests
from lxml import html
import pandas as pd
import jieba
from matplotlib import pyplot as plt
# Use the SimHei font for sans-serif text (presumably so the Chinese chart
# labels render; requires SimHei to be installed -- TODO confirm).
plt.rcParams["font.sans-serif"] = ['SimHei']
# Disable the Unicode minus glyph so an ASCII hyphen is drawn instead.
plt.rcParams['axes.unicode_minus'] = False
def _first(node, path):
    """Return the first result of evaluating ``path`` on ``node``, or None."""
    found = node.xpath(path)
    return found[0] if found else None


def Film():
    """Scrape Douban's upcoming-film page for Chongqing and chart the data.

    Prints the details of every film found (title, release date, genre,
    country, "want to see" count), then the list sorted by that count in
    descending order. Two pie charts are shown afterwards:

    1. the share of "want to see" counts among the top films (up to five);
    2. the share of upcoming films per country.

    Entries missing any field, or whose count cannot be parsed, are
    skipped. If no usable entry is found, no chart is drawn.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = 'https://movie.douban.com/cinema/later/chongqing/'
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
    # Bound the wait so a stalled connection cannot hang forever.
    resp = requests.get(url, headers=header, timeout=10)
    resp.raise_for_status()

    # One <div> per upcoming film.
    selector = html.fromstring(resp.text)
    film = selector.xpath('//div[@id="showing-soon"]/div')
    print(film)

    div_list = []
    for film_list in film:
        title = _first(film_list, './div/h3/a/text()')
        release = _first(film_list, './div/ul/li[1]/text()')
        genre = _first(film_list, './div/ul/li[2]/text()')
        country = _first(film_list, './div/ul/li[3]/text()')
        wanted = _first(film_list, './div/ul/li[4]/span/text()')
        # Skip incomplete entries instead of crashing on an empty result.
        if None in (title, release, genre, country, wanted):
            continue
        print(title)
        print(release)
        print(genre)
        print(country)
        print(wanted)
        # Strip the trailing label to leave only the number.
        try:
            wanted_count = int(wanted.replace('人想看', ''))
        except ValueError:
            continue
        div_list.append({
            'title': title,
            'time': release,
            'type': genre,
            'con': country,
            'number': wanted_count
        })

    # Most-wanted films first.
    div_list.sort(key=lambda item: item['number'], reverse=True)
    print(div_list)
    for items_list in div_list:
        print(items_list)

    if not div_list:
        # Nothing to chart.
        return

    # Pie chart 1: share of "want to see" counts among the top films.
    # Slicing copes with listings that hold fewer than five films.
    top5_store = div_list[:5]
    x = [item['title'] for item in top5_store]
    print(x)
    y = [item['number'] for item in top5_store]
    print(y)
    # Pull out only the first wedge; size the list to the actual data.
    explode = [0.1] + [0] * (len(top5_store) - 1)
    plt.pie(y, explode=explode, labels=x, shadow=True, autopct='%1.1f%%')
    plt.axis('equal')
    plt.legend(loc=2)
    plt.show()

    # Pie chart 2: number of upcoming films per country.
    s = [item['con'] for item in div_list]
    print(s)
    counts = {}
    for word in s:
        counts[word] = counts.get(word, 0) + 1
    print(counts)
    name = list(counts.keys())
    print(name)
    number = list(counts.values())
    print(number)
    # explode must have one entry per wedge, however many countries appear.
    explode1 = [0.1] + [0] * (len(number) - 1)
    plt.pie(number, explode=explode1, labels=name, shadow=True, autopct='%1.1f%%')
    plt.axis('equal')
    plt.legend(loc=2)
    plt.show()
# Run the scraper and display the charts.
Film()