1 Using crawler parsing libraries
XPath parsing library
To parse pages with XPath you first need to install the lxml library:
pip3 install lxml
Beautiful Soup parsing library
pip3 install beautifulsoup4
You also need to install requests:
pip3 install requests
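As a quick sanity check once the installs finish, the three packages can be imported and their versions printed. A minimal sketch (the printed version strings are simply whatever your environment reports):
from lxml import etree
import bs4
import requests

# Print the installed versions to confirm the installs worked
print(etree.__version__)      # lxml version
print(bs4.__version__)        # beautifulsoup4 version
print(requests.__version__)   # requests version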
2 Getting data from JS
Grab the data that the backend interface passes to the page's JS:
import re
import json
import requests

# Fetch the page
def get_page():
    # Target URL
    url = 'http://cd.meituan.com/meishi/b6119/'
    # Spoofed request headers
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    # Request the page and return the response body
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.content.decode('utf-8')
    return None

def main():
    html = get_page()
    print(html)
    # Pull out the JSON that the page embeds for its front-end JS
    pattern = re.compile('"poiInfos":(.*?)},"comHeader"', re.S)
    result = re.findall(pattern, html)
    print(result)
    meituan = json.loads(result[0])
    print(len(meituan))
    for item in meituan:
        print(item['title'])

if __name__ == '__main__':
    main()
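The same regex-plus-json.loads technique can be tried on any page that embeds its data in a script block. A minimal sketch (the HTML string and the "poiInfos" payload below are invented for illustration, and the pattern is adjusted to fit this invented snippet rather than the real Meituan page):
import re
import json

# Hypothetical page fragment with JSON embedded for the front-end JS
html = '<script>window.__DATA__ = {"poiInfos":[{"title":"Shop A"},{"title":"Shop B"}],"comHeader":{}};</script>'

# re.S lets "." also match newlines, so the pattern works across line breaks
pattern = re.compile(r'"poiInfos":(.*?),"comHeader"', re.S)
match = pattern.search(html)
if match:
    pois = json.loads(match.group(1))   # the captured text is a JSON array
    for poi in pois:
        print(poi['title'])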
3 Using lxml
import requests
import re
from lxml import etree

# Fetch the page HTML
def get_one_page():
    url = "https://www.douban.com/group/explore"
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        text = response.content.decode('utf-8')
        return text
    return None

# Parse the page
def parse_with_xpath(html):
    etree_html = etree.HTML(html)
    print(etree_html)

    channel_result = etree_html.xpath('//div[@class="channel-item"]')
    for channel in channel_result:
        title = channel.xpath('./div[@class="bd"]/h3/a/text()')[0]
        print(title)

    title_result = etree_html.xpath('//div[@class="channel-item"]/div[@class="bd"]/h3/a/text()')
    print(title_result)

    # Match all nodes: //* ; a double slash matches at any depth below the root,
    # a single slash selects direct children of the previous node
    result = etree_html.xpath('//*')
    print(result)
    print(len(result))

    # Match all a nodes anywhere: //a ; text() extracts their text
    result = etree_html.xpath('//a/text()')
    print(result)

    # Select direct child elements: /
    result = etree_html.xpath('//div/p/text()')
    print(result)

    # Select descendant elements at any depth: //
    result = etree_html.xpath('//div[@class="channel-item"]')
    print(len(result))
    result = etree_html.xpath('//div[@class="channel-item"] | //span[@class="pubtime"]/../span/a/text()')
    print(result)

    # Parent node: ..
    result = etree_html.xpath('//span[@class="pubtime"]/../span/a/text()')
    print(result)

    # Attribute match: [@class="xxx"]
    # Text match: text() ; //text() gets all text below the node
    result = etree_html.xpath('//div[@class="article"]//text()')
    print(result)

    # Attribute extraction: @href
    result = etree_html.xpath('//div[@class="article"]/div/div/@class')[0]
    print(result)
    result = etree_html.xpath('//div[@class="bd"]/h3/a/@href')
    print(result)

    # Multi-valued attribute match: contains(@class, "xx")
    result = etree_html.xpath('//div[contains(@class, "grid-16-8")]//div[@class="likes"]/text()[1]')
    print(result)

    # Multiple conditions and operators: or, and, mod, //book | //cd, + - * div = != < > <= >=
    result = etree_html.xpath('//span[@class="pubtime" and contains(text(), "-12-29")]/text()')
    print(result)

    # Selecting by position: [1] [last()] [position() < 3] [last() - 2]
    # Node axes
    result = etree_html.xpath('//div/child::div[@class="likes"]/following-sibling::*//span[@class="pubtime"]/text()')
    print(result)
    print(len(result))

    # //li/ancestor::*   all ancestor nodes
    # //li/ancestor::div   only the div ancestors
    # //li/attribute::*   attribute axis: all attribute values of the li node
    # //li/child::a[@href="link1.html"]   child axis: direct children only
    # //li/descendant::span   all span descendants
    # //li/following::*   every node after the closing tag of the current node
    # //li/following-sibling::*   all siblings that follow the current node
    result = etree_html.xpath('//div[@class="channel-item"][1]/following-sibling::*')
    print(result)
    print(len(result))
    result = etree_html.xpath('//div[contains(@class, "channel-group-rec")]//div[@class="title"]/following::*[1]/text()')
    print(result)

def main():
    html = get_one_page()
    print(html)
    parse_with_xpath(html)

if __name__ == '__main__':
    main()
channel_result = etree_html.xpath('//div[@class="channel-item"]') selects the div nodes whose class attribute is channel-item.
channel_result = etree_html.xpath('//div[@class="channel-item"]/text()') gets only the text sitting directly inside those nodes (text in child tags is not included).
channel_result = etree_html.xpath('//div[@class="channel-item"]//text()') gets all text inside those nodes, including text in child tags.
A double slash matches descendants at any depth; a single slash followed by text() returns the text directly inside a tag; //text() returns all text, including that of descendant tags.
/ selects a tag's direct children; // selects all of its descendants; ./ selects children of the current tag; .// selects all descendants of the current tag.
result = etree_html.xpath('//div[@class="bd"]/h3/a/@href') extracts an attribute value.
| is a union: it selects several node sets in one expression.
Selecting by position: [1] the first node, [last()] the last, [position() < 3] the first two, [last() - 2] the third from the end.
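A self-contained sketch of those expressions on a small inline HTML string (the markup below is invented for illustration, not the Douban page):
from lxml import etree

# Hypothetical markup, just to exercise the expressions above
html = '''
<div class="channel-item"><div class="bd"><h3><a href="/a">First <b>post</b></a></h3></div></div>
<div class="channel-item"><div class="bd"><h3><a href="/b">Second</a></h3></div></div>
<div class="channel-item"><div class="bd"><h3><a href="/c">Third</a></h3></div></div>
'''
root = etree.HTML(html)

print(root.xpath('//div[@class="channel-item"]'))                  # all three channel-item divs
print(root.xpath('//div[@class="bd"]/h3/a/@href'))                 # ['/a', '/b', '/c']  (attribute extraction)
print(root.xpath('//div[@class="channel-item"][1]//a/text()'))     # ['First ']  direct text only
print(root.xpath('//div[@class="channel-item"][1]//a//text()'))    # ['First ', 'post']  includes text inside <b>
print(root.xpath('//a/text() | //a/@href'))                        # union of two node sets via |
print(root.xpath('//div[@class="channel-item"][position() < 3]'))  # the first two items
print(root.xpath('(//div[@class="channel-item"])[last()]'))        # the last item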
4 Using BeautifulSoup
from bs4 import BeautifulSoup
import requests
import re

# Fetch the page HTML
def get_one_page():
    url = "http://sports.sina.com.cn/nba/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        text = response.content.decode('utf-8')
        return text
    return None

def parse_with_bs4(html):
    # html = '<p><div><a></a></div></p>'
    # print(html)
    soup = BeautifulSoup(html, 'lxml')
    # Pretty-print the page with consistent indentation
    print(soup.prettify())
    # Text of the title tag inside head
    print(soup.title.string)
    # Grab a whole tag by name
    print(soup.head)
    print(type(soup.head))
    print(soup.p)
    print(soup.p.name)
    print(soup.img.attrs["src"])
    print(soup.img.attrs)
    print(soup.img.attrs['src'])
    print(soup.img['src'])
    print(soup.p)
    print(soup.p.contents)     # list of the p node's direct children
    print(soup.p.descendants)  # generator over all descendants of the p node
    # Ancestors of the a node:
    # index [0] is its direct parent, [1] is one level further up
    print(soup.a.parent)         # direct parent
    print(soup.a.parents)        # generator over all ancestors
    print(list(soup.a.parents))  # all ancestors as a list
    print(list(soup.a.parents)[0].attrs['class'])
    print(soup.head.title.string)

    result = soup.select('.news-list-b .list .item p a')
    for item in result:
        print(item.string)
        print(item['href'])

    result = soup.select('.-live-layout-row.layout_sports_350_650')
    print(result)

    l = soup.select('.ct_t_01 a')
    for item in l:
        print(item.string)
        print(item['href'])
    print(len(l))

    item = soup.select('#syncad_1 p')[0]
    print(item)
    print(item.contents)
    print(len(item.contents))

    item = soup.select('.b_time')[0].string
    print(item)

def main():
    html = get_one_page()
    # print(html)
    parse_with_bs4(html)

if __name__ == '__main__':
    main()
soup = BeautifulSoup(html, 'lxml') builds the soup object.
soup.prettify() pretty-prints the whole parsed document with consistent indentation.
soup.title.string gets the text inside the title tag; soup.head returns the head tag together with everything inside it.
soup.p.name gets the tag name of the p node (here 'p'); soup.img['src'] gets the src attribute of the img tag.
soup.p.contents is the list of the p node's direct children.
soup.p.descendants iterates over all descendants of the p node; soup.a.parent is the direct parent; soup.a.parents iterates over all ancestors; in list(soup.a.parents)[0].attrs['class'], index [0] is the direct parent and [1] is one level further up.
result = soup.select('.news-list-b .list .item p a')
for item in result:
    print(item.string)
    print(item['href'])
This selects the a tags under the listed classes and prints their text and href attributes.
result = soup.select('.-live-layout-row.layout_sports_350_650')
Two class names joined by a dot with no space match elements that carry both classes at once.
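A self-contained sketch of these calls on an inline HTML string (the markup is invented for illustration; on a real page such as the Sina NBA one, the class names would of course differ):
from bs4 import BeautifulSoup

# Hypothetical markup, just to exercise the calls described above
html = '''
<html><head><title>Demo page</title></head>
<body>
  <div class="news-list-b"><div class="list">
    <div class="item"><p><a href="/news/1">First headline</a></p></div>
    <div class="item"><p><a href="/news/2">Second headline</a></p></div>
  </div></div>
</body></html>
'''
soup = BeautifulSoup(html, 'lxml')

print(soup.title.string)                 # Demo page
print(soup.a.parent.name)                # p, the direct parent of the first a
print([t.name for t in soup.a.parents])  # ['p', 'div', 'div', 'div', 'body', 'html', '[document]']
for item in soup.select('.news-list-b .list .item p a'):
    print(item.string, item['href'])     # headline text and link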
5 Storing the data in a database
5.1 Table creation statements
create database maoyan default character set='utf8';
use maoyan;
create table movie (
id int primary key auto_increment,
title varchar(256),
actor varchar(256),
detail varchar(1024),
cover_url varchar(1024)
);
5.2 Helper functions for storing data
import pymysql

# Get a database connection
def get_connection():
    host = '127.0.0.1'
    port = 3306
    user = 'root'
    password = '123456'
    database = 'maoyan'
    # Recent PyMySQL versions expect keyword arguments here
    db = pymysql.connect(host=host, port=port, user=user,
                         password=password, database=database, charset='utf8')
    return db

# Get a database cursor
def get_cursor(db):
    cursor = db.cursor()
    return cursor

# Close the connection
def close_connection(db):
    db.close()

# Insert one record
def insert_record(db, cursor, item):
    # Note: string formatting breaks if a value contains quotes;
    # cursor.execute(sql, params) with %s placeholders is the safer form
    sql = 'insert into movie (title, actor, detail, cover_url) values("%s","%s","%s","%s")' % (
        item['movie_name'], item['actor'], item['detail'], item['cover'])
    print(sql)
    cursor.execute(sql)
    # The insert only reaches the database after commit()
    db.commit()
5.3 Storing data from the crawler
from maoyan_db_helper import *

db = get_connection()
cursor = get_cursor(db)
# details and covers are the lists produced by the crawler's parsing step
for i in range(len(details)):
    simple_message = {}
    simple_message['cover'] = covers[i]
    # movie_name, actor and detail are filled from the parsed results in the same way (omitted here)
    insert_record(db, cursor, simple_message)
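For reference, a minimal end-to-end sketch of how the helpers from 5.2 fit together, assuming they live in maoyan_db_helper.py as above (the item values are placeholders, not scraped data):
from maoyan_db_helper import get_connection, get_cursor, close_connection, insert_record

db = get_connection()
cursor = get_cursor(db)

# Placeholder record; in the real crawler each field comes from the parsed page
item = {
    'movie_name': 'Example Title',
    'actor': 'Example Actor',
    'detail': 'Example synopsis',
    'cover': 'http://example.com/cover.jpg',
}
insert_record(db, cursor, item)

close_connection(db)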