1 Using crawler parsing libraries
XPath parsing library
To parse pages with XPath you first need to install the lxml library:
pip3 install lxml
Beautiful Soup parsing library
pip3 install beautifulsoup4
You also need to install requests:
pip3 install requests
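As a quick sanity check once the installs finish, the three packages can be imported and their versions printed. A minimal sketch (the printed version strings are simply whatever your environment reports):
from lxml import etree
import bs4
import requests

# Print the installed versions to confirm the installs worked
print(etree.__version__)      # lxml version
print(bs4.__version__)        # beautifulsoup4 version
print(requests.__version__)   # requests version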
2 Getting data from JS
Grab the data that the backend interface passes to the page's JS:
import re
import json
import requests

# Fetch the page
def get_page():
    # Target URL
    url = 'http://cd.meituan.com/meishi/b6119/'
    # Spoofed request headers
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    # Request the page and return the response body
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.content.decode('utf-8')
    return None

def main():
    html = get_page()
    print(html)
    # Pull out the JSON that the page embeds for its front-end JS
    pattern = re.compile('"poiInfos":(.*?)},"comHeader"', re.S)
    result = re.findall(pattern, html)
    print(result)
    meituan = json.loads(result[0])
    print(len(meituan))
    for item in meituan:
        print(item['title'])

if __name__ == '__main__':
    main()
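The same regex-plus-json.loads technique can be tried on any page that embeds its data in a script block. A minimal sketch (the HTML string and the "poiInfos" payload below are invented for illustration, and the pattern is adjusted to fit this invented snippet rather than the real Meituan page):
import re
import json

# Hypothetical page fragment with JSON embedded for the front-end JS
html = '<script>window.__DATA__ = {"poiInfos":[{"title":"Shop A"},{"title":"Shop B"}],"comHeader":{}};</script>'

# re.S lets "." also match newlines, so the pattern works across line breaks
pattern = re.compile(r'"poiInfos":(.*?),"comHeader"', re.S)
match = pattern.search(html)
if match:
    pois = json.loads(match.group(1))   # the captured text is a JSON array
    for poi in pois:
        print(poi['title'])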
3 Using lxml
import requests
import re
from lxml import etree

# Fetch the page HTML
def get_one_page():
    url = "https://www.douban.com/group/explore"
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        text = response.content.decode('utf-8')
        return text
    return None

# Parse the page
def parse_with_xpath(html):
    etree_html = etree.HTML(html)
    print(etree_html)

    channel_result = etree_html.xpath('//div[@class="channel-item"]')
    for channel in channel_result:
        title = channel.xpath('./div[@class="bd"]/h3/a/text()')[0]
        print(title)

    title_result = etree_html.xpath('//div[@class="channel-item"]/div[@class="bd"]/h3/a/text()')
    print(title_result)

    # Match all nodes: //* ; a double slash matches at any depth below the root,
    # a single slash selects direct children of the previous node
    result = etree_html.xpath('//*')
    print(result)
    print(len(result))

    # Match all a nodes anywhere: //a ; text() extracts their text
    result = etree_html.xpath('//a/text()')
    print(result)

    # Select direct child elements: /
    result = etree_html.xpath('//div/p/text()')
    print(result)

    # Select descendant elements at any depth: //
    result = etree_html.xpath('//div[@class="channel-item"]')
    print(len(result))
    result = etree_html.xpath('//div[@class="channel-item"] | //span[@class="pubtime"]/../span/a/text()')
    print(result)

    # Parent node: ..
    result = etree_html.xpath('//span[@class="pubtime"]/../span/a/text()')
    print(result)

    # Attribute match: [@class="xxx"]
    # Text match: text() ; //text() gets all text below the node
    result = etree_html.xpath('//div[@class="article"]//text()')
    print(result)

    # Attribute extraction: @href
    result = etree_html.xpath('//div[@class="article"]/div/div/@class')[0]
    print(result)
    result = etree_html.xpath('//div[@class="bd"]/h3/a/@href')
    print(result)

    # Multi-valued attribute match: contains(@class, "xx")
    result = etree_html.xpath('//div[contains(@class, "grid-16-8")]//div[@class="likes"]/text()[1]')
    print(result)

    # Multiple conditions and operators: or, and, mod, //book | //cd, + - * div = != < > <= >=
    result = etree_html.xpath('//span[@class="pubtime" and contains(text(), "-12-29")]/text()')
    print(result)

    # Selecting by position: [1] [last()] [position() < 3] [last() - 2]
    # Node axes
    result = etree_html.xpath('//div/child::div[@class="likes"]/following-sibling::*//span[@class="pubtime"]/text()')
    print(result)
    print(len(result))

    # //li/ancestor::*   all ancestor nodes
    # //li/ancestor::div   only the div ancestors
    # //li/attribute::*   attribute axis: all attribute values of the li node
    # //li/child::a[@href="link1.html"]   child axis: direct children only
    # //li/descendant::span   all span descendants
    # //li/following::*   every node after the closing tag of the current node
    # //li/following-sibling::*   all siblings that follow the current node
    result = etree_html.xpath('//div[@class="channel-item"][1]/following-sibling::*')
    print(result)
    print(len(result))
    result = etree_html.xpath('//div[contains(@class, "channel-group-rec")]//div[@class="title"]/following::*[1]/text()')
    print(result)

def main():
    html = get_one_page()
    print(html)
    parse_with_xpath(html)

if __name__ == '__main__':
    main()
channel_result = etree_html.xpath('//div[@class="channel-item"]') selects the div nodes whose class attribute is channel-item.
channel_result = etree_html.xpath('//div[@class="channel-item"]/text()') gets only the text sitting directly inside those nodes (text in child tags is not included).
channel_result = etree_html.xpath('//div[@class="channel-item"]//text()') gets all text inside those nodes, including text in child tags.
A double slash matches descendants at any depth; a single slash followed by text() returns the text directly inside a tag; //text() returns all text, including that of descendant tags.
/ selects a tag's direct children; // selects all of its descendants; ./ selects children of the current tag; .// selects all descendants of the current tag.
result = etree_html.xpath('//div[@class="bd"]/h3/a/@href') extracts an attribute value.
| is a union: it selects several node sets in one expression.
Selecting by position: [1] the first node, [last()] the last, [position() < 3] the first two, [last() - 2] the third from the end.
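A self-contained sketch of those expressions on a small inline HTML string (the markup below is invented for illustration, not the Douban page):
from lxml import etree

# Hypothetical markup, just to exercise the expressions above
html = '''
<div class="channel-item"><div class="bd"><h3><a href="/a">First <b>post</b></a></h3></div></div>
<div class="channel-item"><div class="bd"><h3><a href="/b">Second</a></h3></div></div>
<div class="channel-item"><div class="bd"><h3><a href="/c">Third</a></h3></div></div>
'''
root = etree.HTML(html)

print(root.xpath('//div[@class="channel-item"]'))                  # all three channel-item divs
print(root.xpath('//div[@class="bd"]/h3/a/@href'))                 # ['/a', '/b', '/c']  (attribute extraction)
print(root.xpath('//div[@class="channel-item"][1]//a/text()'))     # ['First ']  direct text only
print(root.xpath('//div[@class="channel-item"][1]//a//text()'))    # ['First ', 'post']  includes text inside <b>
print(root.xpath('//a/text() | //a/@href'))                        # union of two node sets via |
print(root.xpath('//div[@class="channel-item"][position() < 3]'))  # the first two items
print(root.xpath('(//div[@class="channel-item"])[last()]'))        # the last item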
4 Using BeautifulSoup
from bs4 import BeautifulSoup
import requests
import re

# Fetch the page HTML
def get_one_page():
    url = "http://sports.sina.com.cn/nba/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        text = response.content.decode('utf-8')
        return text
    return None

def parse_with_bs4(html):
    # html = '<p><div><a></a></div></p>'
    # print(html)
    soup = BeautifulSoup(html, 'lxml')
    # Pretty-print the page with consistent indentation
    print(soup.prettify())
    # Text of the title tag inside head
    print(soup.title.string)
    # Grab a whole tag by name
    print(soup.head)
    print(type(soup.head))
    print(soup.p)
    print(soup.p.name)
    print(soup.img.attrs["src"])
    print(soup.img.attrs)
    print(soup.img.attrs['src'])
    print(soup.img['src'])
    print(soup.p)
    print(soup.p.contents)     # list of the p node's direct children
    print(soup.p.descendants)  # generator over all descendants of the p node
    # Ancestors of the a node:
    # index [0] is its direct parent, [1] is one level further up
    print(soup.a.parent)         # direct parent
    print(soup.a.parents)        # generator over all ancestors
    print(list(soup.a.parents))  # all ancestors as a list
    print(list(soup.a.parents)[0].attrs['class'])
    print(soup.head.title.string)

    result = soup.select('.news-list-b .list .item p a')
    for item in result:
        print(item.string)
        print(item['href'])

    result = soup.select('.-live-layout-row.layout_sports_350_650')
    print(result)

    l = soup.select('.ct_t_01 a')
    for item in l:
        print(item.string)
        print(item['href'])
    print(len(l))

    item = soup.select('#syncad_1 p')[0]
    print(item)
    print(item.contents)
    print(len(item.contents))

    item = soup.select('.b_time')[0].string
    print(item)

def main():
    html = get_one_page()
    # print(html)
    parse_with_bs4(html)

if __name__ == '__main__':
    main()
soup = BeautifulSoup(html, 'lxml') builds the soup object.
soup.prettify() pretty-prints the whole parsed document with consistent indentation.
soup.title.string gets the text inside the title tag; soup.head returns the head tag together with everything inside it.
soup.p.name gets the tag name of the p node (here 'p'); soup.img['src'] gets the src attribute of the img tag.
soup.p.contents is the list of the p node's direct children.
soup.p.descendants iterates over all descendants of the p node; soup.a.parent is the direct parent; soup.a.parents iterates over all ancestors; in list(soup.a.parents)[0].attrs['class'], index [0] is the direct parent and [1] is one level further up.
result = soup.select('.news-list-b .list .item p a')
for item in result:
    print(item.string)
    print(item['href'])
This selects the a tags under the listed classes and prints their text and href attributes.
result = soup.select('.-live-layout-row.layout_sports_350_650')
Two class names joined by a dot with no space match elements that carry both classes at once.
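A self-contained sketch of these calls on an inline HTML string (the markup is invented for illustration; on a real page such as the Sina NBA one, the class names would of course differ):
from bs4 import BeautifulSoup

# Hypothetical markup, just to exercise the calls described above
html = '''
<html><head><title>Demo page</title></head>
<body>
  <div class="news-list-b"><div class="list">
    <div class="item"><p><a href="/news/1">First headline</a></p></div>
    <div class="item"><p><a href="/news/2">Second headline</a></p></div>
  </div></div>
</body></html>
'''
soup = BeautifulSoup(html, 'lxml')

print(soup.title.string)                 # Demo page
print(soup.a.parent.name)                # p, the direct parent of the first a
print([t.name for t in soup.a.parents])  # ['p', 'div', 'div', 'div', 'body', 'html', '[document]']
for item in soup.select('.news-list-b .list .item p a'):
    print(item.string, item['href'])     # headline text and link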
5 Storing the data in a database
5.1 Table creation statements
create database maoyan default character set='utf8';
use maoyan;
create table movie (
id int primary key auto_increment,
title varchar(256),
actor varchar(256),
detail varchar(1024),
cover_url varchar(1024)
);
5.2 Helper functions for storing data
import pymysql

# Get a database connection
def get_connection():
    host = '127.0.0.1'
    port = 3306
    user = 'root'
    password = '123456'
    database = 'maoyan'
    # Recent PyMySQL versions expect keyword arguments here
    db = pymysql.connect(host=host, port=port, user=user,
                         password=password, database=database, charset='utf8')
    return db

# Get a database cursor
def get_cursor(db):
    cursor = db.cursor()
    return cursor

# Close the connection
def close_connection(db):
    db.close()

# Insert one record
def insert_record(db, cursor, item):
    # Note: string formatting breaks if a value contains quotes;
    # cursor.execute(sql, params) with %s placeholders is the safer form
    sql = 'insert into movie (title, actor, detail, cover_url) values("%s","%s","%s","%s")' % (
        item['movie_name'], item['actor'], item['detail'], item['cover'])
    print(sql)
    cursor.execute(sql)
    # The insert only reaches the database after commit()
    db.commit()
5.3 Storing data from the crawler
from maoyan_db_helper import *

db = get_connection()
cursor = get_cursor(db)
# details and covers are the lists produced by the crawler's parsing step
for i in range(len(details)):
    simple_message = {}
    simple_message['cover'] = covers[i]
    # movie_name, actor and detail are filled from the parsed results in the same way (omitted here)
    insert_record(db, cursor, simple_message)
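For reference, a minimal end-to-end sketch of how the helpers from 5.2 fit together, assuming they live in maoyan_db_helper.py as above (the item values are placeholders, not scraped data):
from maoyan_db_helper import get_connection, get_cursor, close_connection, insert_record

db = get_connection()
cursor = get_cursor(db)

# Placeholder record; in the real crawler each field comes from the parsed page
item = {
    'movie_name': 'Example Title',
    'actor': 'Example Actor',
    'detail': 'Example synopsis',
    'cover': 'http://example.com/cover.jpg',
}
insert_record(db, cursor, item)

close_connection(db)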