from lxml import etree
from bs4 import BeautifulSoup
import re
html = """
<!DOCTYPE html>
<html>
<head>
<title>xpath test</title>
</head>
<body>
<div price="99.8">
<div>
<ul>
<li>時間</li>
<li>地點</li>
<li>任務</li>
</ul>
</div>
<div id='testid' data-h="first">
<h2>這里是個小標題</h2>
<ol>
<li data="one">1</li>
<li data="two">2</li>
<li data="three">3</li>
</ol>
<ul>
<li code="84">84</li>
<li code="104">104</li>
<li code="223">223</li>
</ul>
</div>
<div>
<h3>這里是H3的內容
<a >百度一下</a>
<ul>
<li>test1</li>
<li>test2</li>
</ul>
</h3>
</div>
<div id="go">
<ul>
<li>1</li>
<li>2</li>
<li>3</li>
<li>4</li>
<li>5</li>
<li>6</li>
<li>7</li>
<li>8</li>
<li>9</li>
<li>10</li>
</ul>
</div>
</div>
</body>
</html>
"""
def title():
    # Method 1: XPath
    html_etree = etree.HTML(html)
    # print(type(html_etree))  # <class 'lxml.etree._Element'>
    # result = etree.tostring(html_etree)  # the parser repairs incomplete tags; tostring() serializes the fixed tree
    # print(result.decode('utf-8'))  # tostring() returns bytes, so decode() it to str
    title_xpath1 = html_etree.xpath('/html/head/title/text()')  # text() extracts the text content
    print('title extracted with an absolute XPath:', title_xpath1)  # xpath() returns a list
    title_xpath2 = html_etree.xpath('//head/title/text()')  # same result: / starts at the root, // searches the whole document
    print('title extracted with a relative XPath:', title_xpath2)
    # Method 2: BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    # print(soup)
    # print(type(soup))  # <class 'bs4.BeautifulSoup'>
    title_soup = soup.select('title')  # soup.select() also returns a list; take an element out and call get_text() on it
    # CSS selectors: tag names are written bare, class names are prefixed with '.', ids with '#', and selectors can be combined
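    # As a hedged sketch of combined selectors (not in the original), either of these
    # should also match elements in the sample HTML above:
    # title_soup = soup.select('head > title')
    # h2_soup = soup.select('#testid > h2')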
    # print(title_soup)
    # print(type(title_soup))  # list
    title_BeautifulSoup = title_soup[0].get_text()
    # title_BeautifulSoup = soup.title.get_text()
    print('title extracted with BeautifulSoup:', title_BeautifulSoup)
    # Method 3: regular expression
    re_pattern = re.compile(r'<title>(.*?)</title>', re.S)  # (.*?) is the group to capture; re.S lets . match newlines
    # print(type(re_pattern))  # re.compile() returns a compiled pattern: <class 're.Pattern'>
    title_re_compile = re.findall(re_pattern, html)
    print('title extracted with a regular expression:', title_re_compile)
    # re.compile() can also be skipped
    title_re = re.findall(r'<title>(.*?)</title>', html)
    print('title extracted with a regex, skipping re.compile:', title_re)
def price():
    # Method 1: XPath
    html_etree = etree.HTML(html)
    # price_xpath = html_etree.xpath('/html/body/div/@price')
    # price_xpath = html_etree.xpath('/html/body/child::*/@price')  # child::* selects all child elements of the current node
    # price_xpath = html_etree.xpath('/html/body/child::div/@price')  # child::div selects only the <div> children
    # price_xpath = html_etree.xpath('//@price')  # relative path; there is only one price attribute in the document
    # price_xpath = html_etree.xpath("//div[@id='testid']/ancestor::div")  # ancestor:: selects all <div> ancestors
    # price_xpath = html_etree.xpath("//div[@id='testid']/ancestor::div/@price")  # price attribute of the ancestor <div>
    price_xpath = html_etree.xpath("//div[@id='testid']/ancestor-or-self::div/@price")  # ancestor-or-self:: also includes the node itself
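    # Only the outermost <div> in the sample carries a price attribute, so this
    # should print ['99.8'] (expected result inferred from the HTML above).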
    print('price extracted with XPath:', price_xpath)
    # Method 2: BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    price_BeautifulSoup = soup.div.attrs['price']
    # price_BeautifulSoup = soup.find('div').attrs['price']
    # price_BeautifulSoup = soup.select('div')[0].attrs['price']
    print('price extracted with BeautifulSoup:', price_BeautifulSoup)
    # Method 3: regular expression
    re_pattern = re.compile(r'<div price="(.*?)">', re.S)
    price_re = re.findall(re_pattern, html)
    print('price extracted with a regular expression:', price_re)
# Extract the text of the <li> items under the first inner <div>
def ul_li():
    # Method 1: XPath
    html_etree = etree.HTML(html)
    # ul_li = html_etree.xpath('//div/div[1]/ul/child::*/text()')  # child:: selects the node's child elements
    # ul_li = html_etree.xpath('//div/div[1]/ul/li/text()')
    # ul_li = html_etree.xpath("//div[@id='testid']/preceding::div/ul/li/text()")  # preceding:: selects every node before the current one in document order; handy for anchoring on a known element
    ul_li = html_etree.xpath("//div[@id='testid']/preceding::li/text()")  # preceding:: avoids the ambiguity caused by repeated node names
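    # The only <li> elements before div#testid are the three in the first <ul>,
    # so this should print their text (expected result inferred from the HTML above).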
    print('li text under the first <ul>, extracted with XPath:', ul_li)
    # Method 2: BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    # First BeautifulSoup approach
    # ul_li = soup.select('ul')[0].select('li')
    # ul_li = [i.get_text() for i in ul_li]
    # Another BeautifulSoup approach
    ul_li = soup.div.div.get_text()
    ul_li = ul_li.strip()  # strip leading and trailing whitespace
    ul_li = ul_li.split('\n')  # split the string on newlines
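    # Note: get_text() keeps the newlines between the <li> tags, so strip() plus
    # split('\n') should yield one list item per <li>; this relies on the sample
    # HTML string literal above having no indentation.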
    print('ul_li extracted with BeautifulSoup:', ul_li)
    # Method 3: regular expression
    re_pattern = re.compile(r'<div price="99\.8">.*?<div>.*?<ul>.*?<li>(.*?)</li>.*?<li>(.*?)</li>.*?<li>(.*?)</li>', re.S)
    re_ul_li = re.findall(re_pattern, html)
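    # With more than one capture group, findall() returns a list of tuples,
    # one tuple per match, holding the three <li> texts.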
    print('ul_li extracted with a regular expression:', re_ul_li)
def first_id():
    # Method 1: XPath
    html_etree = etree.HTML(html)
    first_id = html_etree.xpath('//div/div[2]/@id')
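    # XPath positions are 1-based, so div[2] is the second <div> child of the outer <div>,
    # i.e. the one with id='testid'. An attribute-based alternative (untested sketch):
    # first_id = html_etree.xpath("//div[@data-h='first']/@id")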
    print('first_id extracted with XPath:', first_id)
    # Method 2: BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    first_id = soup.select('div')[2].attrs['id']
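    # select('div') returns every <div> in document order, so index 2 is the third one,
    # the <div> with id='testid'. A lookup by attribute would also work (untested sketch;
    # data-h contains a hyphen, so it has to go through attrs= rather than a keyword):
    # first_id = soup.find('div', attrs={'data-h': 'first'})['id']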
    print('first_id extracted with BeautifulSoup:', first_id)
    # Method 3: regular expression
    re_pattern = re.compile(r'''<div id='(.*?)' data-h="first">''', re.S)
    first_id = re.findall(re_pattern, html)
    print('first_id extracted with a regular expression:', first_id)
def h2():
    # Method 1: XPath
    html_etree = etree.HTML(html)
    h2 = html_etree.xpath('//div/div[2]/h2/text()')
    print('h2 extracted with XPath:', h2)
    # Method 2: BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    # h2 = soup.select('h2')[0].get_text()
    h2 = soup.div.h2.get_text()
    print('h2 extracted with BeautifulSoup:', h2)
    # Method 3: regular expression
    re_pattern = re.compile(r'<h2>(.*?)</h2>', re.S)
    h2 = re.findall(re_pattern, html)
    print('h2 extracted with a regular expression:', h2)
def main():
    title()
    price()
    ul_li()
    first_id()
    h2()
if __name__ == '__main__':
    main()