一、常用xpath表達(dá)式
#找到class屬性值為btn的div標(biāo)簽
//div[@class="btn"]
#找到class屬性值為song的div的直系子標(biāo)簽ul下的第二個(gè)子標(biāo)簽li下的直系子標(biāo)簽a
//div[@class="song"]/ul/li[2]/a
#找到href屬性值為空且class屬性值為music的a標(biāo)簽
//a[@href="" and @class="music"]
//div[contains(@class, "ng")]
//div[starts-with(@class, "so")]
# /表示獲取某個(gè)標(biāo)簽下的文本內(nèi)容
# //表示獲取某個(gè)標(biāo)簽下的文本內(nèi)容和所有子標(biāo)簽下的文本內(nèi)容
//div[@class="song"]/p[1]/text()
//div[@class="music"]//text()
# 提取div里的所有文字，深層嵌套的全部文字
data = selector.xpath('//div[@id="test3"]')[0]
info = data.xpath('string(.)')
//div[@class="music"]//li[2]/a/@href
二、python使用xpath表達(dá)式的步驟
1.下載:pip install lxml
2.導(dǎo)包:from lxml import etree
3.將html文檔或者xml文檔轉(zhuǎn)換成一個(gè)etree對(duì)象，然后調(diào)用對(duì)象中的方法查找指定的節(jié)點(diǎn)
3.1 本地文件:tree = etree.parse(文件名)
tree.xpath("xpath表達(dá)式")
3.2 網(wǎng)絡(luò)數(shù)據(jù):tree = etree.HTML(網(wǎng)頁(yè)內(nèi)容字符串)
tree.xpath("xpath表達(dá)式")
三、案例
import requests
from lxml import etree

# Scrape one listing page of haoduanzi.com: collect every joke's title
# and its full text content.
url = 'http://www.haoduanzi.com/category-10_2.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
}
url_content = requests.get(url, headers=headers).text
# Parse the fetched HTML (url_content) with XPath.
tree = etree.HTML(url_content)
# Titles of all jokes on the current page.
title_list = tree.xpath('//div[@class="log cate10 auth1"]/h3/a/text()')
# One container <div> per joke; xpath() is called on each Element for
# a local (relative) query below.
ele_div_list = tree.xpath('//div[@class="log cate10 auth1"]')
text_list = []  # accumulated joke bodies, one string per joke
for ele in ele_div_list:
    # BUG FIX: the original assigned the xpath result to `text_list`,
    # clobbering the accumulator list, then appended to that per-joke
    # fragment list instead — the collected results were lost each
    # iteration. Use a separate local for the raw fragments.
    fragments = ele.xpath('./div[@class="cont"]//text()')
    # Merge all text fragments of this joke into a single string.
    # (The original used str(list), which embeds brackets/quotes.)
    text_list.append(''.join(fragments))
print(title_list)
print(text_list)
import requests
from lxml import etree

# Query zhipin.com (BOSS 直聘) for a user-supplied job keyword, then
# visit each posting's detail page and pull out its key fields.
job = input('enter a job:')
url = 'https://www.zhipin.com/job_detail/?'
param = {
    'query': job
}
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
}
# Step 1: fetch the search-results page for the keyword.
response = requests.get(url=url, params=param, headers=headers)
page_text = response.text
# Step 2: turn the raw HTML into an etree object so XPath can run on it.
listing_tree = etree.HTML(page_text)
# Step 3: one <li> Element per job posting in the result list.
# Only Element-type objects expose the .xpath() method.
li_list = listing_tree.xpath('//div[@class="job-list"]/ul/li')
for posting in li_list:
    # A relative ("./") query scoped to this posting's subtree; the
    # href is site-relative, so prepend the host to get a full URL.
    job_url = posting.xpath("./div/div[1]/h3/a/@href")[0]
    job_url = "https://www.zhipin.com" + job_url
    # Fetch the posting's detail page (a second, separate request).
    secondPage_text = requests.get(url=job_url, headers=headers).text
    detail_tree = etree.HTML(secondPage_text)
    # Extract the individual fields from the detail page.
    jobName = detail_tree.xpath('//div[@class="info-primary"]/div[2]/h1/text()')[0]
    salary = detail_tree.xpath('//div[@class="info-primary"]/div[2]/span/text()')[0].strip('\n\t')
    detail = detail_tree.xpath('//div[@class="info-primary"]/p//text()')[0]
    company = detail_tree.xpath('//div[@class="info-company"]/h3/a/text()')[0]
    jobDesc = detail_tree.xpath('//div[@class="detail-content"]/div[1]/div//text()')[0]
    # Persist the extracted fields (storage step).