from lxml import etree
text = '''
<li class="con_list_item default_list" data-index="1" data-positionid="4383913" data-salary="20k-25k" data-company="人人行(借貸寶)" data-positionname="Java工程師" data-companyid="61921" data-hrid="7134052" data-adword="0">
<div class="list_item_top">
<div class="position">
<div class="p_top">
<a class="position_link" target="_blank" data-index="1" data-lg-tj-id="8E00" data-lg-tj-no="
0102
" data-lg-tj-cid="4383913" data-lg-tj-abt="dm-csearch-useUserAllInterest|0">
<h3 style="max-width: 180px;">Java工程師</h3>
<span class="add">[<em>北京·朝陽(yáng)區(qū)</em>]</span>
</a>
<span class="format-time">2天前發(fā)布</span>
</div>
<div class="p_bot">
<div class="li_b_l">
<span class="money">20k-25k</span>
<!--<i></i>-->經(jīng)驗(yàn)3-5年 / 不限
</div>
</div>
</div>
<div class="company">
<div class="company_name">
<a target="_blank" data-lg-tj-id="8F00" data-lg-tj-no="
0102
" data-lg-tj-cid="61921" data-lg-tj-abt="dm-csearch-useUserAllInterest|0">人人行(借貸寶)</a><i class="company_mark"><span>該企業(yè)已上傳營(yíng)業(yè)執(zhí)照并通過資質(zhì)驗(yàn)證審核</span></i>
</div>
<div class="industry">
移動(dòng)互聯(lián)網(wǎng),金融 / C輪
</div>
</div>
<div class="com_logo">
<a target="_blank" data-lg-tj-id="8G00" data-lg-tj-no="
0102
" data-lg-tj-cid="61921" data-lg-tj-abt="dm-csearch-useUserAllInterest|0"><img src="http://www.lgstatic.com/thumbnail_120x120/i/image/M00/04/01/CgqKkVbFXXqAPo0fAAATqvTo2-I592.png" alt="人人行(借貸寶)" width="60" height="60"></a>
</div>
</div>
<div class="list_item_bot">
<div class="li_b_l">
<span>年底雙薪</span>
<span>節(jié)日禮物</span>
<span>績(jī)效獎(jiǎng)金</span>
<span>崗位晉升</span>
</div>
<div class="li_b_r">“彈性工作,高并發(fā)系統(tǒng),團(tuán)隊(duì)大牛多,工作氛”</div>
</div>
</li>
<li class="con_list_item default_list" data-index="2" data-positionid="4477797" data-salary="15k-25k" data-company="拉勾網(wǎng)" data-positionname="java開發(fā)工程師" data-companyid="147" data-hrid="2848224" data-adword="0">
<div class="list_item_top">
<div class="position">
<div class="p_top">
<a class="position_link" target="_blank" data-index="2" data-lg-tj-id="8E00" data-lg-tj-no="
0103
" data-lg-tj-cid="4477797" data-lg-tj-abt="dm-csearch-useUserAllInterest|0">
<h3 style="max-width: 180px;">java開發(fā)工程師</h3>
<span class="add">[<em>北京·海淀區(qū)</em>]</span>
</a>
<span class="format-time">3天前發(fā)布</span>
</div>
<div class="p_bot">
<div class="li_b_l">
<span class="money">15k-25k</span>
<!--<i></i>-->經(jīng)驗(yàn)3-5年 / 本科
</div>
</div>
</div>
<div class="company">
<div class="company_name">
<a target="_blank" data-lg-tj-id="8F00" data-lg-tj-no="
0103
" data-lg-tj-cid="147" data-lg-tj-abt="dm-csearch-useUserAllInterest|0">拉勾網(wǎng)</a><i class="company_mark"><span>該企業(yè)已上傳營(yíng)業(yè)執(zhí)照并通過資質(zhì)驗(yàn)證審核</span></i>
</div>
<div class="industry">
企業(yè)服務(wù),招聘 / D輪及以上
</div>
</div>
<div class="com_logo">
<a target="_blank" data-lg-tj-id="8G00" data-lg-tj-no="
0103
" data-lg-tj-cid="147" data-lg-tj-abt="dm-csearch-useUserAllInterest|0"><img src="http://www.lgstatic.com/thumbnail_120x120/i/image/M00/76/40/Cgp3O1g1TNOAB2yxAAA9bQUyc4g814.png" alt="拉勾網(wǎng)" width="60" height="60"></a>
</div>
</div>
<div class="list_item_bot">
<div class="li_b_l">
<span>分布式</span>
<span>架構(gòu)</span>
</div>
<div class="li_b_r">“技術(shù)挑戰(zhàn),成長(zhǎng)空間”</div>
</div>
</li>
<li class="con_list_item default_list" data-index="3" data-positionid="3155873" data-salary="4k-8k" data-company="小肚皮App" data-positionname="實(shí)習(xí)JAVA/PHP開發(fā)" data-companyid="41030" data-hrid="795835" data-adword="0">
<div class="list_item_top">
<div class="position">
<div class="p_top">
<a class="position_link" target="_blank" data-index="3" data-lg-tj-id="8E00" data-lg-tj-no="
0104
" data-lg-tj-cid="3155873" data-lg-tj-abt="dm-csearch-useUserAllInterest|0">
<h3 style="max-width: 180px;">實(shí)習(xí)JAVA/PHP開發(fā)</h3>
<span class="add">[<em>北京·大望路</em>]</span>
</a>
<span class="format-time">10:13發(fā)布</span>
</div>
<div class="p_bot">
<div class="li_b_l">
<span class="money">4k-8k</span>
<!--<i></i>-->經(jīng)驗(yàn)應(yīng)屆畢業(yè)生 / 本科
</div>
</div>
</div>
<div class="company">
<div class="company_name">
<a target="_blank" data-lg-tj-id="8F00" data-lg-tj-no="
0104
" data-lg-tj-cid="41030" data-lg-tj-abt="dm-csearch-useUserAllInterest|0">小肚皮App</a><i class="company_mark"><span>該企業(yè)已上傳營(yíng)業(yè)執(zhí)照并通過資質(zhì)驗(yàn)證審核</span></i>
</div>
<div class="industry">
移動(dòng)互聯(lián)網(wǎng) / A輪
</div>
</div>
<div class="com_logo">
<a target="_blank" data-lg-tj-id="8G00" data-lg-tj-no="
0104
" data-lg-tj-cid="41030" data-lg-tj-abt="dm-csearch-useUserAllInterest|0"><img src="http://www.lgstatic.com/thumbnail_120x120/i/image/M00/65/83/CgpFT1mlFSWAJrrnAACWcSWNiu8425.png" alt="小肚皮App" width="60" height="60"></a>
</div>
</div>
<div class="list_item_bot">
<div class="li_b_l">
<span>游戲</span>
<span>后端開發(fā)</span>
</div>
<div class="li_b_r">“轉(zhuǎn)正機(jī)會(huì),逗逼氛圍,美女如云,餐補(bǔ)”</div>
</div>
</li>
'''
# Shared setup: parse the sample HTML string once into an element tree.
# The XPath examples in this file all run their queries against this tree.
htmlElement = etree.HTML(text)
# 1. Get all <li> tags.
# "//li" selects every <li> element anywhere in the document.
lis = htmlElement.xpath("//li")
for li in lis:
    # tostring() returns bytes; decode so the non-ASCII text prints readably.
    print(etree.tostring(li, encoding="utf-8").decode("utf-8"))
# 2. Get the second <li> tag.
# The part inside [] is a predicate, i.e. a filter condition. [2] keeps the
# <li> that is the second <li> child of its parent (XPath positions start at 1).
# xpath() returns a list, so [0] takes the single match.
li = htmlElement.xpath("//li[2]")[0]
print(etree.tostring(li, encoding="utf-8").decode("utf-8"))
# 3. Get the <li> tags that come after the second one.
# The predicate [position()>2] keeps the third <li> onward; position() can
# also be compared with <, <= and >=.
lis = htmlElement.xpath("//li[position()>2]")
for li in lis:
    print(etree.tostring(li, encoding="utf-8").decode("utf-8"))
# 4. Get all <div> elements whose class is "industry".
# Inside a predicate, @name refers to an attribute; this keeps only the
# elements whose class attribute equals the given value exactly.
divs = htmlElement.xpath("//div[@class = 'industry']")
for div in divs:
    print(etree.tostring(div, encoding="utf-8").decode("utf-8"))
# 5. Get all <li> tags whose class contains "_list".
# contains() is a substring match: the element is kept as long as the
# attribute value contains the given string.
lis = htmlElement.xpath("//li[contains(@class,'_list')]")
for li in lis:
    print(etree.tostring(li, encoding="utf-8").decode("utf-8"))
# 6. Get the value of the href attribute of every <a> tag.
# "//a/@href" returns the attribute values themselves. Do not use
# "//a[@href]" for this: that selects the <a> elements that have an href
# attribute, not the href values.
# NOTE: the <a> tags in the sample HTML carry no href attribute, so with that
# input this loop prints nothing.
alist = htmlElement.xpath("//a/@href")
for a in alist:
    print(a)
# 7. Get the <div> elements whose class is "industry" or "li_b_r".
# Conditions inside a predicate can be combined with "or"; "and" works the
# same way.
divs = htmlElement.xpath("//div[@class = 'industry' or @class = 'li_b_r']")
for div in divs:
    print(etree.tostring(div, encoding="utf-8").decode("utf-8"))
# 8. Extract the text fields of every job posting.
# When querying below an already-selected element, start the path with a dot
# (".//"): a path that starts with "//" searches the whole document again
# instead of only the current element's subtree.
# A trailing "/text()" returns the text of the matched tags.
lis = htmlElement.xpath("//li")
for li in lis:
    # xpath() returns a list; [0] takes the first (here: only) match.
    position = li.xpath(".//h3/text()")[0]
    salary = li.xpath(".//span[@class = 'money']/text()")[0]
    address = li.xpath(".//span[@class = 'add']/em/text()")[0]
    # Kept as a list: one entry per <span> inside the posting's tag row.
    advantages = li.xpath(".//div[@class ='list_item_bot']/div[@class='li_b_l']/span/text()")
    temp = {
        # NOTE(review): "positon" looks like a typo for "position"; the key is
        # kept unchanged because it is part of the printed output.
        "positon": position,
        "salary": salary,
        "address": address,
        "advantage": advantages,
    }
    print(temp)