We want to parse the following HTML and extract the text inside the div:
<div id="post_content_91765531755" class="d_post_content j_d_post_content clearfix"> 楼主现在iOS10么</div>
That is, locate the div whose @id starts with post_content and whose @class is d_post_content j_d_post_content clearfix.
>>> import requests
>>> from lxml import etree
>>> html = requests.get('http://tieba.baidu.com/p/4609646212')
>>> content = etree.HTML(html.text)
>>> content = content.xpath('//div[starts-with(@id, "post_content") and contains(@class, "d_post_content j_d_post_content clearfix")]')
- starts-with(@attr, "xxxx"): matches elements whose attr attribute value starts with xxxx.
- contains(@attr, "xxxx"): matches elements whose attr attribute value contains the substring xxxx (a substring match, not an exact match).
- and: the element passes the filter only when both predicates are true.
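To see these predicates in action without hitting the live page, here is a minimal, self-contained sketch; the inline snippet and variable names are my own illustration, not part of the original:

from lxml import etree

# A small inline document standing in for the real Tieba page.
snippet = '''
<div>
  <div id="post_content_1" class="d_post_content j_d_post_content clearfix">first reply</div>
  <div id="post_content_2" class="d_post_content j_d_post_content clearfix">second reply</div>
  <div id="sidebar" class="d_post_content">not a reply</div>
</div>
'''

tree = etree.HTML(snippet)
# starts-with filters on the id prefix; contains filters on the class string;
# "and" requires both predicates to hold, so the sidebar div is excluded.
divs = tree.xpath('//div[starts-with(@id, "post_content") '
                  'and contains(@class, "j_d_post_content")]')
for div in divs:
    print(div.xpath('string(.)'))   # -> "first reply", "second reply"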
Now scrape the posts in a Baidu Tieba thread, extracting three fields: reply date, reply author, and reply content:
# -*- coding:utf-8 -*-
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import json

def spider(url):
    # test_url = 'http://tieba.baidu.com/p/4609646212'
    html = requests.get(url)
    selector = etree.HTML(html.text)
    # Grab each reply block on the page.
    content_field = selector.xpath('//div[@class="l_post j_l_post l_post_bright "]')
    for each_content in content_field:
        # Each block carries a JSON blob with the author and the reply date.
        reply_info = json.loads(each_content.xpath('@data-field')[0])
        author = reply_info['author']['user_name']
        reply_time = reply_info['content']['date']
        content = each_content.xpath('div[@class="d_post_content_main"]/div/cc/'
                                     'div[starts-with(@id, "post_content") '
                                     'and contains(@class, "d_post_content j_d_post_content clearfix")]')
        # content = each_content.xpath('div[@class="d_post_content_main"]/div/cc/div[@class="d_post_content j_d_post_content clearfix"]')
        print(author)
        print(reply_time)
        print(content[0].xpath('string(.)').replace(' ', ''))
        print('----------------------------------------------------')
        reply = {}
        reply['reply_author'] = author
        reply['reply_content_time'] = reply_time
        reply['reply_content'] = content[0].xpath('string(.)').replace(' ', '')
        savetofile(reply)

def savetofile(reply):
    f.writelines(u'Reply time: ' + str(reply['reply_content_time']) + "\n")
    f.writelines(u'Reply author: ' + reply['reply_author'] + "\n")
    f.writelines(u'Reply content: ' + reply['reply_content'] + "\n")
    f.writelines("\n\n")

if __name__ == '__main__':
    pool = ThreadPool(4)  # a pool of 4 worker threads (multiprocessing.dummy uses threads, not processes)
    base_url = 'http://tieba.baidu.com/p/4609646212?pn='
    f = open("result.txt", "a", encoding='utf-8')  # results are appended to this file
    pages = [base_url + str(i) for i in range(1, 21)]
    pool.map(spider, pages)
    pool.close()
    pool.join()
    f.close()
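One caveat: all four worker threads write to the same file handle f, so the three lines of one reply can interleave with another's. A minimal way to serialize the writes is a shared threading.Lock; the lock and this variant of savetofile are my own sketch, not part of the original script:

import threading

write_lock = threading.Lock()  # hypothetical lock guarding the shared file handle

def savetofile(reply):
    # Only one thread at a time may append its record, so lines from
    # different replies cannot interleave in result.txt.
    with write_lock:
        f.writelines(u'Reply time: ' + str(reply['reply_content_time']) + "\n")
        f.writelines(u'Reply author: ' + reply['reply_author'] + "\n")
        f.writelines(u'Reply content: ' + reply['reply_content'] + "\n")
        f.writelines("\n\n")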
Note that the element locator must be precise, otherwise the script will raise an error: if the XPath matches nothing, indexing content[0] throws an IndexError.
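As a sketch of guarding against that failure mode, check the XPath result before indexing; the helper name and the skip-on-miss behaviour are my own assumption, not part of the original script:

def extract_reply_text(each_content):
    """Return the reply text for one content block, or None if the
    locator misses (e.g. an ad block or a changed page layout)."""
    content = each_content.xpath(
        'div[@class="d_post_content_main"]/div/cc/'
        'div[starts-with(@id, "post_content")]')
    if not content:
        # Returning None lets the caller skip this block instead of
        # crashing with an IndexError on content[0].
        return None
    return content[0].xpath('string(.)').replace(' ', '')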