使用XPath分析一下比較復(fù)雜的貼吧
先上代碼
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape authors, replies, timestamps and images from one Tieba thread page."""
from lxml import etree
import requests
import sys

# Python 2 only: force the default codec to UTF-8 so implicit str/unicode
# conversions don't raise. reload()/setdefaultencoding do not exist on
# Python 3, so guard them behind a version check.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding("utf-8")

url = 'https://tieba.baidu.com/p/5098845608?pn=1'
html = requests.get(url, timeout=5)  # fail fast instead of hanging forever
selector = etree.HTML(html.text)
img_all_list = []  # collected image-URL lists, one list per post
# One matched div per post; the trailing space in the class value is required
# to match the page's exact class attribute.
content_field = selector.xpath('//div[@class="l_post l_post_bright j_l_post clearfix "]')
for each in content_field[1:]:
    # Ad blocks injected into the thread carry no author link; skip them
    # instead of crashing on an empty xpath result.
    author_nodes = each.xpath('div[1]/ul/li[@class="d_name"]/a/text()')
    if not author_nodes:
        continue
    author = author_nodes[0]
    content = each.xpath('div[2]/div[@class="p_content "]/cc/div/text()')
    # The reply timestamp is always the last span inside post-tail-wrap.
    time = each.xpath('div[2]/div[@class="core_reply j_lzl_wrapper"]/div[1]/div[@class="post-tail-wrap"]/span[last()]/text()')[0]
    img_list = each.xpath('div[2]/div[@class="p_content "]/cc/div//@src')
    img_all_list.append(img_list)
    print(author)
    print("\n".join(content).strip())
    print(time)
    print('\n')
i = 0
for img_list in img_all_list:  # download every collected image
    for img_url in img_list:
        pic = requests.get(img_url, timeout=5)
        # Name files 1.jpg, 2.png, ... reusing the URL's last 4 characters
        # as the extension (assumes a 3-letter extension - fragile but kept).
        filename = str(i + 1) + img_url[-4:]
        with open(filename, 'wb') as fp:  # 'with' guarantees the handle is closed
            fp.write(pic.content)
        i += 1
結(jié)果如下(當(dāng)然還有相親者圖片0-0):
如果熟悉XPath語法,會很容易爬取這個網(wǎng)頁的內(nèi)容。因?yàn)樯弦黄獌?nèi)容分析過使用XPath抓取網(wǎng)頁信息,這一篇就不詳細(xì)再說,原理都一樣。而這一次純屬拿來玩玩~
需要注意的兩個地方
1.last()的使用
time = each.xpath('div[2]/div[@class="core_reply j_lzl_wrapper"]/div[1]/div[@class="post-tail-wrap"]/span[last()]/text()')[0]
last()表示返回最後一個元素。在本例的源碼中,我們可以看到
對於同一個標(biāo)簽'div[@class="post-tail-wrap"]/span',回帖時間有時候出現(xiàn)在第三個span標(biāo)簽,有時候出現(xiàn)在第四個span標(biāo)簽,但共同點(diǎn)都是處在最後一個span標(biāo)簽中,所以採用last()值
2. Xpath獲取圖片
img_list = each.xpath('div[2]/div[@class="p_content "]/cc/div//@src')
“*//@src”用于獲取所有圖片鏈接
爬取前10頁信息
# -*- coding: utf-8 -*-
"""Scrape the first pages of a Baidu Tieba thread and save the posts to a file."""
import requests
from lxml import etree
import sys

# Python 2 only: make implicit str/unicode conversions use UTF-8.
# reload()/setdefaultencoding do not exist on Python 3, so guard them.
# (The original coding cookie '#-*_coding:utf8-*-' was malformed per PEP 263.)
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding('utf-8')
def get_Allurl(base_url='https://tieba.baidu.com/p/5098845608?pn=', pages=10):
    """Build the URLs of the first *pages* pages of a thread.

    Backward compatible: called with no arguments it returns the same
    10 URLs as before; the thread URL and page count are now parameters.

    Args:
        base_url: thread URL ending with the ``pn=`` query parameter.
        pages: number of pages to generate (default 10).

    Returns:
        List of page URLs for page numbers 1..pages.
    """
    return [base_url + str(page) for page in range(1, pages + 1)]
def spider(url):
    """Scrape one thread page and return its posts.

    Args:
        url: URL of a single Tieba thread page.

    Returns:
        List of dicts with keys 'author', 'reply' and 'time'.
    """
    info_list = []
    # Abort the request if the server does not respond within 5 seconds.
    html = requests.get(url, timeout=5)
    selector = etree.HTML(html.text)
    # One div per post; the trailing space in the class attribute is required.
    reply = selector.xpath('//div[@class="l_post l_post_bright j_l_post clearfix "]')
    for each in reply:
        author = each.xpath('div[1]/ul/li[@class="d_name"]/a/text()')
        if len(author) == 0:
            # Ad blocks carry no author link - skip them.
            continue
        content = each.xpath('div[2]/div[@class="p_content "]/cc/div/text()')
        # The timestamp is always the last span inside post-tail-wrap; guard
        # the [0] access so a malformed post cannot raise IndexError.
        posted = each.xpath('div[2]/div[@class="core_reply j_lzl_wrapper"]/div[1]/div[@class="post-tail-wrap"]/span[last()]/text()')
        if not posted:
            continue
        info = {
            'author': author[0],
            'reply': "\n\t".join(content).strip(),
            'time': posted[0],
        }
        info_list.append(info)
    return info_list
def saveinfo(classinfo):  # save the scraped posts
    """Write every scraped post to 'tiebainfo.txt' (overwriting the file).

    Args:
        classinfo: list of per-page post lists, as produced by spider();
            each post is a dict with keys 'author', 'reply' and 'time'.
    """
    # Bug fix: the original called 'f.close' without parentheses, so the
    # file was never explicitly closed or flushed. 'with' guarantees both.
    with open('tiebainfo.txt', 'w') as f:
        for info_all in classinfo:
            for each in info_all:
                f.write('Author: ' + each['author'] + '\n')
                f.write('Content:\n\t' + each['reply'] + '\n')
                f.write('Time: ' + each['time'] + '\n\n')
if __name__ == '__main__':
    # Scrape every page, collect one post-list per page, then save them all.
    classinfo = []
    all_url = get_Allurl()
    for url in all_url:
        # print() call is valid on both Python 2 and 3 (the original
        # 'print u...' statement is a syntax error on Python 3).
        print(u'正在處理:' + url)
        info_list = spider(url)
        classinfo.append(info_list)
    saveinfo(classinfo)
就到這吧