Douban Notes
First work out the page structure, then process it with BeautifulSoup.
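Before writing the full loop, it helps to look at the HTML the note list actually returns. This is only a throwaway sketch for inspecting the structure; the URL and the class names are the same ones the script below relies on:

# -*- coding: utf-8 -*-
# Throwaway sketch: fetch the note list and print one header block to see where the link sits
import requests
from bs4 import BeautifulSoup

page = requests.get("https://www.douban.com/people/petitespot/notes?start=30&type=note").text
soup = BeautifulSoup(page, "html.parser")
header = soup.find("div", attrs={"class": "note-header-container"})
if header is not None:
    print(header.prettify())  # the <a class="j a_unfolder_n"> inside holds the note URL

The full script: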
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

i = 1  # counter used in the output filenames
name = "https://www.douban.com/people/petitespot/notes?start=30&type=note"  # note-list URL
# To move to the next page, change start=30 to 40, 50, ...
a = requests.get(name).text
soup = BeautifulSoup(a, "html.parser")
for url in soup.find_all("div", attrs={"class": "note-header-container"}):
    res = url.find(class_='j a_unfolder_n')
    link = res['href']  # URL of the individual note
    a = requests.get(link).text
    soup1 = BeautifulSoup(a, "html.parser")  # second soup, for the note page itself
    title = soup1.find("meta", attrs={"property": "og:title"})
    title = title['content']  # note title
    res = soup1.find("div", attrs={"id": "link-report"})
    st = " "
    for p in res.find_all('p'):  # if the page structure is clean, the body is just <p> tags
        st += '\n'
        st += p.get_text()
    if st == " ":  # otherwise fall back to the raw HTML and turn <br> into newlines
        st = str(res)
        st = st.replace("<br/>", "\n")
        string = str(i) + title + '.txt'
        fp = open(string, 'wb')
        fp.write(st)
        fp.close()
        i = i + 1
    else:
        string = str(i) + title + '.txt'
        fp = open(string, 'wb')
        fp.write(st.encode("utf-8"))
        fp.close()
        i = i + 1
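To avoid editing start=30 by hand for every page (see the comment near the top of the script), the page URLs can also be generated in a small loop. A sketch, assuming the list advances in steps of 10 as the 30/40/50 comment suggests:

# Sketch: generate the list-page URLs instead of editing start= manually
# (assumes steps of 10; adjust the upper bound to the real number of notes)
base = "https://www.douban.com/people/petitespot/notes?start=%d&type=note"
for start in range(0, 100, 10):
    name = base % start   # feed each of these into the scraping loop above
    print(name)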
Sina Blog
Sina's encoding is particularly odd: the .decode().encode() fixes found online still produced mojibake. In the end I stumbled on a working solution more or less by chance:
http://blog.chinaunix.net/uid-13869856-id-5747417.html
https://segmentfault.com/q/1010000000665231
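As far as I understand from those two pages, the culprit is that requests falls back to ISO-8859-1 whenever the HTTP headers do not declare a charset, while Sina only declares it inside the HTML. A quick way to see the mismatch, using the same list URL as below:

# Sketch: compare the encoding requests guessed with what the page itself declares
import requests

req = requests.get('http://blog.sina.com.cn/s/articlelist_1270505543_0_1.html')
print(req.encoding)                                         # usually ISO-8859-1 (header fallback)
print(requests.utils.get_encodings_from_content(req.text))  # charset declared in the HTML <meta>
print(req.apparent_encoding)                                # chardet's guess, used as a last resort

The full download script: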
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

i = 1  # counter used in the output filenames
url = 'http://blog.sina.com.cn/s/articlelist_1270505543_0_1.html'
req = requests.get(url)
# requests falls back to ISO-8859-1 when the HTTP headers omit the charset,
# so take the encoding declared inside the page content instead
if req.encoding == 'ISO-8859-1':
    encodings = requests.utils.get_encodings_from_content(req.text)
    if encodings:
        encoding = encodings[0]
    else:
        encoding = req.apparent_encoding
else:
    encoding = req.encoding
encode_content = req.content.decode(encoding, 'replace').encode('utf-8', 'replace')
soup = BeautifulSoup(encode_content, 'html5lib')
# must be html5lib, otherwise information is lost
for url in soup.find_all("p", attrs={"class": "atc_main SG_dot"}):
    if i <= 36:  # skip the first 36 articles in the list
        i = i + 1
        continue
    res = url.find(target='_blank')
    link = res['href']
    a = requests.get(link)
    # same trick on each article page: trust the charset declared in the content
    encodings = requests.utils.get_encodings_from_content(a.text)
    encoding = encodings[0] if encodings else a.apparent_encoding
    content = a.content.decode(encoding, 'replace').encode('utf-8', 'replace')
    soup1 = BeautifulSoup(content, 'html5lib')
    title = soup1.title.string
    res = soup1.find("div", attrs={"id": "sina_keyword_ad_area2"})
    st = str(res)
    string = str(i) + title + '.html'
    fp = open(string, 'wb')
    fp.write(st)
    fp.close()
    i = i + 1
Because the Sina post bodies contain a lot of HTML markup, I saved them as .html files instead of converting them to plain .txt.
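If plain text is ever needed later, the saved files can be re-parsed and flattened with get_text(). A rough sketch; the filename here is made up and would be whatever the script above actually produced:

# Sketch: flatten one saved .html file into plain text (the filename is hypothetical)
from bs4 import BeautifulSoup

with open('37title.html', 'rb') as fp:
    soup = BeautifulSoup(fp.read(), 'html5lib')
text = soup.get_text('\n')              # tag boundaries become newlines
with open('37title.txt', 'wb') as fp:
    fp.write(text.encode('utf-8'))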