遇到了編碼錯誤真的很蛋疼，卡了2個小時才解決!
參考文獻1，參考文獻2，參考文獻3
網站示例一:
# -*- coding: utf-8 -*-
import requests, re
from bs4 import BeautifulSoup
# Example 1: download every chapter linked from a table-of-contents page and
# append the chapter titles and paragraphs to novel.txt.

content = 'http://www.8shuw.com/BookReader/24-24559.html'  # table-of-contents page

# Link texts starting with "第" are kept; everything else is treated as a
# non-chapter entry and dropped.
CHAPTER_TITLE_RE = re.compile(u'^第.*$')
# Ad-slot call that shows up inside the paragraph text and is stripped out.
AD_SLOT_RE = re.compile(r'CNZZ_SLOT_RENDER\(\"\d{3,8}\"\)\;')

resp = requests.get(content)
resp.encoding = 'gbk'  # decode the response body as GBK
soup = BeautifulSoup(resp.text, 'lxml')
tbody = soup.find('table', {'border': '0', 'class': 'acss'}).find('tbody')
trs = tbody.find_all('a', {'itemprop': 'url', 'href': True})
trs = [tr for tr in trs if CHAPTER_TITLE_RE.match(tr.text) is not None]

# The `with` statement closes the file; no explicit close() is needed.
with open('novel.txt', 'w') as f:
    # Chapters are written in reverse list order (presumably the page lists
    # the newest chapter first -- confirm against the site). The position in
    # `trs` is carried along for the progress output instead of being
    # searched for with trs.index() on every iteration.
    for index, chapter in reversed(list(enumerate(trs))):
        f.write(chapter.text + '\n')
        resp = requests.get(chapter.get('href'))
        resp.encoding = 'gbk'
        soup = BeautifulSoup(resp.text, 'lxml')
        texts = soup.find('div', {'id': 'readtext', 'class': 'fontm'}).find_all('p')
        print(index, chapter.text)
        for line in texts:
            # Key to avoiding the encoding error: encode, then decode again
            # with errors ignored so undecodable bytes are skipped instead of
            # raising.
            cleaned = line.text.encode('gb18030').decode('gbk', 'ignore')
            f.write(AD_SLOT_RE.sub('', cleaned) + '\n')
網站示例二:
import requests, re
from bs4 import BeautifulSoup
# Example 2: download chapters from a table-of-contents page, starting at a
# fixed position in the chapter list, and append them to novel.txt.

content = 'http://www.piaotian.com/html/5/5896/'  # table-of-contents page; chapter hrefs are appended to it

# Link texts starting with "第" are kept; everything else is dropped.
CHAPTER_TITLE_RE = re.compile(u'^第.*$')
# Position in the chapter list at which downloading starts.
START_INDEX = 1980

resp = requests.get(content)
resp.encoding = 'gbk'  # decode the response body as GBK
soup = BeautifulSoup(resp.text, 'lxml')
tbody = soup.find('div', {'class': 'centent'})
trs = tbody.find_all('a', {'href': True})
trs = [tr for tr in trs if CHAPTER_TITLE_RE.match(tr.text) is not None]
print('Count:', len(trs))
# Preview the first chapter to be fetched; guarded so a shorter chapter list
# does not raise IndexError.
if START_INDEX < len(trs):
    print(trs[START_INDEX].text, 'href =', trs[START_INDEX].get('href'))

# The `with` statement closes the file; no explicit close() is needed.
with open('novel.txt', 'w') as f:
    # For a table of contents in reverse order, iterate reversed(trs) instead.
    for index, chapter in enumerate(trs[START_INDEX:], start=START_INDEX):
        print(index, chapter.text)
        f.write(chapter.text + '\n')  # chapter title
        resp = requests.get(content + chapter.get('href'))
        # NOTE(review): unlike the table-of-contents request, no
        # resp.encoding is set here (a 'gb18030' override was previously
        # tried and disabled) -- confirm the round trip below yields
        # readable text.
        soup = BeautifulSoup(resp.text, 'html.parser')  # parser differs from example 1
        # NOTE(review): this relies on <br> tags carrying text, which depends
        # on how the parser nests the content that follows them -- confirm
        # with the installed BeautifulSoup version.
        texts = soup.find_all('br')
        for line in texts:
            text = line.text
            if text:
                f.write(text.encode('utf-8').decode('gbk', 'ignore') + '\n')