The final output is a PDF; the conversion step is still unstable and needs improvement (one possible tweak is sketched after the listing).
import requests
from lxml import etree
from bs4 import BeautifulSoup
import pdfkit
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
KEYWORD = 'duhaoshu'
# Check whether the official account has been indexed by chuansong.me.
def judge(nameid):
    url = 'http://chuansong.me/account/' + nameid + '?start=' + str(0)
    response = requests.get(url, headers=headers)
    html = etree.HTML(response.text)
    heading = html.xpath('//div[contains(@class,"topic_page")]/h1/text()')
    if '404' in ''.join(heading):
        print('No articles from this official account have been indexed yet.')
    else:
        pass
def parse_one_page(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    data = soup.find_all('div', {'tabindex': '-1'})
    for html in data:
        detail_url = html.find_all('a')[0].get('href')
        full_url = 'http://chuansong.me' + detail_url
        parse_detail(full_url)
        break  # only the first article on the page is converted for now
def parse_detail(full_url):
    r = requests.get(full_url, headers=headers)
    data = etree.HTML(r.text)
    title = data.xpath('//h2[@class="rich_media_title"]/text()')[0].strip()
    publish_time = data.xpath('//em[@id="publish_time"]/text()')[0]
    # Convert the article URL into a PDF.
    path_wk = r'F:\Downloads\html-pdf\wkhtmltopdf\bin\wkhtmltopdf.exe'  # where wkhtmltopdf is installed
    config = pdfkit.configuration(wkhtmltopdf=path_wk)
    print(title, publish_time)
    pdfkit.from_url(full_url, title + '.pdf', configuration=config)
    print('Conversion succeeded!')
def main():
    for page in range(1):  # only the first results page (start=0) for now
        url = 'http://chuansong.me/account/' + KEYWORD + '?start={}'.format(page)
        parse_one_page(url)

if __name__ == '__main__':
    main()
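
The instability mentioned at the top usually shows up in two places with this kind of script: article titles that contain characters Windows forbids in file names, and pages whose images or scripts fail to load and make wkhtmltopdf give up. Below is a minimal sketch of how the conversion step in parse_detail could be hardened; the safe_filename helper, the convert_to_pdf wrapper, and the specific option values are my own assumptions, not part of the original script.

import re
import pdfkit

def safe_filename(title):
    # Hypothetical helper: replace characters Windows does not allow in file names.
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

def convert_to_pdf(full_url, title, config):
    # Assumed wkhtmltopdf flags, passed through pdfkit's options dict.
    options = {
        'encoding': 'UTF-8',              # keep Chinese text readable in the PDF
        'load-error-handling': 'ignore',  # do not abort when an image fails to load
        'javascript-delay': '3000',       # give lazy-loaded images time to render
        'quiet': '',
    }
    pdfkit.from_url(full_url, safe_filename(title) + '.pdf',
                    options=options, configuration=config)

parse_detail could then call convert_to_pdf(full_url, title, config) instead of calling pdfkit.from_url directly.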