Task 1
Target URL: http://www.doupoxs.com/doupocangqiong
Content to scrape: the body text of every chapter of Doupo Cangqiong (Battle Through the Heavens)
Approach: regular expressions & BeautifulSoup (get_info2(url), provided as an optional alternative)
import requests
from bs4 import BeautifulSoup
import re
import time

base_url = "http://www.doupoxs.com/doupocangqiong"
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3294.6 Safari/537.36'}
r = requests.get(base_url, headers=headers)
soup = BeautifulSoup(r.text, "lxml")
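One caveat with the request above: requests picks the encoding for r.text from the HTTP headers, and a wrong or missing charset declaration garbles the Chinese text. If that happens, an optional safeguard (an addition here, not part of the original script) is to let requests sniff the encoding from the body bytes before building the soup:

# If r.text comes back garbled, detect the charset from the body bytes
# instead of trusting the Content-Type header, then rebuild the soup.
r.encoding = r.apparent_encoding
soup = BeautifulSoup(r.text, "lxml")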
def get_url():
    # Collect the absolute URL of every chapter linked from the table of contents.
    detail_urls = soup.select("div.book_list a")
    url_list = []
    for url in detail_urls:
        detail_url = url.get("href")
        all_url = "http://www.doupoxs.com" + detail_url
        url_list.append(all_url)
    return url_list
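get_url() builds absolute URLs by string concatenation, which assumes every href is site-rooted (starts with /). A variant using urljoin from the standard library handles relative hrefs as well; a minimal sketch (get_url_safe is a name introduced here, not part of the original script):

from urllib.parse import urljoin

def get_url_safe():
    # urljoin resolves "/dpcq/1.html", "1.html", and full URLs alike
    # against the table-of-contents page.
    return [urljoin(base_url, a.get("href")) for a in soup.select("div.book_list a")]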
## Regular-expression version
def get_info(url):
    r = requests.get(url, headers=headers)
    # Capture everything between <p> tags; re.S lets '.' match newlines.
    dpcp_texts = re.findall('<p>(.*?)</p>', r.content.decode('utf-8'), re.S)
    for dpcp_text in dpcp_texts:
        f.write(dpcp_text + '\n')
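Note that the regex capture keeps whatever HTML sits inside each <p> element, which is why a raw <span> survives in the sample output further down. If plain text is wanted, a small cleanup helper (a sketch introduced here; strip_tags is not part of the original script) can be applied before writing:

def strip_tags(html_fragment):
    # Drop any inline tags left inside the captured paragraph text.
    return re.sub(r'<[^>]+>', '', html_fragment)

# Inside get_info, write f.write(strip_tags(dpcp_text) + '\n') instead.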
## BeautifulSoup version
def get_info2(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, "lxml")
    # Each paragraph of the chapter body sits in div.read_chapterDetail.
    infos = soup.select("div.read_chapterDetail p")
    for info in infos:
        dpcq_text = info.text
        f.write(dpcq_text + '\n')
if __name__ == "__main__":
    url_list = get_url()
    f = open("F:/doupo.txt", 'a+')
    for url in url_list:
        get_info(url)
        time.sleep(1)   # pause between requests to be gentle on the server
    f.close()
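If a request raises partway through the loop, f.close() is never reached. An equivalent main section using a with block (same behavior; the explicit utf-8 encoding is an assumption added here) closes the file in all cases:

if __name__ == "__main__":
    url_list = get_url()
    with open("F:/doupo.txt", 'a+', encoding='utf-8') as f:
        for url in url_list:
            get_info(url)
            time.sleep(1)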
A sample of the output:
天才一秒記住本站網(wǎng)站 www.doupoxs.com 中間是<span style="color:blue">斗破 拼音+小說 首字母</span> 連起來就是斗破小說,喜歡我就記住我吧!
第一章 隕落的天才
“斗之力,三段!”
望著測驗魔石碑上面閃亮得甚至有些刺眼的五個大字,少年面無表情,唇角有著一抹自嘲,緊握的手掌,因為大力,而導(dǎo)致略微尖銳的指甲深深的刺進(jìn)了掌心之中,帶來一陣陣鉆心的疼痛…
“蕭炎,斗之力,三段!級別:低級!”測驗魔石碑之上,一位中年男子,看了一眼碑上所顯示出來的信息,語氣漠然的將之公布了出來…
中年男子話剛剛脫口,便是不出意外的在人頭洶涌的廣場上帶起了一陣嘲諷的騷動。
“三段?嘿嘿,果然不出我所料,這個“天才”這一年又是在原地踏步!”
“哎,這廢物真是把家族的臉都給丟光了。”
“要不是族長是他的父親,這種廢物,早就被驅(qū)趕出家族,任其自生自滅了,哪還有機(jī)會待在家族中白吃白喝。”
Task 2
Target URL: https://www.qiushibaike.com/text/
Content to scrape: user ID, user level, user gender, the text of each joke, number of laughs, number of comments
Approach: regular expressions & BeautifulSoup (get_info2(url), provided as an optional alternative)
import requests
from bs4 import BeautifulSoup
import re
import time

def get_sex(sex_info):
    # Gender is encoded in the CSS class of the articleGender div.
    # Note: anything other than "manIcon" (including unexpected classes)
    # falls through to "女".
    if sex_info == "manIcon":
        return "男"
    else:
        return "女"
## Regular-expression version
def get_info(url):
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3294.6 Safari/537.36'}
    r = requests.get(url, headers=headers)
    print(r.status_code)
    # Pull the six fields out as parallel lists, one entry per post.
    ids = re.findall("<h2>(.*?)</h2>", r.text, re.S)
    try:
        levels = re.findall('<div class="articleGender .*?">(.*?)</div>', r.text, re.S)
        sex_infos = re.findall('<div class="articleGender (.*?)"', r.text, re.S)
    except Exception:
        levels = ['未知']
        sex_infos = ['未知']
    contents = re.findall('<div class="content">.*?<span>(.*?)</span>', r.text, re.S)
    laughs = re.findall('<span class="stats-vote">.*?<i class="number">(.*?)</i>', r.text, re.S)
    comments = re.findall('<span class="stats-comments">.*?<i class="number">(.*?)</i>', r.text, re.S)
    for id, level, sex_info, content, laugh, comment in zip(ids, levels, sex_infos, contents, laughs, comments):
        info = {
            'id': id,
            'level': level,
            'sex': get_sex(sex_info),
            'content': content,
            'laugh': laugh,
            'comment': comment
        }
        info_lists.append(info)
        try:
            f.write(info['id'] + '\n')
            f.write(info['level'] + '\n')
            f.write(info['sex'] + '\n')
            f.write(info['content'] + '\n')
            f.write(info['laugh'] + '\n')
            f.write(info['comment'] + '\n\n')
        except UnicodeEncodeError:
            ## Skip entries whose characters cannot be encoded when writing
            ## the txt file; not needed if you only print the results.
            pass
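One weakness of the regex version: the parallel re.findall lists stay aligned only while every post has every field. A post with no gender div shifts levels and sex_infos by one relative to ids, silently mispairing everything after it. A more robust sketch (parse_posts is introduced here, under the assumption that each post lives in its own div class="article" container, which the BeautifulSoup version below also relies on) cuts the page into per-post chunks first:

def parse_posts(html):
    # One chunk per post, so a missing field affects only its own post.
    chunks = re.split('<div class="article', html)[1:]
    posts = []
    for chunk in chunks:
        id_m = re.search('<h2>(.*?)</h2>', chunk, re.S)
        sex_m = re.search('class="articleGender (.*?)"', chunk, re.S)
        posts.append({
            'id': id_m.group(1).strip() if id_m else '未知',
            'sex': get_sex(sex_m.group(1)) if sex_m else '未知',
        })
    return posts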
## BeautifulSoup version
def get_info2(url):
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3294.6 Safari/537.36'}
    r = requests.get(url, headers=headers)
    print(r.status_code)
    soup = BeautifulSoup(r.text, "lxml")
    infos = soup.select("div.article")   # one div.article per post
    for info in infos:
        id = info.select("h2")[0].text.strip()
        try:
            level = info.select("div.articleGender")[0].text
            sex_info = info.select("div.articleGender")[0].get("class")[1]
            if sex_info == "womenIcon":
                sex = "女"
            elif sex_info == "manIcon":
                sex = "男"
            else:
                sex = "未知"
        except IndexError:
            # Anonymous posts carry no articleGender div at all.
            level = ""
            sex = "未知"
        content = info.select("div.content")[0].text.strip().replace('\u200b', '').replace('\xba', '')
        laugh = info.select("span.stats-vote i")[0].text
        comment = info.select("span.stats-comments i")[0].text.replace('\u2718', '')
        f.write(id + ' ' + level + ' ' + sex + ' ' + content + ' ' + laugh + ' ' + comment + '\n')
        #print(id, level, sex, content, laugh, comment)
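The space-separated line written above is awkward to parse back, since the joke text itself contains spaces. A CSV variant (a sketch; write_csv and the .csv path are additions, not part of the original script) keeps the six fields unambiguous:

import csv

def write_csv(rows, path="F:/qiushibaike.csv"):
    # rows is an iterable of (id, level, sex, content, laugh, comment) tuples;
    # the csv module quotes fields, so embedded spaces and commas survive.
    with open(path, 'a+', newline='', encoding='utf-8-sig') as out:
        csv.writer(out).writerows(rows)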
if __name__ == "__main__":
    # Pages 1-13 of the qiushibaike text channel.
    url_list = ["https://www.qiushibaike.com/text/page/{}/".format(i) for i in range(1, 14)]
    info_lists = []
    f = open("F:/qiushibaike.txt", 'a+')
    for url in url_list:
        get_info(url)
        time.sleep(1)   # pause between pages to avoid hammering the site
    f.close()
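Both tasks open a fresh connection for every page. Reusing a single requests.Session with a small retry loop (a sketch; fetch is introduced here and is not part of either script) keeps the connection and headers across requests and survives transient network errors:

import requests
import time

session = requests.Session()
session.headers.update({'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3294.6 Safari/537.36'})

def fetch(url, retries=3):
    # Retry transient failures with exponential backoff before giving up.
    for attempt in range(retries):
        try:
            r = session.get(url, timeout=10)
            r.raise_for_status()
            return r
        except requests.RequestException:
            time.sleep(2 ** attempt)
    return None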