import csv
import requests
from bs4 import BeautifulSoup
import time
from requests.exceptions import RequestException
from multiprocessing.dummy import Pool as ThreadPool
# Site root; relative author links scraped from the listing pages are joined
# onto this prefix.
simple_book = 'http://www.reibang.com'

# Browser-like User-Agent sent with every request.
user_agent = (
    'Mozilla/5.0 (Windows NT 10.0; WOW64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/59.0.3071.115 Safari/537.36'
)
headers = {'User-Agent': user_agent}

# Recommended-authors listing; the page number is appended to this prefix.
base_url = simple_book + '/recommendations/users?page='

# Neither of the two names below is referenced by the active code visible in
# this file; they are kept so the module's namespace stays unchanged.
recommend_author = {}
jishu = 0
def download(url, timeout=10):
    """Fetch *url* with the module's browser-like request headers.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection, which would hang the whole crawl.

    Returns:
        The ``requests.Response`` on success, or ``None`` when the request
        raised a ``RequestException`` (the error is printed, not re-raised).
        Callers must check for ``None`` before using the result.
    """
    try:
        return requests.get(url, headers=headers, timeout=timeout)
    except RequestException as e:
        print("The problem is {}".format(e))
        return None
# Suffix appended to a user's home URL to address one page of that user's
# follower list; the page number is concatenated after it, e.g.
# http://www.reibang.com/users/3aa040bf0610/followers?page=2
followStr = '/followers?page='
def get_User_info(url):
    """Scrape one follower-list page and append each listed user to the CSV.

    For every ``div.info`` entry that carries an ``a.name`` link, one row
    ``[name, follow, fan, article, word]`` is written through the module-level
    ``not_recommend_csvWrite`` writer.

    Args:
        url: URL of a single follower-list page.

    Returns:
        None. Request failures are printed and the page is skipped.

    NOTE(review): this function is mapped over a thread pool by
    ``get_not_recommend_author_info`` and all workers share one csv writer;
    confirm that concurrent ``writerow`` calls are acceptable here.
    """
    try:
        # A timeout keeps one stalled connection from blocking a worker forever.
        r = requests.get(url, headers=headers, timeout=10)
    except RequestException as e:
        print("The problem is {}".format(e))
        return

    soup = BeautifulSoup(r.text, 'lxml')
    for user in soup.find_all('div', class_='info'):
        name = user.find('a', class_='name')
        if name is None:
            continue

        meta = user.find_all('div', class_='meta')
        if len(meta) < 2:
            # Unexpected markup: skip this entry instead of raising
            # IndexError, which would abort the whole pool.map() call.
            continue

        # The first meta block holds three sibling <span> counters. The
        # original read span 2 for both "fan" and "article", so the article
        # column duplicated the fans column; the article count is span 3.
        spans = [
            meta[0].select('span:nth-of-type({})'.format(position))
            for position in (1, 2, 3)
        ]
        if not all(spans):
            continue
        follow, fan, article = (found[0].text.strip() for found in spans)

        word = meta[1].text.strip().replace('\n', '')
        not_recommend_csvWrite.writerow([name.text, follow, fan, article, word])
def get_not_recommend_author_info(url, name, fan_num, per_page=9, max_pages=99):
    """Crawl the follower-list pages of one author with a small thread pool.

    The original computed the number of follower pages but then ignored it and
    always requested pages 1..99. The page count is now derived from
    ``fan_num`` and only capped at the former fixed limit.

    Args:
        url: Home URL of the author whose followers are crawled.
        name: Author name, used only for progress output.
        fan_num: Follower count (int, or a string accepted by ``int()``).
        per_page: Followers assumed per list page (the original hard-coded 9).
        max_pages: Upper bound on pages requested (the original always
            requested 99).

    Returns:
        None. Rows are written by ``get_User_info`` for each page.

    Raises:
        ValueError: if ``fan_num`` cannot be converted to ``int``.
    """
    fan_num = int(fan_num)
    print("{}下請求的用戶頁面!".format(name))
    print(name, url, '粉絲數(shù):', fan_num)

    # Ceiling division: a partially filled last page still needs a request.
    page_count = min((fan_num + per_page - 1) // per_page, max_pages)
    if page_count <= 0:
        # No followers: nothing to fetch, so do not spin up a pool at all.
        return

    page_urls = [url + followStr + str(page) for page in range(1, page_count + 1)]
    for page_url in page_urls:
        print(page_url)

    pool = ThreadPool(4)
    try:
        pool.map(get_User_info, page_urls)
    finally:
        # One pool is created per author; release its worker threads so they
        # do not accumulate over the course of the crawl.
        pool.close()
        pool.join()
def get_recommend_author_info():
    """Walk the recommended-authors listing and record every author found.

    Listing pages are requested in order starting at page 1 until a page lacks
    the ``div.col-xs-8`` marker (or cannot be downloaded). For each author on
    a page, the author's home page is fetched, five counters are read from
    ``div.info > ul > li``, one row is written through the module-level
    ``recommend_csvWrite`` writer, and the author's followers are crawled via
    ``get_not_recommend_author_info``.

    Returns:
        None.
    """
    page_index = 1
    while True:
        r = download(base_url + str(page_index))
        print("第{}個(gè)請求頁面!".format(page_index))
        if r is None:
            # download() already printed the error. Stop rather than crash on
            # ``r.text`` or retry the same page forever.
            break

        soup = BeautifulSoup(r.text, 'lxml')
        # A page without this element has no recommended authors: end of list.
        if not soup.find('div', class_='col-xs-8'):
            break

        author_names = soup.find_all('h4', class_='name')
        # CSS selector: the link that is a direct child of each "wrap" div.
        author_links = soup.select('div[class~=wrap] > a')

        # zip() pairs names with links and stops at the shorter list, so a
        # mismatch between the two cannot raise IndexError.
        for name_tag, link in zip(author_names, author_links):
            url = simple_book + link['href'].strip()  # author's home page
            author_page = download(url)
            if author_page is None:
                continue  # request failed (already reported); skip author

            author_soup = BeautifulSoup(author_page.text, 'lxml')
            stats = author_soup.select('div[class~=info] > ul > li')
            counters = [item.select('p') for item in stats[:5]]
            if len(counters) < 5 or not all(counters):
                continue  # unexpected markup; skip instead of IndexError

            # Order on the page: following, fans, articles, words, likes.
            follow_num, fans_num, article_num, word_num, getLike_num = (
                found[0].text for found in counters
            )
            name = name_tag.text.strip()
            recommend_csvWrite.writerow(
                [name, url, follow_num, fans_num, article_num, word_num, getLike_num]
            )
            get_not_recommend_author_info(url, name, fans_num)

        page_index += 1
        time.sleep(1)  # pause between listing pages
# Script entry: open both CSV outputs, run the crawl, report elapsed time.
start = time.time()

# ``with`` guarantees both files are flushed and closed even if the crawl
# raises; the original never closed them. The writers stay module-level
# names because the scraping functions look them up as globals.
with open('recommend_author.csv', 'a+', newline='', encoding='utf-8') as recommendFile, \
        open('User5.csv', 'a+', newline='', encoding='utf-8') as notRecommendFile:
    recommend_csvWrite = csv.writer(recommendFile)
    not_recommend_csvWrite = csv.writer(notRecommendFile)

    # The files are opened in append mode, so write a header row only when
    # the file is still empty; otherwise every re-run adds a duplicate header.
    recommendFile.seek(0, 2)  # 2 == SEEK_END
    if recommendFile.tell() == 0:
        recommend_csvWrite.writerow(['作者名', '首頁鏈接', '關(guān)注人數(shù)', '粉絲', '文章', '字?jǐn)?shù)', '收獲喜歡'])

    notRecommendFile.seek(0, 2)
    if notRecommendFile.tell() == 0:
        # get_User_info writes five values per row (name, follow, fan,
        # article, word); the original header named only four of them.
        not_recommend_csvWrite.writerow(['用戶名', '關(guān)注數(shù)', '粉絲數(shù)', '文章數(shù)', '字數'])

    get_recommend_author_info()

end = time.time()
print("總耗時(shí) %0.3f" % (end - start))
#
# Screenshots that accompanied the original article (file names only):
#   推薦作者1.png
#   推薦作者2.png
#   用戶1.png
#   用戶2.png