requests_html常用方法
from requests_html import HTMLSession
# Fetch the cnblogs "recommend" news listing and print every headline link.
session = HTMLSession()
response = session.get('https://news.cnblogs.com/n/recommend')
# Select the <a> elements that are direct children of <h2 class="news_entry">.
for entry in response.html.find('h2.news_entry > a'):
    print(entry.text)
    print(entry.absolute_links)
加載js,下載漫畫圖片
%%time
# Imported here because this loop calls display()/Image(), while the file's
# own IPython import only appears further down.
from IPython.display import display, Image

# Request pages 1..15 of the chapter via the ?p= query parameter.
for i in range(1, 16):
    r = session.get('http://www.gugu5.com/n/14178/556176.html?p=%s' % i)
    # Execute the page's JavaScript before querying the DOM
    # (presumably the image src is filled in by script -- TODO confirm).
    r.html.render()
    src = r.html.find('#qTcms_pic', first=True).attrs['src']
    print(src)
    display(Image(url=src))
http://html.python-requests.org/
小例子
from requests_html import HTMLSession
from IPython.display import display, Image
# Session object through which the download loop that follows issues its requests.
session = HTMLSession()
%%time
# Request pages 1..14 of the chapter, show each image inline and report progress.
for i in range(1, 15):
    r = session.get('http://www.gugu5.com/n/14178/531259.html?p=%s' % i)
    # Execute the page's JavaScript before the DOM is queried.
    r.html.render()
    src = r.html.find('#qTcms_pic', first=True).attrs['src']
    display(Image(url=src))
    # Progress message ("page N"); the stray "(yè)" pinyin residue that had
    # leaked into the literal is removed.
    print('第%s頁' % i)
爬取貓眼top100電影
import requests
from bs4 import BeautifulSoup
import csv
def get_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx status code.
    """
    headers = {
        'Host': 'maoyan.com',
        'Referer': 'http://maoyan.com/board',
        'User-Agent': 'Mozilla/5.0'
    }
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page on
    # as if it were board content.
    r.raise_for_status()
    return r.text
def _text_after_colon(text):
    """Return the part of *text* after its last colon, stripped of whitespace."""
    # Accept both the ASCII ':' and the full-width colon (U+FF1A) as separator.
    normalized = text.strip().replace('\uff1a', ':')
    return normalized.split(':')[-1].strip()


def parse(content):
    """Extract the movie rows from the HTML of one board page.

    Args:
        content: HTML text of the page.

    Returns:
        A list with one ``[rank, title, stars, releasetime, score]`` list per
        ``<dd>`` found inside ``<dl class="board-wrapper">``, in document
        order. An empty list when the page has no such ``<dl>`` element.
    """
    soup = BeautifulSoup(content, 'lxml')
    dl = soup.find('dl', attrs={'class': 'board-wrapper'})
    if dl is None:
        # No board on this page: nothing to extract, rather than crashing
        # with AttributeError on None.
        return []

    movies = []
    for dd in dl.find_all('dd'):
        rank = dd.find('i').text
        title = dd.find('div', attrs={'class': "movie-item-info"}).find('a').text
        # The "star" and "releasetime" paragraphs carry a label before a
        # colon; only the value after it is kept.
        stars = _text_after_colon(dd.find('p', attrs={'class': 'star'}).text)
        releasetime = _text_after_colon(dd.find('p', attrs={'class': 'releasetime'}).text)
        score = dd.find('p', attrs={'class': 'score'}).text
        movies.append([rank, title, stars, releasetime, score])
    return movies
def write(rows, file, encoding=None):
    """Append *rows* to the CSV file at *file*.

    Args:
        rows: Iterable of row sequences; each one becomes a CSV record.
        file: Path of the CSV file. It is opened in append mode, so it is
            created when missing and existing content is kept.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default, exactly as before; pass ``'utf-8'``
            for output that does not depend on the machine's locale.
    """
    # newline='' leaves line-ending handling to the csv module.
    with open(file, 'a', newline='', encoding=encoding) as f:
        csv.writer(f).writerows(rows)
def main():
    """Fetch the ten board pages (offsets 0, 10, ..., 90) and append their rows to 1.csv."""
    url = 'http://maoyan.com/board/4?offset='
    for offset in range(0, 100, 10):
        page_url = url + str(offset)
        print(page_url)
        # Download, parse and persist one page at a time.
        write(parse(get_page(page_url)), '1.csv')


if __name__ == '__main__':
    main()