一、獲取網頁
import requests
def get_page():
    """Download the Maoyan board page (board/4) and return its HTML.

    Returns:
        The response body decoded as UTF-8 when the server answers with
        HTTP 200, otherwise None.
    """
    board_url = 'https://maoyan.com/board/4'
    # Request headers: some anti-scraping checks inspect the User-Agent.
    request_headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    reply = requests.get(board_url, headers=request_headers)
    if reply.status_code != 200:
        return None
    return reply.content.decode('utf-8')
def main():
    """Fetch the board page and print whatever get_page() returned."""
    print(get_page())


# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
解析網頁
- 在main()上面再定義一個parse_page()
import requests
import re
def get_page():
    """Download the Maoyan board page (board/4) and return its HTML.

    Returns:
        The response body decoded as UTF-8 when the server answers with
        HTTP 200, otherwise None.
    """
    # Request headers: some anti-scraping checks inspect the User-Agent.
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    url = 'https://maoyan.com/board/4'
    # The headers must be handed to requests explicitly; otherwise the
    # library's default User-Agent is sent and the dict above has no effect.
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.content.decode('utf-8')
    return None
def parse_page(html):
    """Extract the movie fields from a board page and print each list.

    Every field is pulled out with its own regular expression; re.S is set
    on all of them so that '.' also matches newlines.

    Args:
        html: Page source as returned by get_page(). None (get_page()'s
            result for a non-200 response) is treated as an empty page, so
            every printed list is empty instead of re.findall raising
            TypeError.

    Returns:
        None. The extracted lists are only printed.
    """
    if html is None:
        html = ''

    # Title: alt text of the second <img> following a movieId attribute.
    pattern = re.compile('movieId.*?>.*?<img.*?<img.*?alt="(.*?)" class.*?', re.S)
    movie_names = pattern.findall(html)
    print('片名:', movie_names)

    # Starring actors, with surrounding whitespace removed.
    pattern = re.compile('<p class="star">(.*?)</p>', re.S)
    actors = [actor.strip() for actor in pattern.findall(html)]
    print('主演:', actors)

    # Release date.
    pattern = re.compile('<p class="releasetime">(.*?)</p>', re.S)
    release_times = [item.strip() for item in pattern.findall(html)]
    print('上映時(shí)間:', release_times)

    # Cover image: data-src of the second <img> following a movieId attribute.
    pattern = re.compile('movieId.*?>.*?<img.*?<img.*?data-src="(.*?)" alt.*?', re.S)
    img = pattern.findall(html)
    print('封面圖片:', img)

    # Rank, taken from the board-index-N class suffix.
    pattern = re.compile('<i class="board-index board-index-(.*?)">.*?</i>', re.S)
    rank = pattern.findall(html)
    print('排名:', rank)

    # Score: the integer and fraction parts are separate elements, so each
    # match is a 2-tuple that is joined back into one string.
    pattern = re.compile('<p class="score"><i class="integer">(.*?)</i><i class="fraction">(.*?)</i></p>', re.S)
    score = [''.join(parts) for parts in pattern.findall(html)]
    print('評(píng)分:', score)

    # Link to the detail page (href of the anchor inside <p class="name">).
    pattern = re.compile('<div class="movie-item-info">.*?<p class="name"><a href="(.*?)" title', re.S)
    detail = [link.strip() for link in pattern.findall(html)]
    print('鏈接:', detail)
def main():
    """Fetch the board page and print the fields parsed from it."""
    page_source = get_page()
    parse_page(page_source)


# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
# 打印的內容
>>>>
片名: ['霸王別姬', '肖申克的救贖', '羅馬假日', '這個殺手不太冷', '泰坦尼克號', '教父', '唐伯虎點秋香', '千與千尋', '魂斷藍橋', '亂世佳人']
主演: ['主演:張國榮,張豐毅,鞏俐', '主演:蒂姆·羅賓斯,摩根·弗里曼,鮑勃·岡頓', '主演:格利高里·派克,奧黛麗·赫本,埃迪·艾伯特', '主演:讓·雷諾,加里·奧德曼,娜塔莉·波特曼', '主演:萊昂納多·迪卡普里奧,凱特·溫絲萊特,比利·贊恩', '主演:馬龍·白蘭度,阿爾·帕西諾,詹姆斯·肯恩', '主演:周星馳,鞏俐,鄭佩佩', '主演:柊瑠美,入野自由,夏木真理', '主演:費雯·麗,羅伯特·泰勒,露塞爾·沃特森', '主演:費雯·麗,克拉克·蓋博,奧利維婭·德哈維蘭']
上映時間: ['上映時間:1993-01-01', '上映時間:1994-10-14(美國)', '上映時間:1953-09-02(美國)', '上映時間:1994-09-14(法國)', '上映時間:1998-04-03', '上映時間:1972-03-24(美國)', '上映時間:1993-07-01(中國香港)', '上映時間:2001-07-20(日本)', '上映時間:1940-05-17(美國)', '上映時間:1939-12-15(美國)']
封面圖片: ['https://p1.meituan.net/movie/20803f59291c47e1e116c11963ce019e68711.jpg@160w_220h_1e_1c', 'https://p0.meituan.net/movie/283292171619cdfd5b240c8fd093f1eb255670.jpg@160w_220h_1e_1c', 'https://p0.meituan.net/movie/54617769d96807e4d81804284ffe2a27239007.jpg@160w_220h_1e_1c', 'https://p0.meituan.net/movie/e55ec5d18ccc83ba7db68caae54f165f95924.jpg@160w_220h_1e_1c', 'https://p1.meituan.net/movie/0699ac97c82cf01638aa5023562d6134351277.jpg@160w_220h_1e_1c', 'https://p1.meituan.net/movie/f5a924f362f050881f2b8f82e852747c118515.jpg@160w_220h_1e_1c', 'https://p0.meituan.net/movie/da64660f82b98cdc1b8a3804e69609e041108.jpg@160w_220h_1e_1c', 'https://p0.meituan.net/movie/b076ce63e9860ecf1ee9839badee5228329384.jpg@160w_220h_1e_1c', 'https://p0.meituan.net/movie/46c29a8b8d8424bdda7715e6fd779c66235684.jpg@160w_220h_1e_1c', 'https://p0.meituan.net/movie/230e71d398e0c54730d58dc4bb6e4cca51662.jpg@160w_220h_1e_1c']
排名: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
評分: ['9.6', '9.5', '9.1', '9.5', '9.6', '9.3', '9.2', '9.3', '9.2', '9.1']
鏈接: ['/films/1203', '/films/1297', '/films/2641', '/films/4055', '/films/267', '/films/1247', '/films/837', '/films/1212', '/films/2760', '/films/7431']
下面附完整代碼(將爬取到的內容存到本地)
import json
import re
import requests
def get_page(page):
    """Download one page of the Maoyan board (board/4) and return its HTML.

    Args:
        page: Value appended to the URL as the ``offset`` query parameter.

    Returns:
        The response body decoded as UTF-8 when the server answers with
        HTTP 200, otherwise None.
    """
    # Request headers sent with the page request.
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    url = 'https://maoyan.com/board/4?offset='+str(page)
    # The headers must be handed to requests explicitly; otherwise the
    # library's default User-Agent is sent and the dict above has no effect.
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.content.decode('utf-8')
    return None
def get_all_pages():
    """Fetch and parse the ten board pages at offsets 0, 10, ..., 90.

    Returns:
        A list with one entry per successfully fetched page, in offset
        order; each entry is the list returned by parse_page() for that
        page.
    """
    result = []
    for page in range(0, 100, 10):
        html = get_page(page)
        if html is None:
            # get_page() reports a non-200 response as None. Skip that page
            # so one failed download does not abort the whole crawl.
            print('skip offset %d: page could not be fetched' % page)
            continue
        result.append(parse_page(html))
    return result
def save_img(cover_url):
    """Download one cover image and store it under ./images.

    The file name is the last path segment of the URL with everything from
    the first '@' onwards removed, e.g. 'abc.jpg@160w_220h_1e_1c' becomes
    'abc.jpg'.

    Args:
        cover_url: Absolute URL of the image to download.
    """
    import os

    response = requests.get(cover_url)
    if response.status_code != 200:
        # Do not write an error body to disk under an image file name.
        print('skip image (HTTP %d): %s' % (response.status_code, cover_url))
        return
    filename = cover_url.split('/')[-1].split('@')[0]
    # open() does not create missing directories, so make sure the target
    # directory exists before writing.
    os.makedirs('./images', exist_ok=True)
    with open('./images/%s' % filename, 'wb') as f:
        f.write(response.content)
def parse_page(html):
    """Extract the movies listed on one board page.

    Every field is pulled out with its own regular expression (re.S is set
    so that '.' also matches newlines) and the per-field lists are then
    combined position by position. Each cover image is downloaded through
    save_img() while the records are built.

    Args:
        html: Page source as returned by get_page(). None (get_page()'s
            result for a non-200 response) is treated as an empty page and
            yields an empty list instead of re.findall raising TypeError.

    Returns:
        A list of dicts with the keys 'movie_name', 'actor', 'time', 'img',
        'rank', 'score' and 'detail'; all values are strings.
    """
    if html is None:
        html = ''

    # Title: alt text of the second <img> following a movieId attribute.
    pattern = re.compile('movieId.*?>.*?<img.*?<img.*?alt="(.*?)" class.*?', re.S)
    movie_names = pattern.findall(html)

    # Starring actors, with surrounding whitespace removed.
    pattern = re.compile('<p class="star">(.*?)</p>', re.S)
    actors = [actor.strip() for actor in pattern.findall(html)]

    # Release date.
    pattern = re.compile('<p class="releasetime">(.*?)</p>', re.S)
    release_times = [item.strip() for item in pattern.findall(html)]

    # Cover image: data-src of the second <img> following a movieId attribute.
    pattern = re.compile('movieId.*?>.*?<img.*?<img.*?data-src="(.*?)" alt.*?', re.S)
    img = pattern.findall(html)

    # Rank, taken from the board-index-N class suffix.
    pattern = re.compile('<i class="board-index board-index-(.*?)">.*?</i>', re.S)
    rank = pattern.findall(html)

    # Score: the integer and fraction parts are separate elements, so each
    # match is a 2-tuple that is joined back into one string.
    pattern = re.compile('<p class="score"><i class="integer">(.*?)</i><i class="fraction">(.*?)</i></p>', re.S)
    score = [''.join(parts) for parts in pattern.findall(html)]

    # Link to the detail page (href of the anchor inside <p class="name">).
    pattern = re.compile('<div class="movie-item-info">.*?<p class="name"><a href="(.*?)" title', re.S)
    detail = [link.strip() for link in pattern.findall(html)]

    print('spider...')

    # Assemble one JSON-serializable record per movie.
    # NOTE: zip() stops at the shortest list, so a page on which one field
    # failed to match for some movie yields fewer records rather than an
    # IndexError part-way through.
    result_list = []
    for name, actor, released, cover_url, position, rating, link in zip(
            movie_names, actors, release_times, img, rank, score, detail):
        result_dict = {
            'movie_name': name,
            'actor': actor,
            'time': released,
            'img': cover_url,
            'rank': position,
            'score': rating,
            'detail': link,
        }
        # Save the cover image locally.
        save_img(cover_url)
        result_list.append(result_dict)
    return result_list
def save_json_file(result):
    """Write result as JSON text to maoyan.json in the working directory.

    Non-ASCII characters are written as-is (ensure_ascii=False) and the file
    is encoded as UTF-8.

    Args:
        result: Any JSON-serializable object.
    """
    # Serialize before opening so a serialization error cannot leave a
    # truncated file behind.
    serialized = json.dumps(result, ensure_ascii=False)
    with open('maoyan.json', mode='w', encoding='utf-8') as out_file:
        out_file.write(serialized)
def main():
    """Crawl every board page, print the collected data and save it as JSON."""
    movies_by_page = get_all_pages()
    print(movies_by_page)
    save_json_file(movies_by_page)


# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()