本文內容參考來自:
今日頭條街拍圖集首頁是利用ajax異步加載方式加載數據。
詳情頁組圖的URL在html頁面的javascript中,使用正則表達式匹配。
完整代碼如下:
1.toutiao_spider.py (爬取圖片并且下載到本地,同時保存信息到mongoDB中)
#-*- coding: utf-8 -*-
import re
from urllib.parse import urlencode
import pymongo
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
import requests
import json
import os
from hashlib import md5
from Mongo_config import *
from multiprocessing import Pool
# Default HTTP headers sent with every request made by this spider.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Connection': 'Keep-Alive',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}

# Module-level MongoDB handles used by save_to_mongo().
# NOTE(review): connect=False presumably defers the connection so that each
# multiprocessing worker opens its own after the fork — confirm against the
# PyMongo documentation.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGON_DB]
# Fetch one page of the search index.
def get_page_index(offset, keyword, timeout=10):
    """Request one page of gallery search results as raw text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before the request is
            abandoned. Without a timeout a stalled connection would block
            the calling worker indefinitely.

    Returns:
        The response body when the status code is 200; ``None`` for any
        other status code or when the request raises ``RequestException``.
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1,
        'from': 'gallery',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('請求索引出錯')
        return None
    if response.status_code == 200:
        return response.text
    return None
# Parse the search index.
def parse_page_index(html):
    """Yield the article URLs contained in a search-index JSON document.

    Args:
        html: JSON text of an index page, or ``None``/empty when the fetch
            failed.

    Yields:
        Each non-empty ``article_url`` found in the entries of the
        top-level ``data`` list. Nothing is yielded when the input is
        empty, is not valid JSON, or has no usable ``data`` list.
    """
    if not html:
        return
    try:
        data = json.loads(html)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        return
    if not isinstance(data, dict):
        return
    # 'data' may be present but null, so fall back to an empty list.
    for item in data.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        # Entries without an article URL cannot be fetched; skip them.
        if article_url:
            yield article_url
# Fetch an article detail page.
def get_page_detail(url, timeout=10):
    """Download the HTML of one article detail page.

    Args:
        url: Address of the detail page.
        timeout: Seconds to wait for the server before the request is
            abandoned, so a stalled connection cannot block the worker
            forever.

    Returns:
        The response body when the status code is 200; ``None`` for any
        other status code or when the request raises ``RequestException``.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('請求詳情頁出錯', url)
        return None
    if response.status_code == 200:
        return response.text
    return None
# Parse an article detail page.
def parse_page_detail(html, url):
    """Extract the title and gallery image URLs from a detail page.

    The gallery data is embedded in the page's JavaScript as
    ``gallery: JSON.parse("...")``; it is located with a regular
    expression, stripped of backslashes and decoded as JSON. Every image
    URL found is also passed to ``down_load_images``.

    Args:
        html: HTML text of the detail page.
        url: Address the page was fetched from; copied into the result.

    Returns:
        A dict with keys ``title``, ``url`` and ``images`` when the page
        contains gallery data with a ``sub_images`` entry, otherwise
        ``None``.
    """
    soup = BeautifulSoup(html, 'lxml')
    # Read the title; fall back to an empty string if the page has none.
    title_tags = soup.select('title')
    title = title_tags[0].get_text() if title_tags else ''
    # Pattern locating the embedded gallery JSON (raw string so the
    # backslashes reach the regex engine unchanged).
    images_pattern = re.compile(r'gallery: JSON.parse\("(.*?)"\)', re.S)
    result = re.search(images_pattern, html)
    # Pages without a gallery must be rejected before touching the match.
    if not result:
        return None
    # Remove the escaping backslashes so the payload parses as JSON.
    json_images = re.sub(r'\\{1,2}', '', result.group(1))
    try:
        images_data = json.loads(json_images)
    except ValueError:
        return None
    if not isinstance(images_data, dict) or 'sub_images' not in images_data:
        return None
    sub_images = images_data.get('sub_images') or []
    # Collect the URLs into a list.
    images = [item.get('url') for item in sub_images]
    # Download every image.
    for image in images:
        down_load_images(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }
# Download one image.
def down_load_images(url, timeout=10):
    """Download an image and hand its bytes to ``save_images``.

    Args:
        url: Address of the image.
        timeout: Seconds to wait for the server before the request is
            abandoned, so a stalled download cannot block the worker
            forever.

    Returns:
        Always ``None``. The image is saved only when the status code is
        200; request errors are reported on stdout.
    """
    print('正在下載', url)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('請求圖片出錯', url)
        return None
    if response.status_code == 200:
        save_images(response.content)
    return None
# Store one image on disk.
def save_images(content):
    """Write image bytes to ``<cwd>/<md5 of content>.jpg`` unless it exists.

    The file name is derived from the MD5 digest of the bytes, so identical
    images map to the same file and are stored only once.

    Args:
        content: Raw image bytes.
    """
    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    file_path = os.path.join(os.getcwd(), file_name)
    try:
        # Mode 'x' creates the file atomically and fails if it is already
        # there, which avoids a check-then-write race between workers.
        with open(file_path, 'xb') as f:
            f.write(content)
    except FileExistsError:
        # Same image already stored; nothing to do.
        pass
# Store one record in MongoDB.
def save_to_mongo(result):
    """Insert a parsed record into the ``MONGO_TABLE`` collection.

    Uses ``insert_one`` because ``Collection.insert`` is deprecated and no
    longer available in PyMongo 4.

    Args:
        result: Document (dict) to insert.

    Returns:
        ``True`` when the insert reports an inserted id, otherwise
        ``False``.
    """
    if db[MONGO_TABLE].insert_one(result).inserted_id:
        print('存儲成功', result)
        return True
    return False
def main(offset):
    """Crawl one index page and process every article it lists.

    Fetches the index page for ``offset`` and ``KEYWORD``, then for each
    article URL downloads the detail page, parses it, stores a non-empty
    result in MongoDB and prints the parse result.

    Args:
        offset: Offset of the index page to crawl.
    """
    index_html = get_page_index(offset, KEYWORD)
    # The index fetch returns None on failure; there is nothing to parse.
    if not index_html:
        return
    for url in parse_page_index(index_html):
        if not url:
            continue
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        if result:
            save_to_mongo(result)
        print(result)
if __name__ == '__main__':
    # One offset per index page: 20 results per page.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    # map() blocks until every offset is processed; the context manager
    # then releases the worker processes instead of leaving the pool open.
    with Pool() as pool:
        pool.map(main, groups)
2.Mongo_config (定義Mongodb全局配置信息)
#-*- coding: utf-8 -*-
# Global settings read by toutiao_spider.py.

# Host passed to pymongo.MongoClient.
MONGO_URL = 'localhost'
# Database name. The constant is spelled MONGON_DB here and in the spider,
# so the spelling must stay in sync with its reader.
MONGON_DB = 'toutiao'
# Collection the spider inserts records into.
MONGO_TABLE = 'toutiao'
# If MongoDB has no password, no database password is needed here; if it
# has one, it must be filled in.
# First and last (inclusive) page group; the spider multiplies each value
# in range(GROUP_START, GROUP_END + 1) by 20 to get the index offsets.
GROUP_START = 1
GROUP_END = 20
# Search keyword sent with every index request.
KEYWORD = '街拍'
3、運行結果