# -*- coding:utf-8 -*-
# 抓取今日頭條街拍圖集
import json
from multiprocessing import Pool
from urllib.parse import urlencode
import pymysql
import requests
import re
from bs4 import BeautifulSoup
import os
from requests.exceptions import RequestException
# HTTP request headers: desktop Chrome UA so Toutiao serves the normal page,
# plus language/encoding hints matching the target (Chinese) site.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"
    , "accept-language": "zh-CN,zh;q=0.9"
    , "accept-encoding": "gzip"}
# Crawl paging parameters: offsets run GROUP_START*20 .. GROUP_END*20.
GROUP_START=0
GROUP_END=20
KEYWORD='街拍'  # search keyword ("street snap")
# MySQL connection settings.
# NOTE(review): empty root password and the 'sys' schema look like a local
# dev setup — confirm before running elsewhere.
HOST = 'localhost'
DB = 'sys'
UNAME = 'root'
PWD = ''
PORT = 3306
# Fetch one page of the search-index listing.
def get_page_index(offset, keyword):
    """Return the index-page body (JSON text) for one search page, or None.

    offset  -- paging offset (multiples of 20)
    keyword -- search keyword sent to Toutiao's search_content API
    Returns None on a non-200 status or on any requests exception.
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '3',
        'from': 'gallery'
    }
    # Build the URL to crawl with the query string properly encoded.
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        # Send the browser-like headers (consistent with get_page_detail);
        # the API tends to reject the default python-requests User-Agent.
        # A timeout keeps a stalled connection from hanging the worker.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print("請求索引頁出錯")
        return None
# Parse the index-page JSON and yield every gallery (article) URL in it.
def parse_page_index(html):
    """Yield each gallery URL found in an index-page response.

    html -- JSON text returned by get_page_index, or None on fetch failure.
    Yields nothing for None/empty input (the original crashed with
    TypeError on json.loads(None)) and skips entries without an
    'article_url' (the original yielded None for those).
    """
    if not html:
        return
    data = json.loads(html)  # convert the response body to a dict
    if data and 'data' in data:
        for item in data.get('data'):
            article_url = item.get('article_url')
            if article_url:
                yield article_url
# Fetch one gallery detail page.
def get_page_detail(url):
    """Return the detail page's HTML text, or None on error / non-200."""
    try:
        resp = requests.get(url, headers=headers)
    except RequestException:
        print("請求詳情頁出錯",url)
        return None
    return resp.text if resp.status_code == 200 else None
# Parse a detail page with BeautifulSoup: extract the title and image URLs,
# download each image, and record each one in MySQL.
def parse_page_detail(html, url):
    """Extract the gallery title and image URLs from a detail page.

    html -- page HTML from get_page_detail
    url  -- the gallery's page URL (stored alongside each image)
    Side effects: downloads every image (download_image) and inserts one
    MySQL row per image (save_to_mysql).
    Returns {'title', 'url', 'images'} on success; None when the page
    cannot be parsed or no embedded gallery JSON is found.
    """
    soup = BeautifulSoup(html, 'html.parser')
    try:
        title = soup.select('title')[0].text
        print(title)
        # The gallery data is embedded in a JSON.parse("...") call inside a
        # script tag. Raw strings so the backslash escapes reach the regex
        # engine intact (the original non-raw '\(' is an invalid str escape).
        result = re.search(r'JSON.parse\("(.*?)\),', html)
    except Exception:
        print('解析頁面異常。')
        return None
    if not result:
        return None
    # Strip the JS backslash-escaping, then pull every "url":"..." value.
    res = result.group(1).replace('\\', '')
    image_urls = re.findall(r'"url":"(.*?)"', res)
    # De-duplicate while preserving first-seen order (the original used a
    # side-effecting list comprehension for this).
    unique_urls = list(dict.fromkeys(image_urls))
    # Download each image and save its record to MySQL.
    for image_url in unique_urls:
        download_image(image_url)
        save_to_mysql({
            'title': title,
            'url': url,
            'image_url': image_url
        })
    return {
        'title': title,
        'url': url,
        'images': unique_urls
    }
# Download a single image.
def download_image(url):
    """Fetch one image URL and hand its bytes to save_image; returns None."""
    # Last path segment doubles as the local file name.
    filename = url.split('/')[-1]
    try:
        resp = requests.get(url, headers=headers)
    except RequestException:
        print("請求圖片頁錯誤",url)
        return None
    if resp.status_code == 200:
        save_image(resp.content, filename)
    return None
# Save image bytes to disk.
def save_image(content, filename):
    """Write image bytes to <cwd>/toutiaojiepai/<filename>.jpg.

    content  -- raw image bytes
    filename -- base name (no extension; '.jpg' is always appended)
    Skips the write when the file already exists, so re-runs do not
    overwrite previously downloaded images. Assumes the directory exists
    (main() creates it via mkdir before crawling).
    """
    file_path = '{0}/{1}.{2}'.format(os.getcwd()+"/toutiaojiepai",filename,'jpg')
    print("正在下載:",file_path)
    if not os.path.exists(file_path):
        # 'with' closes the handle; the original's explicit f.close() inside
        # the with-block was redundant.
        with open(file_path, 'wb') as f:
            f.write(content)
# Save one image record to the database.
def save_to_mysql(data):
    """Insert one (title, image_url) row into the meituijiepai table.

    data -- dict with 'title', 'url', 'image_url' keys; only title and
            image_url are stored, matching the original two-column insert.
    Errors are printed and swallowed so one bad row does not stop the crawl.
    """
    try:
        conn = pymysql.connect(host=HOST, port=PORT, user=UNAME, password=PWD, db=DB,
                               charset="utf8")
        try:
            cursor = conn.cursor()
            # Parameterized query: the original string-concatenated INSERT
            # was open to SQL injection via scraped titles/URLs and broke on
            # any title containing a double quote.
            insert_sql = 'insert into meituijiepai values(%s, %s)'
            print(insert_sql)
            cursor.execute(insert_sql, (data["title"], data['image_url']))
            conn.commit()
            cursor.close()
        finally:
            conn.close()
    except Exception as e:
        # The original printed 'wrong' + e, which itself raises TypeError
        # (cannot concatenate str and Exception).
        print('wrong', e)
# Create a directory if it does not exist yet.
def mkdir(path):
    """Create directory `path` (after trimming whitespace and trailing
    backslashes); return True when newly created, False when it existed."""
    cleaned = path.strip().rstrip("\\")
    if os.path.exists(cleaned):
        print(cleaned + ' 目錄已存在')
        return False
    os.makedirs(cleaned)
    print(cleaned + ' 創(chuàng)建成功')
    return True
def main(offset):
    """Crawl one index page: fetch the listing, make sure the image
    directory exists, then parse and process every gallery it lists."""
    print("開始:"+str(offset),KEYWORD)
    index_html = get_page_index(offset, KEYWORD)
    mkdir(os.getcwd() + "/toutiaojiepai")
    for article_url in parse_page_index(index_html):
        detail_html = get_page_detail(article_url)
        print(parse_page_detail(detail_html, article_url))
if __name__ == '__main__':
    # One offset per task: 0, 20, ..., GROUP_END*20 (inclusive).
    groups = [x*20 for x in range(GROUP_START, GROUP_END+1)]
    # Context manager terminates the pool on exit; the original never called
    # close()/join(), leaking worker processes.
    with Pool() as pool:
        pool.map(main, groups)
# 效果如下: ("the results are as follows" — stray blog text that followed a
# screenshot in the original post; commented out so the file parses)