Python3+Scrapy+phantomJs+Selenium爬取今日頭條
在實現爬蟲的過程中,我們不可避免地會爬取由 js 以及 Ajax 等動態網頁技術生成網頁內容的網站,今日頭條就是一個很好的例子。
本文所要介紹的是基於 Python3,配合 Scrapy + PhantomJS + Selenium 框架的動態網頁爬取技術。
本文所實現的 2 個項目已上傳至 Github 中,求 Star~ 1. 爬取今日頭條新聞列表 URL: 2. 爬取今日頭條新聞內容:
靜態網頁爬取技術以及 Windows 下爬蟲環境搭建請移步前幾篇博客,必要的安裝軟件也在上一篇博客中提供。
本文介紹使用 PhantomJS + Selenium 實現新聞內容的爬取,爬取新聞列表的 URL 也是相同的原理,不再贅述。
項目結構
項目原理
底層代碼使用 Python3,網絡爬蟲基礎框架採用 Scrapy。由於爬取的是動態網頁,整個網頁並不是直接生成頁面,而是通過 Ajax 等技術動態生成,所以這裡考慮採用 PhantomJS + Selenium 模擬實現一個無界面的瀏覽器,去模擬用戶操作,抓取網頁代碼內容。
代碼文件說明
項目結構從上到下依次為:
middleware.py:整個項目的核心,用於啟動中間件,在 Scrapy 抓取調用 request 的過程中實現模擬用戶操作瀏覽器
ContentSpider.py:爬蟲類文件,定義爬蟲
commonUtils:工具類
items.py:爬蟲所抓取到的字段存儲類
pipelines.py:抓取到的數據處理類
這 5 個為關鍵類代碼,其餘的代碼為業務相關代碼。
關鍵代碼講解
middleware.py
"""
douguo request middleware
for the pages which are loaded by js/ajax
any changes should be recorded here:
@author zhangjianfei
@date 2017/05/04
"""
from selenium import webdriver
from scrapy.http import HtmlResponse
from DgSpiderPhantomJS import settings
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import random
class JavaScriptMiddleware(object):
    """Middleware that fetches each request through a headless PhantomJS browser.

    ``process_request`` loads the request URL in PhantomJS (driven by
    Selenium), scrolls the page, and returns the resulting page source as an
    ``HtmlResponse`` so that content generated by JavaScript/Ajax is present
    in the body handed to the spider.
    """

    # Runs once, when the class body is executed.
    print("LOGS Starting Middleware ...")

    def process_request(self, request, spider):
        """Render ``request.url`` in PhantomJS and return the rendered page.

        :param request: the Scrapy request being processed; only ``url`` is read.
        :param spider: the spider issuing the request (unused here).
        :return: an ``HtmlResponse`` built from the browser's final URL and
            page source, linked back to ``request``.
        """
        print("LOGS: process_request is starting ...")

        # Start from the default PhantomJS capabilities and pick a random
        # user agent from the project settings for this request.
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = random.choice(settings.USER_AGENTS)

        # Launch a fresh PhantomJS process for this request.
        driver = webdriver.PhantomJS(
            executable_path=r"D:\phantomjs-2.1.1\bin\phantomjs.exe",
            desired_capabilities=dcap,
        )
        try:
            # 60-second limits for page load and for script execution.
            driver.set_page_load_timeout(60)
            driver.set_script_timeout(60)

            driver.get(request.url)

            # Simulate a user scrolling down (scrollTop = 10000) so that
            # lazily loaded content is requested.
            driver.execute_script("document.body.scrollTop=10000")

            # NOTE(review): implicitly_wait only configures the element-lookup
            # timeout; it presumably does not pause here for pending Ajax
            # requests as the original comment intended -- confirm and
            # consider an explicit wait.
            driver.implicitly_wait(20)

            # Capture everything needed from the browser before shutting it down.
            body = driver.page_source
            current_url = driver.current_url
        finally:
            # Always terminate the PhantomJS process; without this every
            # request (including failed ones) leaves a browser process behind.
            driver.quit()

        return HtmlResponse(current_url, body=body, encoding='utf-8', request=request)
# -*- coding: utf-8 -*-
import scrapy
import random
import time
from DgSpiderPhantomJS.items import DgspiderPostItem
from scrapy.selector import Selector
from DgSpiderPhantomJS import urlSettings
from DgSpiderPhantomJS import contentSettings
from DgSpiderPhantomJS.mysqlUtils import dbhandle_update_status
from DgSpiderPhantomJS.mysqlUtils import dbhandle_geturl
class DgContentSpider(scrapy.Spider):
    """Spider that crawls a single post URL obtained from the database.

    Everything in the class body runs once, when the class itself is
    defined: it sleeps for a random interval, reads one row via
    ``dbhandle_geturl()``, configures the spider from that row and then marks
    the URL with status 1.
    """

    print('LOGS: Spider Content_Spider Staring ...')

    # Wait a random 60-90 seconds before touching the database.
    sleep_time = random.randint(60, 90)
    print("LOGS: Sleeping :{}".format(sleep_time))
    time.sleep(sleep_time)

    # Row describing what to crawl; fields are read by position.
    # Index 1 (the spider name) is intentionally not used.
    result = dbhandle_geturl()
    url = result[0]
    site = result[2]
    gid = result[3]
    module = result[4]

    # Scrapy configuration derived from the row above.
    name = 'Content_Spider'
    allowed_domains = [site]
    start_urls = [url]

    # Set status to 1 whether or not the crawl later succeeds, so the same
    # URL is not picked up again in an endless loop.
    dbhandle_update_status(url, 1)

    def parse(self, response):
        """Extract title and content from ``response`` and yield one item.

        :param response: the response for the start URL.
        :return: generator yielding a single ``DgspiderPostItem`` with
            ``title``, ``text`` and ``url`` populated.
        """
        post = DgspiderPostItem()

        selector = Selector(response)
        print(selector)

        # Title: string value of every node matched by the title XPath.
        title_nodes = selector.xpath(contentSettings.POST_TITLE_XPATH)
        post['title'] = title_nodes.xpath('string(.)').extract()

        # Content: markup of every node matched by the content XPath.
        content_nodes = selector.xpath(contentSettings.POST_CONTENT_XPATH)
        post['text'] = content_nodes.extract()

        # The URL recorded is the one read from the database, not response.url.
        post['url'] = DgContentSpider.url

        yield post
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class DgspiderUrlItem(scrapy.Item):
    """Scraped item holding a single ``url`` field."""

    # The only field declared on this item.
    url = scrapy.Field()
class DgspiderPostItem(scrapy.Item):
    """Scraped item for one post: its ``url``, ``title`` and ``text`` fields."""

    # Field declarations; the names are the keys used when the item is filled.
    url = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
# -*- coding: utf-8 -*-
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import re
import datetime
import urllib.request
from DgSpiderPhantomJS import urlSettings
from DgSpiderPhantomJS import contentSettings
from DgSpiderPhantomJS.mysqlUtils import dbhandle_insert_content
from DgSpiderPhantomJS.uploadUtils import uploadImage
from DgSpiderPhantomJS.mysqlUtils import dbhandle_online
from DgSpiderPhantomJS.PostHandle import post_handel
from DgSpiderPhantomJS.mysqlUtils import dbhandle_update_status
from bs4 import BeautifulSoup
from DgSpiderPhantomJS.commonUtils import get_random_user
from DgSpiderPhantomJS.commonUtils import get_linkmd5id
class DgspiderphantomjsPipeline(object):
    """Item pipeline that cleans a crawled post, re-uploads its images and stores it.

    Results are accumulated on *class* attributes (``title``, ``text``,
    ``url``, ``has_img``) across ``process_item`` calls and written out in
    ``close_spider``.
    """

    # Reply list for the post (not used inside this class).
    cs = []
    # Post title (set from the first processed item only).
    title = ''
    # Accumulated post text.
    text = ''
    # URL of the item processed most recently.
    url = ''
    # Random user id the post and its images are created under.
    user_id = ''
    # 1 once at least one image has been found in the content.
    has_img = 0
    # Number of items processed so far; the title is taken when this is 1.
    get_title_flag = 0

    # Matches one <img ...> tag (no newlines inside the tag).
    _IMG_TAG_RE = re.compile(r'<img.*?>')

    def __init__(self):
        DgspiderphantomjsPipeline.user_id = get_random_user(contentSettings.CREATE_POST_USER)

    @staticmethod
    def _extract_img_url(img_tag):
        """Return the absolute image URL of ``img_tag``, or ``None``.

        The value of the last double-quoted ``src="..."`` attribute in the tag
        is used. A URL that already carries an ``http:``/``https:`` scheme is
        returned unchanged; anything else (e.g. protocol-relative
        ``//host/path``) gets ``http:`` prepended. ``None`` is returned when
        the tag has no such attribute or the attribute is empty.
        """
        match = re.search(r'.*src="([^"]*)"', img_tag, re.I)
        if match is None or not match.group(1):
            return None
        src = match.group(1)
        if re.match(r'https?:', src, re.I):
            return src
        return 'http:' + src

    def _rehost_images(self, html, spider):
        """Download and upload every image in ``html``; return the rewritten html.

        Each ``<img>`` tag with a usable source is replaced by
        ``[dgimg]<image_url>;<w>;<h>[/dgimg]`` built from the upload result.
        Tags without a usable source are left in place and logged.
        """
        for img in self._IMG_TAG_RE.findall(html):
            img_url = self._extract_img_url(img)
            if img_url is None:
                # Previously this case crashed on ``None.group(1)``.
                spider.logger.warning('skipping <img> without a usable src: %s', img)
                continue

            DgspiderphantomjsPipeline.has_img = 1

            # Local storage path: image store prefix + last URL path segment.
            file_name = img_url.rsplit('/', 1)[-1]
            file_path = contentSettings.IMAGES_STORE + file_name

            # Fetch the image to disk, then upload it.
            urllib.request.urlretrieve(img_url, file_path)
            upload_result = uploadImage(file_path, 'image/jpeg', DgspiderphantomjsPipeline.user_id)

            # Server-side path, width and height returned by the upload.
            info = upload_result['result']
            marker = ';'.join(str(info[key]) for key in ('image_url', 'w', 'h'))

            # Replace the tag with a text marker that survives text extraction.
            html = html.replace(img, '[dgimg]' + marker + '[/dgimg]')
        return html

    def process_item(self, item, spider):
        """Accumulate title and text of ``item`` on the class and return the item.

        :param item: mapping with ``url``, ``title`` (list) and ``text`` (list).
        :param spider: the running spider; its logger is used for warnings.
        :return: ``item`` unchanged.
        """
        self.get_title_flag += 1

        DgspiderphantomjsPipeline.url = item['url']

        raw_title = item['title'][0] if item['title'] else ''

        # For multi-page articles only the first page's title is kept.
        if self.get_title_flag == 1:
            # stripped_strings is used instead of prettify(), which would put
            # every tag on its own line and introduce extra blanks/newlines.
            title = ''.join(BeautifulSoup(raw_title, "lxml").stripped_strings)
            # Swap ASCII quotes for typographic ones; NOTE(review): presumably
            # because the stored value ends up in a SQL string -- confirm.
            DgspiderphantomjsPipeline.title = title.replace("'", "”").replace('"', '“')

        raw_text = item['text'][0] if item['text'] else ''
        html = str(BeautifulSoup(raw_text, "lxml"))

        html = self._rehost_images(html, spider)

        # Drop <strong> markup before text extraction.
        html = html.replace('<strong>', '').replace('</strong>', '')

        # One paragraph per stripped string, separated by blank lines.
        text = ''.join(
            chunk + '\n\n' for chunk in BeautifulSoup(html, "lxml").stripped_strings
        )

        # Replace ASCII double quotes with a typographic one before storing.
        DgspiderphantomjsPipeline.text = self.text + text.replace('"', '“')
        return item

    def open_spider(self, spider):
        """Called when the spider is opened; nothing to do."""
        pass

    def close_spider(self, spider):
        """Persist the accumulated post when the spider closes.

        The post is always inserted through ``dbhandle_insert_content``;
        ``post_handel(url)`` is called only when an image was found and both
        title and content are non-blank.
        """
        cls = DgspiderphantomjsPipeline
        url = cls.url
        title = cls.title
        content = cls.text
        create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        dbhandle_insert_content(url, title, content, cls.user_id, cls.has_img, create_time)

        if cls.has_img != 1:
            spider.logger.info('status=1 , has_img=0, changing status and ready to exit...')
        elif title.strip() != '' and content.strip() != '':
            spider.logger.info('status=2 , has_img=1, title and content is not null! Uploading post into db...')
            post_handel(url)
        else:
            spider.logger.info('status=1 , has_img=1, but title or content is null! ready to exit...')
轉自:
http://blog.csdn.net/qq_31573519/article/details/74248559
灵汪、