因為對 request、cookie 等不甚了解,所以選用最簡單的 selenium 爬取
selenium 的特點是所見即所得,爬取到的網頁結構和正常加載的一樣
配置也很簡單,使用 driver 將谷歌瀏覽器驅動起來即可
功能需求很簡單:
- 爬取個人的動態
- 保存至數據庫
- 如果檢測到更新,通過郵件通知
一 分析個人界面的網頁 url:
可以看出 id 後面的 XXXXXXXX(位數不固定)標識了每個用戶,想要更換用戶只需要找到對應用戶的 id 即可
二 分析網頁結構:
可以很容易地看出,網頁的結構如下
注意:爬取時需要從 frame 切換到 iframe
<iframe>
<html>
<div>.........</div> //為要爬取的內容
</html>
</iframe>
三 分析如何獲取元素:
selenium 提供的方法有很多,因為有的 div 的 id 是隨機生成的,class 結構也比較複雜;我使用了 full xpath 的方法,獲取方法也比較簡單,只需要使用 chrome 瀏覽器,
在網頁任意位置單擊右鍵 --> 點擊檢查 --> 選中要獲取的元素標籤,可以是 <div>、<li>、<a>、<span> --> 再次單擊右鍵 --> 選擇 copy --> 選擇 copy full xpath
此時會得到如下的串:
是該元素從網頁 <html> 標籤往下的結構路徑
/html/body/div[3]/div/div[2]/div[1]/div/div/div/div/ul/li[1]/div[2]/div/div[4]/div/div/div[2]/h3/a
通過 .text 屬性即可獲取其中的內容
源碼:
import re
from datetime import datetime,timedelta
import smtplib
from email.mime.text import MIMEText
from email.header import Header
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import pymysql
"""
selenium 模塊:爬取動態,返回入庫系統需要的基本信息
"""
def eye(url):
    """Scrape a user's activity page with headless Chrome.

    The page content lives inside the ``contentFrame`` iframe, so the driver
    switches into that frame before any element lookup.

    Args:
        url: Address of the activity page to load.

    Returns:
        A tuple ``(name, dynamic, addtimes, comments, songs, singers)``.
        ``name`` and ``dynamic`` are the text of the first element matched by
        their XPaths (presumably the user name and the activity count --
        confirm against the live page). The other four are lists holding the
        text of every element matched by the timestamp, comment, song-title
        and singer XPaths respectively.

    Raises:
        IndexError: If the ``name`` or ``dynamic`` XPath matches nothing.
    """
    # Run Chrome without a visible window.
    chrome_options = Options()
    for argument in (
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--headless',
        'blink-settings=imagesEnabled=false',
        '--disable-gpu',
    ):
        chrome_options.add_argument(argument)
    chrome_options.add_experimental_option(
        "excludeSwitches", ["ignore-certificate-errors", "enable-automation"])

    # ``options=`` replaces the ``chrome_options=`` keyword, which newer
    # Selenium releases no longer accept.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        # Implicit wait: element lookups poll for up to one second.
        driver.implicitly_wait(1)
        driver.switch_to.frame('contentFrame')

        def texts(xpath):
            # ``find_elements("xpath", ...)`` is the locator-based lookup that
            # replaces the removed ``find_elements_by_xpath`` helper.
            return [element.text for element in driver.find_elements("xpath", xpath)]

        # Full XPaths copied from the browser's developer tools.
        name = texts('/html/body/div[3]/div/dl/dd/div[1]/div/h2/span[1]')[0]
        dynamic = texts('/html/body/div[3]/div/dl/dd/ul/li[1]/a/strong')[0]

        # Every timeline entry shares this prefix; only the tail differs.
        entry = '/html/body/div[3]/div/div[2]/div[1]/div/div/div/div/ul/li/div[2]/div'
        addtimes = texts(entry + '/div[2]/a')
        comments = texts(entry + '/div[3]')
        songs = texts(entry + '/div[4]/div/div/div[2]/h3/a')
        singers = texts(entry + '/div[4]/div/div/div[2]/h4/a')
    finally:
        # Always shut the browser down, even when a lookup raises.
        driver.quit()
    return name, dynamic, addtimes, comments, songs, singers
"""
數據持久化模塊:將爬取的信息存儲到數據庫中
"""
def _resolve_addtime(raw, now):
    """Turn a timestamp label scraped from the page into the stored text.

    Relative labels ("just now", "N minutes ago", "yesterday ...") are
    converted to ``now`` minus the offset, formatted as "%m月%d日 %H:%M".
    Any other label is returned unchanged.
    """
    stamp_format = "%m月%d日 %H:%M"
    # Both Traditional and Simplified spellings are accepted; the live page
    # presumably renders Simplified text -- confirm against the site.
    if raw in ("剛剛", "刚刚"):
        print("剛剛")
        return (now + timedelta(minutes=-1)).strftime(stamp_format)
    if raw.endswith(("分鐘前", "分钟前")):
        print('分鐘前')
        reducetime = raw[:-3]  # the leading digits before the 3-char suffix
        print(reducetime, "計(jì)算時(shí)間")
        return (now + timedelta(minutes=-int(reducetime))).strftime(stamp_format)
    if raw.startswith("昨天"):
        print("昨天")
        return (now + timedelta(days=-1)).strftime(stamp_format)
    return raw


def keep(url):
    """Scrape the page via ``eye`` and persist anything new to MySQL.

    The latest stored ``dynamic`` value in ``eye_on_title`` is compared with
    the freshly scraped one. When they differ, a new ``eye_on_title`` row is
    written and every scraped entry not yet present in ``eye_on_timeline``
    (matched on comment + song) is inserted and appended to the e-mail text.
    A delete-plus-add that leaves the count unchanged is not detected.

    Args:
        url: Page address, passed straight through to ``eye``.

    Returns:
        A tuple ``(flag, email_message, error_message)``. ``flag`` is False
        when the stored and scraped ``dynamic`` values match (nothing is
        written), True otherwise.
    """
    flag = True
    email_message = ''
    error_message = ''
    name, dynamic, addtimes, comments, songs, singers = eye(url)

    # Keyword arguments: recent PyMySQL releases reject positional ones.
    db = pymysql.connect(host="XXXX", user="root", password="XXXXXX.",
                         database="數(shù)據(jù)庫(kù)名稱")
    try:
        cursor = db.cursor()
        cursor.execute("SELECT VERSION()")
        data = cursor.fetchone()
        print("Database version : %s " % data)

        # Most recent stored value; stays None when the table is empty or the
        # query fails, which is then treated as "changed".
        dynamic_number = None
        try:
            cursor.execute('select dynamic from eye_on_title order by `date` desc limit 1 ')
            result = cursor.fetchall()
            dynamic_number = result[0][0]
            print("查詢成功")
        except (pymysql.MySQLError, IndexError):
            print("出錯(cuò)")

        # Compare as text: the scraped value is a string while the column
        # type is not visible here.
        if str(dynamic_number) == str(dynamic):
            print("沒有更新")
            flag = False
            return flag, email_message, error_message

        today = datetime.today()
        sql = "insert into eye_on_title (name,dynamic,date) values (%s, %s, %s)"
        params = (name, dynamic, str(today))
        print(sql, params)
        try:
            # Parameter binding lets the driver quote the scraped text.
            cursor.execute(sql, params)
            db.commit()
            print("保存成功")
        except pymysql.MySQLError:
            db.rollback()
            print("出錯(cuò)")
        else:
            if str(dynamic) == "0":
                email_message = email_message + "keep_an_eye_on失敗,計(jì)劃暴露或結(jié)束請(qǐng)求撤離" + dynamic
            else:
                email_message = email_message + "提示:動(dòng)態(tài)更新 " + dynamic + "\n"

        # Check each scraped entry. zip() stops at the shortest list, so a
        # length mismatch between the scraped lists no longer raises
        # IndexError. NOTE(review): entries without a song may still leave
        # the lists misaligned -- verify against the page.
        rows = list(zip(addtimes, comments, songs, singers))
        for i, (addtime_label, comment, song, singer) in enumerate(rows):
            sql = "select * from eye_on_timeline where comment = %s and song = %s "
            params = (comment, song)
            print(sql, params)
            cursor.execute(sql, params)
            if cursor.fetchall():
                print("此條動(dòng)態(tài)已存在")
                if i + 1 == len(rows):
                    # Even the last scraped entry is already stored.
                    error_message = " \n(刪除后未添加新的內(nèi)容) " + today.strftime(
                        "%m月%d日 %H:%M")
                continue

            print("未檢測(cè)到此條動(dòng)態(tài)津滞,準(zhǔn)備寫入 ")
            addtime = _resolve_addtime(addtime_label, datetime.now())
            # NOTE(review): column is spelled `addtine` in the original query;
            # kept as-is -- confirm against the table schema.
            sql = "insert into eye_on_timeline (song,singer,`comment`,addtine)values(%s,%s,%s,%s) "
            params = (song, singer, comment, addtime)
            print(sql, params)
            sql_message = ("分享歌曲: " + song) + (" 歌手: " + singer) + (" 評(píng)論: " + comment + "\n")
            email_message = email_message + sql_message
            try:
                cursor.execute(sql, params)
                db.commit()
                print("保存成功")
            except pymysql.MySQLError:
                db.rollback()
                print("出錯(cuò)")
    finally:
        # Release the connection on every path, including errors.
        db.close()
    return flag, email_message, error_message
"""郵件模塊將檢測(cè)到的更新信息發(fā)送到郵箱內(nèi)提醒"""
def mail(email_message, url, error_message, flag):
    """Send the update notification by e-mail over SMTP-over-SSL.

    Args:
        email_message: Body text describing the detected updates.
        url: Page address appended to the body as a link.
        error_message: Extra text appended after the link.
        flag: Nothing is sent (and no connection is opened) when falsy.

    Returns:
        None. A failure inside ``sendmail`` is printed, not raised; connection
        and login errors propagate to the caller.
    """
    if not flag:
        return

    from_addr = 'XXXXXXXX@qq.com'  # sending account
    to_addrs = 'XXXXXXXXX@qq.com'  # receiving account
    qqCode = 'XXXXXXX'  # authorization code issued for the sending account
    smtp_server = 'smtp.qq.com'
    smtp_port = 465

    # The context manager closes the connection when the block exits.
    with smtplib.SMTP_SSL(smtp_server, smtp_port) as stmp:
        stmp.login(from_addr, qqCode)
        # Assemble the message body and headers.
        email_message = email_message + ("點(diǎn)擊查看: " + url) + error_message
        print(email_message)
        message = MIMEText(email_message, 'plain', 'utf-8')
        message['From'] = Header("EYE", 'utf-8')
        message['To'] = Header("boss", 'utf-8')
        message['Subject'] = Header('Keep_an_eye_on', 'utf-8')
        try:
            stmp.sendmail(from_addr, to_addrs, message.as_string())
        except (smtplib.SMTPException, OSError) as e:
            print('郵件發(fā)送失敗--' + str(e))
        else:
            # Report success only when sendmail did not raise.
            print('郵件發(fā)送成功')
if __name__ == '__main__':
    # Activity page to crawl.
    url = 'https://music.163.com/#/user/event?id=XXXXXXXXX'
    # Scrape and persist first, then notify with whatever keep() reported.
    flag, email_message, error_message = keep(url)
    mail(
        email_message=email_message,
        url=url,
        error_message=error_message,
        flag=flag,
    )
"""整體有三大模塊:
1. 爬取模塊,使用 selenium 爬取網易雲動態上的基本信息
2. 入庫模塊,使用 pymysql 將爬取到的信息存入數據庫
3. 郵件模塊,使用 smtp 將數據發送到用戶郵箱以提醒
4. 需要添加一個日誌模塊,保證服務持久運行,報錯有據可循
"""