使用python+selenium爬取qq空間好友動態
分析過程如下:
打開qq空間網址:https://qzone.qq.com/ ,內容如下:
要想用selenium登陸qq空間,必須點擊賬號密碼登陸按鈕,然後再填寫賬號密碼登陸。
點擊賬號密碼按鈕後跳轉到如下頁面:
以上過程實現代碼:
# Path to the chromedriver binary; it must match the installed Chrome version.
chrome_driver = r'E:\迅雷下載\chromedriver_win32\chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_driver)
driver.get('https://qzone.qq.com/')

# Switch the driver into the frame that holds the login form.
login_frame = driver.find_element_by_id('login_frame')
driver.switch_to.frame(login_frame)

# Switch to the account/password input view.
password_login_tab = driver.find_element_by_id('switcher_plogin')
password_login_tab.click()
接下來就是輸入賬號、密碼,點擊登陸。
代碼如下:
# Clear and fill the account ('u') and password ('p') inputs in turn.
credentials = (
    ('u', '****'),  # put the account here
    ('p', '****'),  # put the password here
)
for field_id, text in credentials:
    driver.find_element_by_id(field_id).clear()
    driver.find_element_by_id(field_id).send_keys(text)

# Submit the login form.
driver.find_element_by_id('login_button').click()
# Wait three seconds so the browser can finish loading.
time.sleep(3)
登陸過後就進入了qq空間,但有可能不是好友動態頁面,這時就需要用selenium來模擬點擊跳轉到好友動態頁面:
代碼如下:
# Click the friends' feed tab, then give the page three seconds to load.
friend_feed_tab = driver.find_element_by_xpath('//*[@id="tab_menu_friend"]/div[3]')
friend_feed_tab.click()
time.sleep(3)
這時我們就進入了qq空間好友動態頁面,但是我發現好友動態是頁面局部刷新加載出來的,所以要去查找動態加載文件。經過查找,我發現動態加載信息存放在feeds3_html_more...文件下。
使用代碼直接獲取這個頁面會報錯,因為這個頁面不僅需要登陸,而且它請求地址中的g_tk查詢字符串還是通過加密構造的。其中有兩個字段非常關鍵,一個是begintime,還有一個是加密得到的g_tk。
begintime這個字段是你的動態請求中第一條動態的上一條動態的發布時間的時間戳。
g_tk這個字段是在jQuery中加密的字段。
jQuery中加密代碼如下:
// Derives the g_tk token for a request URL from a login cookie value.
// The result is masked to a non-negative 31-bit integer.
getACSRFToken:function(url) {
url = QZFL.util.URI(url);
var skey;
if (url) {
// Host contains "qzone.qq.com" (not at index 0): read the "p_skey" cookie.
if (url.host && url.host.indexOf("qzone.qq.com") > 0) {
try {
skey = QZONE.FP._t.QZFL.cookie.get("p_skey");
} catch (err) {
// If the QZONE.FP lookup throws, read the cookie through QZFL directly.
skey = QZFL.cookie.get("p_skey");
}
} else {
// Any other host containing "qq.com": read the "skey" cookie.
if (url.host && url.host.indexOf("qq.com") > 0) {
skey = QZFL.cookie.get("skey");
}
}
}
// Nothing found yet: try p_skey, then skey, then rv2, else the empty string.
if (!skey) {
skey = QZFL.cookie.get("p_skey") || (QZFL.cookie.get("skey") || (QZFL.cookie.get("rv2") || ""));
}
// Seed 5381; each character adds (hash << 5) plus its char code to the hash.
var hash = 5381;
for (var i = 0, len = skey.length;i < len;++i) {
hash += (hash << 5) + skey.charCodeAt(i);
}
// Keep only the low 31 bits.
return hash & 2147483647;
為了獲取g_tk,首先是要獲取登陸過後的cookies。
代碼如下:
# Collect the browser's cookies into a {name: value} dictionary.
cookie_dict = {}
for cookie in driver.get_cookies():
    cookie_dict[cookie['name']] = cookie['value']
# Serialise them as "name1=value1; name2=value2; " (trailing separator kept).
cookie_str = ''.join(name + '=' + value + '; ' for name, value in cookie_dict.items())
用python實現(xiàn)的加密代碼如下:
# -*- coding: UTF-8 -*-
import re
class GetGTK(object):
    """Compute the Qzone ``g_tk`` request token from a login cookie string.

    The token is derived from the first non-empty cookie among ``p_skey``,
    ``skey`` and ``rv2`` (in that order), folded into a 31-bit hash.
    """

    def __init__(self, cookiestr):
        """Store the raw cookie string.

        Args:
            cookiestr: Cookies formatted as ``"name1=value1; name2=value2"``.
        """
        self.cookieStr = cookiestr
        self.p_skey = None
        self.skey = None
        self.rv2 = None

    def _cookie_value(self, name):
        """Return the value of cookie *name*, or ``None`` when it is absent."""
        # Anchor on the start of the string or a ';' separator so that a
        # lookup of "skey" can never match the tail of "p_skey".
        pattern = r'(?:^|;)\s*' + re.escape(name) + r'=([^;]*)'
        found = re.search(pattern, self.cookieStr or '')
        return found.group(1) if found else None

    def getNewGTK(self):
        """Return the token for the cookies extracted by :meth:`handler`.

        Returns:
            int: A value in ``[0, 2**31 - 1]``. With no usable cookie the
            hash of the empty string (5381) is returned.
        """
        # Fall back to '' when no cookie is available instead of failing.
        skey = self.p_skey or self.skey or self.rv2 or ''
        token = 5381
        for char in skey:
            token += (token << 5) + ord(char)
        # Keep only the low 31 bits.
        return token & 2147483647

    def handler(self):
        """Extract ``p_skey``, ``skey`` and ``rv2`` from the cookie string."""
        self.p_skey = self._cookie_value('p_skey')
        self.skey = self._cookie_value('skey')
        self.rv2 = self._cookie_value('rv2')

    def run(self):
        """Parse the cookie string and return the resulting ``g_tk``."""
        self.handler()
        return self.getNewGTK()
if __name__ == '__main__':
    # Replace the placeholder with the cookie string captured after login.
    cookiestr = "cookies"
    g_tk = GetGTK(cookiestr).run()
    print(g_tk)
獲取begintime。
代碼如下:
# Take the last feed <li> on the page and read the fifth '_'-separated field
# of its id.
feed_items = driver.find_elements_by_xpath('//*[@id="feed_friend_list"]//li[@class="f-single f-s-s"]')
last_feed_id = feed_items.pop().get_attribute('id')
basetime = last_feed_id.split('_')[4]
獲取begintime可以直接在id裡面獲取,id中包含了發布動態的時間戳。
有了begintime和g_tk後,我們就可以組裝url了,然後就可以用requests加上cookies信息請求url,就可以獲取到空間好友動態了。
# Build the feed request URL from the timestamp taken from the last feed item
# (`basetime`) and the computed g_tk token.
url = 'https://user.qzone.qq.com/proxy/domain/ic2.qzone.qq.com/cgi-bin/feeds/feeds3_html_more?uin=1392853401&begintime={}&g_tk={}'.format(basetime, g_tk)
# Send the request with the logged-in cookies. The URL built above is stored
# in `url`; `base_url` is not defined anywhere in this script.
res = requests.get(url, cookies=cookie_dict)
print(res.content.decode())
獲取的結果如下
再在瀏覽器中請求這個url,得到結果如下
發現用代碼抓取的空間動態信息正確,接下來就是用一般的數據處理方法來清洗數據(xpath、re或者beautifulsoup)。要注意的是,構造下一個請求的begintime要用到上一個請求結果中最後一條消息的發布時間的時間戳。例如下圖中最後一個動態的發布時間戳為1563797364。
下一個Ajax請求的begintime就是1563797364。
這樣就可以構造連續的請求來獲取空間好友動態消息。
最后附上源代碼:
from selenium import webdriver
import time
import requests
# 導(dǎo)入密鑰構(gòu)造類
from get_g_tk import GetGTK
from lxml import etree
import demjson
import pymongo
# MongoDB handles used by the scraper: a client for the local server, plus the
# 'QQDongTaiInfo' database and its 'QQDongTaiInfo' collection.
myclient = pymongo.MongoClient('mongodb://localhost:27017/')
mydb = myclient.get_database('QQDongTaiInfo')
mycollection = mydb.get_collection('QQDongTaiInfo')
class GetQQDongTaiInfo(object):
    """Log in to Qzone with Selenium and save friends' feed entries to MongoDB.

    The feed entries already rendered in the page are parsed first; further
    pages are then fetched with ``requests`` using the browser's cookies and
    the ``g_tk`` token derived from them.
    """

    # Path to the chromedriver binary; it must match the installed Chrome.
    chrome_driver = r'E:\迅雷下載\chromedriver_win32\chromedriver.exe'

    def __init__(self, username, password):
        """Start Chrome and remember the credentials.

        Args:
            username: QQ account; also used as the ``uin`` query parameter.
            password: QQ password.
        """
        self.driver = webdriver.Chrome(executable_path=GetQQDongTaiInfo.chrome_driver)
        self.cookies = {}
        self.username = username
        self.password = password
        self.base_url = 'https://user.qzone.qq.com/proxy/domain/ic2.qzone.qq.com/cgi-bin/feeds/feeds3_html_more?uin={}&begintime={}&g_tk={}'
        # g_tk is the token the site's JavaScript derives from the login cookies.
        self.g_tk = None
        self.begintime = None

    def _read_cookies(self):
        """Return the browser's current cookies as a ``{name: value}`` dict."""
        return {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()}

    def login_qq_zone(self):
        """Log in with account/password and open the friends' feed tab."""
        driver = self.driver
        driver.get('https://qzone.qq.com/')
        # Switch into the frame that holds the login form.
        driver.switch_to.frame(driver.find_element_by_id('login_frame'))
        # Switch to the account/password input view.
        driver.find_element_by_id('switcher_plogin').click()
        # Clear and fill the account ('u') and password ('p') inputs.
        for field_id, text in (('u', self.username), ('p', self.password)):
            field = driver.find_element_by_id(field_id)
            field.clear()
            field.send_keys(text)
        # Submit the form, then wait for the page to load.
        driver.find_element_by_id('login_button').click()
        time.sleep(3)
        driver.find_element_by_xpath('//*[@id="tab_menu_friend"]/div[3]').click()
        time.sleep(3)
        self.cookies = self._read_cookies()

    def get_static_html_info(self):
        """Parse the feed entries already present in the page and save them.

        Also sets ``self.begintime`` (from the id of the last feed item),
        refreshes ``self.cookies`` and computes ``self.g_tk``.

        Raises:
            IndexError: If the page contains no feed item.
        """
        page_source = self.driver.page_source
        feed_items = self.driver.find_elements_by_xpath(
            '//*[@id="feed_friend_list"]//li[@class="f-single f-s-s"]')
        # The fifth '_'-separated field of the item's id is used as begintime.
        self.begintime = feed_items[-1].get_attribute('id').split('_')[4]
        html = etree.HTML(page_source)
        for feed in html.xpath('//li[@class="f-single f-s-s"]'):
            # Build a fresh dict per entry so entries never share state.
            single_info = {
                # Feed text
                'content': feed.xpath(".//div[starts-with(@id,'feed_')]/div[@class='f-info']/text()"),
                # Publisher name
                'publisher_name': feed.xpath(".//a[contains(@class,'f-name')]/text()"),
                # Publish timestamp
                'push_date': feed.xpath(".//*[starts-with(@id,'hex_')]/i/@data-abstime"),
                # View count
                'view_count': feed.xpath(".//a[contains(@class,'state qz_feed_plugin')]/text()"),
                # Comments
                'comments-content': feed.xpath(".//div[@class='comments-content']//text()"),
                # Like count
                'like': feed.xpath(".//span[@class='f-like-cnt']/text()"),
            }
            self.save_to_mongdb(single_info)
        self.cookies = self._read_cookies()
        cookie_str = ''.join(
            name + '=' + value + '; ' for name, value in self.cookies.items())
        self.g_tk = GetGTK(cookie_str).run()

    def get_dynamic_info(self, max_pages=None, max_retries=3):
        """Fetch further feed pages over HTTP and save every entry.

        Pages are requested one after another; each request uses the publish
        timestamp of the last entry of the previous page as ``begintime``.
        The loop is iterative, so long runs cannot hit the recursion limit.

        Args:
            max_pages: Stop after this many successfully parsed pages;
                ``None`` continues until no more entries are returned.
            max_retries: Consecutive responses without the expected data
                that are tolerated before giving up.
        """
        pages_fetched = 0
        failures = 0
        while max_pages is None or pages_fetched < max_pages:
            requests_url = self.base_url.format(self.username, self.begintime, self.g_tk)
            print(requests_url)
            res = requests.get(requests_url, cookies=self.cookies, timeout=10).content.decode()
            # Drop the 10-character prefix and 3-character suffix around the
            # payload (presumably a JSONP-style callback wrapper; confirm
            # against a live response).
            res_dict = demjson.decode(res[10: -3])
            try:
                res_datas = res_dict['data']['data']
            except (KeyError, TypeError):
                # The expected data is missing: retry a bounded number of times.
                failures += 1
                if failures > max_retries:
                    break
                time.sleep(1)
                continue
            failures = 0
            pages_fetched += 1
            res_datas = [temp for temp in res_datas if isinstance(temp, dict)]
            if not res_datas:
                # Nothing left to page through.
                break
            for temp in res_datas:
                html = etree.HTML(temp['html'])
                single_info = {
                    # Feed text
                    'content': html.xpath("//div[@class='f-info']/text()"),
                    # Publisher name
                    'publisher_name': temp['nickname'],
                    # Publish timestamp
                    'push_date': temp['abstime'],
                    # View count
                    'view_count': html.xpath("//a[@class='state qz_feed_plugin']/text()"),
                    # Comments
                    'comments-content': html.xpath("//div[@class='comments-content']//text()"),
                    # Like count
                    'like': html.xpath(".//span[@class='f-like-cnt']/text()"),
                }
                self.save_to_mongdb(single_info)
            next_begintime = res_datas[-1]['abstime']
            if str(next_begintime) == str(self.begintime):
                # The cursor did not advance; stop instead of refetching the
                # same page forever.
                break
            self.begintime = next_begintime

    def save_to_mongdb(self, single_info):
        """Insert *single_info* unless an entry with the same push_date exists."""
        # find_one is used because Cursor.count() is unavailable in PyMongo 4.
        if mycollection.find_one({'push_date': single_info['push_date']}) is None:
            # Insert a copy so the caller's dict is left unmodified.
            mycollection.insert_one(single_info.copy())
            print('插入成功')
        else:
            print('插入失敗')

    def run(self):
        """Log in, save the rendered feed entries, then page through the rest."""
        try:
            self.login_qq_zone()
            self.get_static_html_info()
        finally:
            # The remaining pages are fetched with requests, so the browser
            # can be released here, also when login or parsing fails.
            self.driver.quit()
        self.get_dynamic_info()
if __name__ == "__main__":
    # Fill in the QQ account and password before running.
    username = '***'
    password = '***'
    Demo = GetQQDongTaiInfo(username, password)
    Demo.run()
結果保存在了MongoDB數據庫中,結果如下:
以上就是用selenium+python獲取qq空間好友動態的全部流程,謝謝瀏覽。