早些日子有人問我我的微信里面有一共多少朋友，我就隨后拉到了通訊錄最下面就找到了微信一共有多少位好友。然后他又問我，這里面你認識多少人？
這一句話問的我很無語。一千多個好友我真的不知道認識的人有多少。他還緊追著不放了，你知道你微信朋友的男女比例嘛？你知道你微信朋友大部分來自什么地方嗎？
以下的代碼內容只涉及一些簡單的Python知識，稍微有一點Python知識的朋友都可以讀下去。如果你沒有Python的知識你可能需要去學習一下Python，當然你也可以不用學，搭建好Python的環境就好，期間可能需要用到一些庫需要自己去解決一下，在下文中也會詳細敘述。
第一步:首先抓取微信朋友的資料
既然是要做統計和分析，第一步就是把微信朋友的所有可以抓取的資料抓取出來。所謂有用的資料大致來說有以下幾個內容：
昵稱、微信號、城市、性別、星標好友、頭像、個性簽名、備注
每一項或者聯合項可以做的統計
性別：好友性別統計
城市：好友地區分布
備注+昵稱：大致統計認識的好友比例
頭像:人臉識別
那么如何抓取呢？這里使用了之前有一位大神寫的如何找出被刪的好友的代碼，修改部分為從提取json數據截斷，對返回的json數據進行提取，分別找到了以下的所需要的信息：
小編給大家推薦一個學習氛圍超好的地方，python交流企鵝裙：【611+530+101】適合在校大學生，小白，想轉行，想通過這個找工作的加入。裙里有大量學習資料，有大神解答交流問題，每晚都有免費的直播課程。
代碼修改為:
#!/usr/bin/env python
# encoding=utf-8
from __future__ import print_function
import os
import requests
import re
import time
import xml.dom.minidom
import json
import sys
import math
import subprocess
import ssl
import threading
import urllib,urllib2
# Module-wide configuration and the login/session state shared (via ``global``)
# by the functions in this script.
DEBUG = False  # enables extra diagnostics in the request helpers

MAX_GROUP_NUM = 2  # people per group
# Interval between interface calls. Too short an interval tends to trigger
# "operation too frequent", after which operations are restricted for roughly
# half an hour.
INTERFACE_CALLING_INTERVAL = 5
MAX_PROGRESS_LEN = 50

# Where the downloaded login QR code is stored.
QRImagePath = os.path.join(os.getcwd(), 'qrcode.jpg')

# Login handshake state.
tip = 0
uuid = ''

# Endpoints discovered during login.
base_uri = ''
redirect_uri = ''
push_uri = ''

# Session credentials filled in by login().
skey = ''
wxsid = ''
wxuin = ''
pass_ticket = ''

deviceId = 'e000000000000000'

# Data filled in by the web API calls.
BaseRequest = {}
ContactList = []
My = []
SyncKey = []

try:
    # Python 2: use the lazy xrange wherever this module calls range().
    range = xrange
except NameError:
    # Python 3: xrange does not exist; the builtin range is kept.
    pass
def responseState(func, BaseResponse):
    """Tell whether an API call succeeded, logging failures.

    Args:
        func: Name of the API call, used only in the log line.
        BaseResponse: Mapping with the ``'ErrMsg'`` and ``'Ret'`` entries of
            the server reply.

    Returns:
        True when ``Ret`` equals 0, False otherwise. The log line is printed
        on failure, and also on success when ``DEBUG`` is set.
    """
    err_msg = BaseResponse['ErrMsg']
    ret_code = BaseResponse['Ret']
    succeeded = ret_code == 0
    if DEBUG or not succeeded:
        print('func: %s, Ret: %d, ErrMsg: %s' % (func, ret_code, err_msg))
    return succeeded
def getUUID():
    """Request a fresh login UUID and store it in the module-level ``uuid``.

    Returns:
        True when the server reported code ``200``; False for any other code
        or when the reply does not have the expected form.
    """
    global uuid
    url = 'https://login.weixin.qq.com/jslogin'
    params = {
        'appid': 'wx782c26e4c19acffb',
        'fun': 'new',
        'lang': 'zh_CN',
        '_': int(time.time()),
    }
    response = myRequests.get(url=url, params=params)
    response.encoding = 'utf-8'
    data = response.text
    # Expected reply:
    #   window.QRLogin.code = 200; window.QRLogin.uuid = "oZwt_bFfRg==";
    # The character classes need their backslashes (\d, \S); without them the
    # pattern looks for the literal letters "d" and "S" and never matches.
    pattern = r'window\.QRLogin\.code = (\d+); window\.QRLogin\.uuid = "(\S+?)"'
    match = re.search(pattern, data)
    if match is None:
        return False
    code, uuid = match.groups()
    return code == '200'
def showQRImage():
    """Download the login QR code, save it to ``QRImagePath`` and open it.

    Sets the module-level ``tip`` to 1 once the image has been requested.
    The image is opened with ``open`` on macOS, ``os.startfile`` on Windows
    and ``xdg-open`` everywhere else.
    """
    global tip
    url = 'https://login.weixin.qq.com/qrcode/' + uuid
    params = {
        't': 'webwx',
        '_': int(time.time()),
    }
    response = myRequests.get(url=url, params=params)
    tip = 1
    # The context manager closes the file even if the write fails.
    with open(QRImagePath, 'wb') as qr_file:
        qr_file.write(response.content)
    time.sleep(1)
    if sys.platform.find('darwin') >= 0:
        subprocess.call(['open', QRImagePath])
    elif sys.platform.startswith('win'):
        # xdg-open does not exist on Windows.
        os.startfile(QRImagePath)
    else:
        subprocess.call(['xdg-open', QRImagePath])
    print('請使用微信掃描二維碼以登錄')
def waitForLogin():
    """Poll the login endpoint once and update the login state.

    On code ``201`` (QR code scanned) ``tip`` is reset to 0. On code ``200``
    (logged in) ``redirect_uri``, ``base_uri`` and ``push_uri`` are filled in
    and, on macOS, the Preview app showing the QR code is asked to quit.

    Returns:
        The status code reported by the server, as a string.
    """
    global tip, base_uri, redirect_uri, push_uri
    url = 'https://login.weixin.qq.com/cgi-bin/mmwebwx-bin/login?tip=%s&uuid=%s&_=%s' % (
        tip, uuid, int(time.time()))
    response = myRequests.get(url=url)
    response.encoding = 'utf-8'
    data = response.text
    # Expected reply, e.g.:  window.code=500;
    # \d needs its backslash, otherwise the pattern matches the letter "d".
    code = re.search(r'window\.code=(\d+);', data).group(1)
    if code == '201':  # scanned
        print('成功掃描,請?jiān)谑謾C(jī)上點(diǎn)擊確認(rèn)以登錄')
        tip = 0
    elif code == '200':  # logged in
        print('正在登錄...')
        match = re.search(r'window\.redirect_uri="(\S+?)";', data)
        redirect_uri = match.group(1) + '&fun=new'
        base_uri = redirect_uri[:redirect_uri.rfind('/')]
        # Mapping from base_uri host to push host. Order matters: the first
        # entry whose key occurs in base_uri wins, and 'qq.com' is itself a
        # substring of 'wx2.qq.com'.
        services = [
            ('wx2.qq.com', 'webpush2.weixin.qq.com'),
            ('qq.com', 'webpush.weixin.qq.com'),
            ('web1.wechat.com', 'webpush1.wechat.com'),
            ('web2.wechat.com', 'webpush2.wechat.com'),
            ('wechat.com', 'webpush.wechat.com'),
            ('web1.wechatapp.com', 'webpush1.wechatapp.com'),
        ]
        push_uri = base_uri
        for (searchUrl, pushUrl) in services:
            if base_uri.find(searchUrl) >= 0:
                push_uri = 'https://%s/cgi-bin/mmwebwx-bin' % pushUrl
                break
        # Close the QR image viewer (macOS with Preview). Passing an argument
        # list avoids the nested-quote problem of a shell command string.
        if sys.platform.find('darwin') >= 0:
            subprocess.call(['osascript', '-e', 'quit app "Preview"'])
    # '408' (timeout) and every other code: nothing to update here.
    return code
def login():
    """Fetch the redirect page and extract the session credentials.

    The reply is parsed as XML; the text of the root's ``skey``, ``wxsid``,
    ``wxuin`` and ``pass_ticket`` children is stored in the module-level
    variables of the same names, and ``BaseRequest`` is rebuilt from them.

    Returns:
        True when all four credentials are non-empty, False otherwise (in
        which case ``BaseRequest`` is left untouched).
    """
    global skey, wxsid, wxuin, pass_ticket, BaseRequest
    response = myRequests.get(url=redirect_uri)
    response.encoding = 'utf-8'
    document = xml.dom.minidom.parseString(response.text)
    for child in document.documentElement.childNodes:
        tag = child.nodeName
        if tag == 'skey':
            skey = child.childNodes[0].data
        elif tag == 'wxsid':
            wxsid = child.childNodes[0].data
        elif tag == 'wxuin':
            wxuin = child.childNodes[0].data
        elif tag == 'pass_ticket':
            pass_ticket = child.childNodes[0].data
    credentials = (skey, wxsid, wxuin, pass_ticket)
    if all(credentials):
        BaseRequest = {
            'Uin': int(wxuin),
            'Sid': wxsid,
            'Skey': skey,
            'DeviceID': deviceId,
        }
        return True
    return False
def webwxinit():
    """Call ``webwxinit`` and store the initial account data.

    Fills the module-level ``ContactList``, ``My`` and ``SyncKey`` from the
    JSON reply. With ``DEBUG`` set, the raw reply is also written to
    ``webwxinit.json`` in the working directory.

    Returns:
        The result of ``responseState`` for the reply's ``BaseResponse``.
    """
    global ContactList, My, SyncKey
    url = base_uri + '/webwxinit?pass_ticket=%s&skey=%s&r=%s' % (
        pass_ticket, skey, int(time.time()))
    params = {'BaseRequest': BaseRequest}
    headers = {'content-type': 'application/json; charset=UTF-8'}
    response = myRequests.post(url=url, data=json.dumps(params), headers=headers)
    response.encoding = 'utf-8'
    data = response.json()
    if DEBUG:
        # The context manager closes the dump file even if the write fails.
        with open(os.path.join(os.getcwd(), 'webwxinit.json'), 'wb') as dump:
            dump.write(response.content)
    ContactList = data['ContactList']
    My = data['User']
    SyncKey = data['SyncKey']
    return responseState('webwxinit', data['BaseResponse'])
# UserName values of built-in/system accounts that are not real friends.
_SPECIAL_USERS = frozenset([
    "newsapp", "fmessage", "filehelper", "weibo", "qqmail", "tmessage",
    "qmessage", "qqsync", "floatbottle", "lbsapp", "shakeapp", "medianote",
    "qqfriend", "readerapp", "blogapp", "facebookapp", "masssendapp",
    "meishiapp", "feedsapp", "voip", "blogappweixin", "weixin",
    "brandsessionholder", "weixinreminder", "wxid_novlwrv3lqwv11",
    "gh_22b87fa7cb3c", "officialaccounts", "notification_messages", "wxitil",
    "userexperience_alarm",
])


def _is_excluded_contact(member):
    """Return True when *member* should be dropped from the friend list."""
    return (
        member['VerifyFlag'] & 8 != 0               # official / service account
        or member['UserName'] in _SPECIAL_USERS      # special account
        or member['UserName'].find('@@') != -1       # group chat
        or member['UserName'] == My['UserName']      # the logged-in user
    )


def webwxgetcontact():
    """Fetch the contact list and return only ordinary friends.

    Official/service accounts, special accounts, group chats and the
    logged-in user are filtered out. With ``DEBUG`` set, the raw reply is
    also written to ``webwxgetcontact.json`` in the working directory.

    Returns:
        A list of the remaining member dicts, in the order the server sent
        them.
    """
    url = base_uri + '/webwxgetcontact?pass_ticket=%s&skey=%s&r=%s' % (
        pass_ticket, skey, int(time.time()))
    headers = {'content-type': 'application/json; charset=UTF-8'}
    response = myRequests.post(url=url, headers=headers)
    response.encoding = 'utf-8'
    data = response.json()
    if DEBUG:
        # The context manager closes the dump file even if the write fails.
        with open(os.path.join(os.getcwd(), 'webwxgetcontact.json'), 'wb') as dump:
            dump.write(response.content)
    # One filtering pass replaces the reverse index loop with list.remove(),
    # which rescanned the list for every removed entry.
    return [member for member in data['MemberList']
            if not _is_excluded_contact(member)]
def syncKey():
    """Serialize the module-level ``SyncKey`` for use as a query parameter.

    Returns:
        The ``Key_Val`` pairs of ``SyncKey['List']`` joined with ``|``.
    """
    return '|'.join(
        '%s_%s' % (entry['Key'], entry['Val']) for entry in SyncKey['List']
    )
def syncCheck():
    """Ask the push server whether there is anything new to sync.

    Returns:
        The ``selector`` value from the reply, as a string of digits.
    """
    url = push_uri + '/synccheck?'
    params = {
        'skey': BaseRequest['Skey'],
        'sid': BaseRequest['Sid'],
        'uin': BaseRequest['Uin'],
        'deviceId': BaseRequest['DeviceID'],
        'synckey': syncKey(),
        'r': int(time.time()),
    }
    response = myRequests.get(url=url, params=params)
    response.encoding = 'utf-8'
    data = response.text
    # Expected reply:  window.synccheck={retcode:"0",selector:"2"}
    # \d needs its backslash, otherwise the pattern matches the letter "d";
    # the braces and dot are escaped so they are taken literally.
    pattern = r'window\.synccheck=\{retcode:"(\d+)",selector:"(\d+)"\}'
    match = re.search(pattern, data)
    return match.group(2)
def webwxsync():
    """Call ``webwxsync`` and store the new sync key.

    Updates the module-level ``SyncKey`` from the JSON reply.

    Returns:
        The result of ``responseState`` for the reply's ``BaseResponse``.
    """
    global SyncKey
    # quote_plus moved to urllib.parse in Python 3.
    try:
        from urllib import quote_plus  # Python 2
    except ImportError:
        from urllib.parse import quote_plus  # Python 3
    url = base_uri + '/webwxsync?lang=zh_CN&skey=%s&sid=%s&pass_ticket=%s' % (
        BaseRequest['Skey'], BaseRequest['Sid'], quote_plus(pass_ticket))
    params = {
        'BaseRequest': BaseRequest,
        'SyncKey': SyncKey,
        'rr': ~int(time.time()),
    }
    headers = {'content-type': 'application/json; charset=UTF-8'}
    # The JSON content-type header was built but never sent; send it, as the
    # other JSON POSTs in this module do.
    response = myRequests.post(url=url, data=json.dumps(params), headers=headers)
    response.encoding = 'utf-8'
    data = response.json()
    SyncKey = data['SyncKey']
    return responseState('webwxsync', data['BaseResponse'])
def heartBeatLoop():
    """Run forever: check for updates once a second and sync when needed.

    ``webwxsync`` is called whenever ``syncCheck`` returns a selector other
    than ``'0'``. The function never returns.
    """
    while True:
        if syncCheck() != '0':
            webwxsync()
        time.sleep(1)
def _text_or(value, placeholder):
    """Return *value*, or *placeholder* when *value* is the empty string."""
    return placeholder if value == '' else value


def main(image_dir='/root/Desktop/friendImage'):
    """Log in to WeChat web, download every friend's avatar and print details.

    Args:
        image_dir: Directory the avatars are written to, as
            ``image<N>.jpg`` with N counting from 1. Created when missing.

    For every friend one line is printed with nickname, city, sex, star flag,
    signature, remark, alias and nickname again, separated by ``^+*+^``;
    empty text fields are replaced by ``no...`` placeholders.
    """
    global myRequests
    # SECURITY NOTE: this swaps the ssl module's default HTTPS context factory
    # for the unverified one, i.e. it turns off certificate verification
    # wherever that default is used.
    if hasattr(ssl, '_create_unverified_context'):
        ssl._create_default_https_context = ssl._create_unverified_context
    headers = {'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.125 Safari/537.36'}
    myRequests = requests.Session()
    myRequests.headers.update(headers)

    if not getUUID():
        print('獲取uuid失敗')
        return
    print('正在獲取二維碼圖片...')
    showQRImage()
    while waitForLogin() != '200':
        pass
    os.remove(QRImagePath)
    if not login():
        print('登錄失敗')
        return
    if not webwxinit():
        print('初始化失敗')
        return

    MemberList = webwxgetcontact()

    # The heartbeat thread was constructed but never started. It is a daemon
    # so that its endless loop does not keep the process alive after main()
    # returns.
    heartbeat = threading.Thread(target=heartBeatLoop)
    heartbeat.daemon = True
    heartbeat.start()

    MemberCount = len(MemberList)
    print('通訊錄共%s位好友' % MemberCount)

    # Opening the avatar files fails when the target directory is missing.
    if not os.path.isdir(image_dir):
        os.makedirs(image_dir)

    for imageIndex, Member in enumerate(MemberList, 1):
        imagePath = os.path.join(image_dir, 'image' + str(imageIndex) + '.jpg')
        imageUrl = 'https://wx.qq.com' + Member['HeadImgUrl']
        response = myRequests.get(url=imageUrl, headers=headers)
        with open(imagePath, 'wb') as fileImage:
            fileImage.write(response.content)
        print('正在下載第:'+str(imageIndex)+'位好友頭像')

        city = _text_or(Member['City'], 'nocity')
        name = _text_or(Member['NickName'], 'noname')
        sign = _text_or(Member['Signature'], 'nosign')
        remark = _text_or(Member['RemarkName'], 'noremark')
        alias = _text_or(Member['Alias'], 'noalias')
        nick = _text_or(Member['NickName'], 'nonick')
        print(name,' ^+*+^ ',city,' ^+*+^ ',Member['Sex'],' ^+*+^ ',Member['StarFriend'],' ^+*+^ ',sign,' ^+*+^ ',remark,' ^+*+^ ',alias,' ^+*+^ ',nick )
if __name__ == '__main__':
    main()
    print('回車鍵退出...')
    # Wait for Enter before exiting. On Python 2 the builtin input()
    # evaluates what was typed, so a bare Enter raises SyntaxError; use
    # raw_input there and fall back to input on Python 3.
    try:
        wait_for_enter = raw_input
    except NameError:
        wait_for_enter = input
    wait_for_enter()
所返回的json結果如下圖所示
昵稱、微信號、城市、性別、星標好友、頭像、個性簽名、備注。提取以上信息，對頭像圖片進行下載，并對數據進行簡單的清洗等等，最后一列為微信號不方便顯示。
第二步：性別統計和地區分布
使用python的pandas科學計算庫進行簡單的統計，如果你沒有用過，可以轉至如下鏈接進行安裝學習：【原】十分鐘搞定pandas
只要掌握了非常簡單的pandas知識就可以繼續往下看，做以下統計
(1)、所有好友的男女比例
(2)、所有好友的城市分布
(3)、統計認識的朋友以及占所有朋友的百分比
統計方法：所有朋友 - 沒有備注的朋友 - 備注與昵稱相同的朋友
(4)、統計認識的朋友中的男女比例
統計方法：對三的結果再進行男女劃分即可得到結果
把結果做成簡單的圖表（主要使用了百度的 echarts 作圖）
使用地圖慧江蘇省好友分布，這個編碼我不知怎么回事，可能是瀏覽器問題，回頭我用其它瀏覽器查看一下。
最后再生成省份好友分布地圖
最后運用opencv的圖像識別進行人像識別，統計微信好友中用人像作為頭像的好友人數。OpenCV的全稱是：Open Source Computer Vision Library。OpenCV是一個基于BSD許可（開源）發行的跨平臺計算機視覺庫，可以運行在Linux、Windows和Mac OS操作系統上。它輕量級而且高效——由一系列 C 函數和少量 C++ 類構成，同時提供了Python、Ruby、MATLAB等語言的接口，實現了圖像處理和計算機視覺方面的很多通用算法。
如果你對opencv不是很了解，你可以按照以下的鏈接進行學習。
你可以去它的官網：http://opencv.org/ （需要有一定的英語知識）
國內也有一些比較好的博客資源，比如以下兩個
如下開始是對抓取的朋友頭像進行遍歷識別是否含有人臉，代碼如下。
#!/usr/bin/env python
'''
face detection using haar cascades
USAGE:
facedetect.py [--cascade <cascade_fn>] [--nested-cascade <cascade_fn>] [<video_source>]
'''
# Python 2/3 compatibility
from __future__ import print_function
import numpy as np
import cv2
# local modules
from video import create_capture
from common import clock, draw_str
def detect(img, cascade):
    """Run *cascade* on *img* and return the detected boxes.

    Args:
        img: Image handed to ``cascade.detectMultiScale``.
        cascade: Object providing ``detectMultiScale``.

    Returns:
        An empty list when nothing was detected; otherwise the detection
        array with its first two columns added onto the remaining columns
        (presumably turning x, y, w, h into x1, y1, x2, y2 — confirm against
        the OpenCV documentation).
    """
    boxes = cascade.detectMultiScale(
        img,
        scaleFactor=1.3,
        minNeighbors=4,
        minSize=(30, 30),
        flags=cv2.CASCADE_SCALE_IMAGE,
    )
    if len(boxes) == 0:
        return []
    boxes[:, 2:] += boxes[:, :2]
    return boxes
def draw_rects(img, rects, color):
    """Draw one rectangle on *img* per entry of *rects*.

    Args:
        img: Image passed to ``cv2.rectangle``.
        rects: Iterable of 4-item boxes, unpacked as two corner points.
        color: Color passed to ``cv2.rectangle``.
    """
    for left, top, right, bottom in rects:
        cv2.rectangle(img, (left, top), (right, bottom), color, 2)
if __name__ == '__main__':
    # Count how many of the downloaded avatar images contain a detected face.
    import getopt
    import sys

    print(__doc__)

    # Option parsing and cascade loading do not depend on the image, so they
    # are done once instead of on every loop iteration.
    args, video_src = getopt.getopt(sys.argv[1:], '', ['cascade=', 'nested-cascade='])
    try:
        video_src = video_src[0]
    except IndexError:
        video_src = 0
    args = dict(args)
    cascade_fn = args.get('--cascade', "../../data/haarcascades/haarcascade_frontalface_alt.xml")
    # --nested-cascade is still accepted on the command line, but the eye
    # cascade is not needed for counting: the face count used to be gated on
    # whether that unrelated cascade had loaded.
    cascade = cv2.CascadeClassifier(cascade_fn)
    if cascade.empty():
        sys.exit('failed to load face cascade: %s' % cascade_fn)

    count = 0
    for i in range(1, 1192):
        print(str(i))
        # NOTE(review): the avatar is only given as the *fallback* source;
        # with the default video_src of 0 create_capture presumably tries a
        # camera first — confirm against the local ``video`` module.
        cam = create_capture(video_src, fallback='synth:bg=../data/friend/friendImage/image'+str(i)+'.jpg:noise=0.05')
        ret, img = cam.read()
        if not ret or img is None:
            # A failed read would otherwise crash inside cv2.cvtColor.
            print('skipped: could not read image %d' % i)
            continue
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        gray = cv2.equalizeHist(gray)
        rects = detect(gray, cascade)
        if len(rects) == 0:
            print('none')
        else:
            count = count + 1
            # NOTE(review): the original indentation was lost; printing the
            # running count here, in the face-found branch, is an assumption.
            print(str(count))

    # Wait for Enter before exiting. On Python 2 the builtin input()
    # evaluates what was typed, so a bare Enter raises SyntaxError.
    try:
        wait_for_enter = raw_input
    except NameError:
        wait_for_enter = input
    wait_for_enter()
執行以上代碼統計出最后的結果
使用人像做頭像的好友：59，因此不使用人像的1133，看來使用人像的人還是很少的。
運行提取人像頭像的代碼最后提取出的頭像如下所示，不得不說Python的庫真是十分的有用。（因為涉及到隱私，所以這里不會展示過多的頭像）
最近仍然在研究簽名以及頭像的可用之處，也是歡迎大家一起學習交流。同時希望以上的內容可以提升一下大家的學習興趣。關于微信好友的更多挖掘會不斷進行。
(1)、人像頭像與年齡之間的關系（由于微信沒有年齡，于是想通過知乎進行推算）
(2)、個性簽名與年齡性格之間的關系
(3)、微信號中所包含信息推算年齡層次，預測當前微信號年齡