本文用到的包為
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from collections import defaultdict
from collections import Counter
from numpy import linalg as LA
import statsmodels.api as sm
import matplotlib.cm as cm
from datetime import datetime as dt
import sys
from os import listdir
from scipy.stats.stats import pearsonr
from matplotlib.dates import YearLocator
StackExchange(以下簡稱SE)是世界上最大的專業性問答社區之一。最早只有一個StackOverflow,后來慢慢發展出其他的問答社區,現在一共有一百多個社區。在這里可以看到所有社區。
[這個]問答給出了SE歷史數據的下載地址。本文給出對SE數據的初步處理示例。
首先定義一些數據處理函數:
def dailyQA(site, path='/Users/csid/Documents/bigdata/stackexchange/unzip/'):
    """Count the questions and answers created on each day for one site.

    Scans ``<path><site>/Posts.xml`` line by line with plain string
    splitting (no XML parser) and tallies rows by their ``PostTypeId``.

    Parameters
    ----------
    site : str
        Name of the site's directory below ``path``.
    path : str, optional
        Directory prefix holding the unzipped dumps. It is concatenated
        as-is, so it must end with a path separator. Defaults to the
        location used throughout this article.

    Returns
    -------
    defaultdict
        Maps a ``'YYYY-MM-DD'`` string to ``[n_type1, n_type2]``, i.e. the
        number of rows with PostTypeId "1" and "2" (used as questions and
        answers by the rest of this file). Days are only created when at
        least one such row exists.
    """
    F = defaultdict(lambda: [0, 0])
    filename = path + site + '/Posts.xml'
    with open(filename, 'r') as f:
        for line in f:
            try:
                # Read the whole quoted value: slicing a single character
                # would mistake PostTypeId="10" for type "1".
                label = line.split('PostTypeId=')[1].split('"')[1]
                day = line.split('CreationDate=')[1][1:11]
            except IndexError:
                # XML header/footer lines and rows lacking either attribute.
                continue
            if label == '1':
                F[day][0] += 1
            elif label == '2':
                F[day][1] += 1
    return F
# Draw one site's monthly growth band (questions vs. answers) on a shared axis.
def plotMonth(site,ax,col):
    """Shade the area between a site's two monthly count series.

    Daily counts are read from the module-level ``F[site]`` and summed per
    ``'YYYY-MM'`` month. The first and the last month are dropped, and
    nothing is drawn unless more than three months remain.

    Parameters
    ----------
    site : key into the module-level ``F`` mapping.
    ax : axes-like object providing ``fill_between`` and ``plot``.
    col : colour used for the filled band.
    """
    monthly = defaultdict(lambda: np.array([0, 0]))
    for day, counts in F[site].items():
        monthly[day[:7]] += np.array(counts)

    months = sorted(monthly)[1:-1]
    if len(months) <= 3:
        return

    # Column 0 / column 1 of the daily counts (questions / answers per dailyQA).
    first, second = np.array([monthly[m] for m in months]).T
    stamps = [dt.strptime(m, '%Y-%m') for m in months]
    ax.fill_between(stamps, first, second, color=col, alpha=0.1)
    # Faint white outlines separate overlapping bands.
    for series in (first, second):
        ax.plot(stamps, series, color="white", linestyle='-', marker='', alpha=0.1)
def plotMonthSpecial(site,ax,col):
    """Highlight one site's monthly count series with solid coloured lines.

    Daily counts are read from the module-level ``F[site]`` and summed per
    ``'YYYY-MM'`` month. The first two months and the last month are
    dropped. Both series are drawn in ``col`` and joined by a vertical
    line at the first remaining month.

    Parameters
    ----------
    site : key into the module-level ``F`` mapping.
    ax : axes-like object providing ``vlines`` and ``plot``.
    col : line colour.

    Notes
    -----
    Returns without drawing when no month is left after trimming; the
    unguarded version raised ``IndexError`` on ``mm[0]`` in that case.
    """
    M = defaultdict(lambda: np.array([0, 0]))
    for day, counts in F[site].items():
        M[day[:7]] += np.array(counts)
    ms = sorted(M)[2:-1]
    if not ms:
        return
    x, y = np.array([M[m] for m in ms]).T
    mm = [dt.strptime(m, '%Y-%m') for m in ms]
    ax.vlines(mm[0], x[0], y[0], color=col, linestyle='-')
    ax.plot(mm, x, color=col, linestyle='-', marker='')
    ax.plot(mm, y, color=col, linestyle='-', marker='')
通過下列代碼得到每個社區每天新增的問題和答案數
# Build F: site directory name -> per-day [questions, answers] counts.
path='/Users/csid/Documents/bigdata/stackexchange/unzip/'
# Keep only directory entries whose name ends in 'm'
# (presumably the '*.com' site folders -- confirm against the data layout).
sites = [name for name in listdir(path) if name.endswith('m')]
F={}
# enumerate() supplies the progress index directly; sites.index(i) rescanned
# the list on every iteration.
for idx, site_name in enumerate(sites):
    # flushPrint is defined outside this excerpt; presumably a progress printer.
    flushPrint(idx)
    F[site_name] = dailyQA(site_name)
好的可視化,需要層次分明,所以在繪制各個社區問答數量增長曲線時,往往需要排序來決定繪制的先后疊加順序。下列代碼將各個社區按照問題總數從多到少排序。
# Rank sites so the ones with the most questions are drawn first.
# S maps site -> (total of first daily count, total of second daily count),
# i.e. (questions, answers) as produced by dailyQA.
S = {}
for site_name in sites:
    first_counts, second_counts = zip(*F[site_name].values())
    S[site_name] = (sum(first_counts), sum(second_counts))
# Descending by the first total; sorted() is stable, so ties keep dict order.
rsites = sorted(S, key=lambda name: -S[name][0])
然后就可以繪制圖2了
每條帶子是一個社區,上界是每月新增答案數,下界是每月新增問題數。一共有110個社區。顏色代表社區的問題總數(取自然對數、減去5后取整)。我們還可以選擇性地標示出某些社區,例如本圖中標示出了物理類(藍色)和烹調類(深綠色)兩個社區。
繪制代碼為
# Figure: one translucent band per site, plus two highlighted sites.
fig = plt.figure(figsize=(12, 5),facecolor='white')
ax = plt.subplot(111)
years = YearLocator()
# pyplot.get_cmap replaces matplotlib.cm.get_cmap, which newer Matplotlib
# releases deprecated and then removed; the call signature is the same.
cmap = plt.get_cmap('PiYG', 10)
for i in rsites:
    # Colour index: natural log of the site's first total in S, minus 5,
    # truncated to an integer.
    c = int(np.log(S[i][0])-5)
    plotMonth(i,ax,cmap(c))
plotMonthSpecial('physics.stackexchange.com',ax,'RoyalBlue')
plotMonthSpecial('cooking.stackexchange.com',ax,'DarkOliveGreen')
ax.set_yscale('log')
ax.set_ylim(1,10**6)
ax.set_xlabel('Time')
ax.set_ylabel('Monthly increased N of Q&A')
ax.xaxis.set_major_locator(years)
# Stand-alone mappable so a colorbar can be drawn without an image artist.
smm = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=0, vmax=10))
# Public equivalent of the former private assignment `smm._A = []`.
smm.set_array([])
cbaxes = fig.add_axes([0.15, 0.85, 0.5, 0.015])
cbar = plt.colorbar(smm,cax=cbaxes,orientation='horizontal')
plt.show()
接下來,我們考慮使用節點到源和匯的流距離構造一個相空間,分析用戶在這個相空間中游走的軌跡產生的角度熵(把在兩個節點間的每一步跳躍合并到一個原點上,考察角度的分布)與社區可持續發展之間的關系。我們的假設是,用戶游走的熵越大,說明用戶越有創造性,對問答社區的長期發展也越有利。
首先要定義一系列函數
def userDailyAnswers(site, root=None):
    """Collect, per day and per user, the answers posted on one site.

    Scans ``<root><site>/Posts.xml`` line by line with plain string
    splitting and keeps only rows whose ``PostTypeId`` is "2".

    Parameters
    ----------
    site : str
        Name of the site's directory below the data directory.
    root : str, optional
        Directory prefix (must end with a path separator). When omitted,
        the module-level ``path`` is used, as before.

    Returns
    -------
    dict
        ``{date: {author_id: [(time, parent_id), ...]}}`` where ``date`` is
        ``'YYYY-MM-DD'``, ``time`` is ``'HH:MM:SS'`` and both ids are ints
        taken from ``OwnerUserId`` and ``ParentId``. Entries keep file order.
    """
    base = path if root is None else root
    filename = base + site + '/Posts.xml'
    C = {}
    with open(filename, 'r') as f:
        for line in f:
            try:
                # Read the whole quoted value so that e.g. "20" is not
                # mistaken for type "2".
                label = line.split('PostTypeId=')[1].split('"')[1]
                if label != '2':
                    continue
                stamp = line.split('CreationDate=')[1]
                date = stamp[1:11]
                time = stamp[12:20]
                author = int(line.split('OwnerUserId=')[1].split('"')[1])
                questionID = int(line.split('ParentId=')[1].split('"')[1])
            except (IndexError, ValueError):
                # Non-row lines, or answers lacking an owner/parent id.
                continue
            C.setdefault(date, {}).setdefault(author, []).append((time, questionID))
    return C
# calculate entropy of path angles
def entropy(G,O,K,T):
    """Return the Shannon entropy of the directions of a graph's edges.

    Every edge ``i -> j`` is treated as a displacement from
    ``(O[i], K[i])`` to ``(O[j], K[j])``. Its direction is measured in
    degrees from the positive x axis, rounded to 0.1 degree, and the
    entropy (natural log, not normalised) of the resulting angle
    frequencies is returned.

    Parameters
    ----------
    G : object whose ``edges()`` yields ``(i, j)`` node pairs.
    O, K : mappings from node to its x and y coordinate.
    T : unused; kept so existing callers keep working.

    Returns
    -------
    float
        Entropy of the angle distribution; ``0.0`` when no edge has a
        defined direction.
    """
    angles = []
    for i, j in G.edges():
        dx = O[j] - O[i]
        dy = K[j] - K[i]
        dis = LA.norm([dx, dy])
        if dis == 0:
            # A zero-length jump has no direction; dividing by it would
            # yield NaN angles, each counted as its own category.
            continue
        # Clip guards arccos against ratios a rounding error outside [-1, 1].
        angle = np.round(180*np.arccos(np.clip(dx/dis, -1.0, 1.0))/np.pi, 1)
        if dy < 0:
            # arccos only covers 0..180; mirror into the lower half-plane.
            angle = 360 - angle
        angles.append(angle)
    if not angles:
        return 0.0
    # list(...) is required on Python 3, where dict.values() is a view that
    # numpy would wrap as a single object instead of a numeric array.
    ps = np.array(list(Counter(angles).values()), dtype=float)
    ps = ps / ps.sum()
    return -(ps*np.log(ps)).sum()
def getSiteFlowdata(site):
    """Build the answer-flow network of a site from its mid-life activity.

    Starting at the middle day of the site's recorded answer history, the
    first ``maxuser`` (100) per-day user records are taken. Each record's
    sequence of answered question ids is wrapped as
    ``'source' -> q1 -> ... -> 'sink'`` and every consecutive pair adds one
    unit of weight to the corresponding directed edge.

    Parameters
    ----------
    site : str
        Site directory name, passed to ``userDailyAnswers``.

    Returns
    -------
    tuple
        ``(G, O, K, T)``: the weighted ``nx.DiGraph``, the results of
        ``flowDistanceFromSource(G)`` and ``flowDistanceToSink(G)`` (both
        defined outside this excerpt), and the weighted out-degree of ``G``.
    """
    C = userDailyAnswers(site)
    days = sorted(C.keys())
    E = defaultdict(int)
    maxuser = 100
    n = 0
    # Floor division keeps the slice index an int on Python 2 and 3.
    for day in days[len(days)//2:]:
        if n >= maxuser:
            break  # quota reached; later days cannot contribute
        # Order the day's users by their answer lists (earliest time first).
        for user, answers in sorted(C[day].items(), key=lambda x: x[1]):
            if n >= maxuser:
                break
            n += 1
            q = ['source'] + [p for o, p in answers] + ['sink']
            for a, b in zip(q[:-1], q[1:]):
                E[(a, b)] += 1
    G = nx.DiGraph()
    for (x, y), w in E.items():
        G.add_edge(x, y, weight=w)
    O = flowDistanceFromSource(G)
    K = flowDistanceToSink(G)
    T = G.out_degree(weight='weight')
    return G, O, K, T
# orthogonal okplot
def okplot(G,O,K,T):
    """Draw every edge of ``G`` as an arrow in the (O, K) plane.

    Each node ``n`` sits at ``(O[n], K[n])``; an edge ``i -> j`` becomes a
    translucent gray arrow from ``i`` to ``j``. A red diagonal from (0, 0)
    to (4, 4) is drawn as a reference line. Draws on the current pyplot
    axes.

    Parameters
    ----------
    G : graph whose ``edges()`` yields ``(i, j)`` node pairs.
    O, K : mappings from node to its x and y coordinate.
    T : unused; kept for a signature parallel to the other plot helpers.
    """
    plt.plot([0,4],[0,4],'r-',alpha=0.5)
    for i,j in G.edges():
        x1,y1=O[i],K[i]
        x2,y2=O[j],K[j]
        plt.arrow(x1, y1, x2-x1, y2-y1, head_width=0.1, head_length=0.2,
                  fc='gray', ec='gray',alpha=0.2)
    # The labels were unquoted (a syntax error); restored as mathtext strings.
    plt.xlabel(r'$L_{oi}$',size=16)
    plt.ylabel(r'$L_{ik}$',size=16)
# rescaled orthogonal okplot
def rescaledokplot(G,O,K,T):
    """Draw all edge displacements of ``G`` from a common origin.

    Every edge ``i -> j`` is drawn as a faint gray arrow from (0, 0) to
    ``(O[j]-O[i], K[j]-K[i])``. A red arrow shows the summed displacement
    divided by the total arrow length. Axis limits are fixed to [-2, 2].
    Draws on the current pyplot axes.

    Parameters
    ----------
    G : graph whose ``edges()`` yields ``(i, j)`` node pairs.
    O, K : mappings from node to its x and y coordinate.
    T : unused; kept for a signature parallel to the other plot helpers.
    """
    Dx = 0
    Dy = 0
    tr = 0
    for i,j in G.edges():
        dx = O[j]-O[i]
        dy = K[j]-K[i]
        Dx += dx
        Dy += dy
        tr += np.sqrt(dx**2+dy**2)
        plt.arrow(0, 0, dx, dy, head_width=0.05, head_length=0.1,
                  fc='gray', ec='gray',alpha=0.1)
    # With no edges (or only zero-length ones) the mean direction is
    # undefined; skip the red arrow rather than divide by zero.
    if tr > 0:
        plt.arrow(0, 0, Dx/float(tr), Dy/float(tr), head_width=0.1,
                  head_length=0.2, fc='red', ec='red',alpha=0.7)
    lim=2
    plt.xlim(-lim,lim)
    plt.ylim(-lim,lim)
接著就可以比較物理和烹調這兩個規模相近的社區,取其總天數一半時的一百個用戶產生的游走軌跡的角度熵
# okplot demo: flow networks of two sites drawn side by side
i, j = 'physics.stackexchange.com', 'cooking.stackexchange.com'
G1, O1, K1, T1 = getSiteFlowdata(i)
G2, O2, K2, T2 = getSiteFlowdata(j)
fig = plt.figure(facecolor='white', figsize=(12, 6))
# Left panel (121) shows the first site, right panel (122) the second.
for panel, flow in ((121, (G1, O1, K1, T1)), (122, (G2, O2, K2, T2))):
    ax = plt.subplot(panel)
    okplot(*flow)
plt.tight_layout()
plt.show()
得到下圖
可以通過下列代碼
# Angle entropies of the two flow networks (G1..T1 and G2..T2). This is a bare
# expression: the resulting tuple is only displayed by an interactive
# session or notebook, not when the file runs as a script.
entropy(G1,O1,K1,T1),entropy(G2,O2,K2,T2)
來計算得到兩個社區的熵分別為3.47和2.67。物理社區的熵更大,實際發展也更好,驗證了我們的假設。
接下來,我們考察所有社區在發展一半時的一百個用戶記錄,以此預測其最終發展規模
# construct network and calculate path entropy
# D maps site -> angle entropy of its mid-life answer-flow network.
D={}
# Sites left out of the analysis (the reason is not stated in this file).
excluded = ('ebooks.stackexchange.com', 'stackoverflow.com')
for idx, site in enumerate(sites):
    if site in excluded:
        continue
    # flushPrint is defined outside this excerpt; presumably a progress printer.
    flushPrint(idx)
    # getSiteFlowdata performs exactly the network construction that used
    # to be repeated inline here (same midpoint, same 100-record cap).
    G,O,K,T = getSiteFlowdata(site)
    D[site]=entropy(G,O,K,T)
# Entropy vs. final size. Each row is (entropy, S[i][0], S[i][1]); S stores
# (questions, answers), so the columns must unpack as l, q, a. The earlier
# order `l,a,q` swapped the two series, mislabelling the legend and fitting
# the regression to answers while calling them questions.
l,q,a=np.array([(D[i],S[i][0],S[i][1]) for i in D if i in S and i!='aviation.stackexchange.com']).T
# OLSRegressFit is defined outside this excerpt; its three results are used
# below as intercept, slope and (presumably) R^2 of log(q) against l.
cs,beta,r2=OLSRegressFit(l,np.log(q))
fig = plt.figure(figsize=(8, 8))
plt.plot(l,q,linestyle='',marker='s',color='RoyalBlue',label='N of Questions')
plt.plot(l,a,linestyle='',marker='^',color='Chocolate',label='N of Answers')
# Fitted line mapped back from log space onto the log-scaled y axis.
plt.plot(l,np.exp(cs+beta*l),linestyle='-',marker='',color='Brown')
plt.yscale('log')
plt.legend(loc=1,numpoints=1)
plt.xlabel('Entropy of angles', size=16)
plt.ylabel('N of Questions & Answers', size=16)
plt.show()
得到下圖
考察其皮爾遜相關系數
# Pearson correlation between the angle entropies `l` and log(`q`). This is a
# bare expression: the result is only displayed by an interactive session
# or notebook, not when the file runs as a script.
pearsonr(l,np.log(q))
得到0.42,p-value小于0.001。