人生苦短,我用Python
話不多說,先看看數據。
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
# Switch to the directory holding the CSV files so the relative paths below resolve.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
os.chdir('C:/Users/花倫同學(xué)/Desktop/test/03/')
data = pd.read_csv('知乎數(shù)據(jù)_201701utf8_2.csv',engine='python')  # load the raw dataset
data.head(10)  # preview the first 10 rows (the value is only shown in an interactive session)
預覽原數據
可以看到數據列表中,在“回答”、“提問”、“職業”等列中有NaN的空值,按照列的數據類型進行數據清洗,將空值替換為‘0’或者‘缺失數據’。
創建數據清洗函數。
def data_cleaning(df):
    """Replace missing values in every column of ``df`` and return ``df``.

    Text columns (object or string dtype) get the placeholder
    '缺失數(shù)據(jù)'; every other column gets 0.

    The frame is modified in place and the same object is returned, so
    ``cleaned = data_cleaning(raw)`` leaves ``cleaned is raw``.

    Parameters:
        df: pandas DataFrame to clean.

    Returns:
        The same DataFrame, with no missing values left in it.
    """
    for col in df.columns:
        column = df[col]
        is_text = (pd.api.types.is_object_dtype(column.dtype)
                   or pd.api.types.is_string_dtype(column.dtype))
        filler = '缺失數(shù)據(jù)' if is_text else 0
        # Assign the filled column back instead of calling
        # df[col].fillna(..., inplace=True): that chained in-place call acts
        # on a temporary Series and leaves df untouched under Copy-on-Write.
        df[col] = column.fillna(filler)
    return df
這個函數我們後面的文字中也會用到,適用面很廣。
# Fill the missing values of the raw table. data_cleaning returns the frame it
# was given, so data0 refers to the same object as data.
data0 = data_cleaning(data)
data0.head(10)  # preview the first 10 cleaned rows
清洗後數據
數據清洗完成後,就可以進行數據分析了,這次主要從兩個維度進行分析:
1.知友的全國分布情況。(主要是知友數量和知友的密度)
2.各個高校知友的關注和被關注情況。
第一個維度分析。
# Count rows per place of residence; the per-city count of "_id" is kept
# as the number of users in that city.
data_city_num = data0.groupby('居住地').count()
data_city_num['知友數(shù)量'] = data_city_num['_id']

# Population table. The city key is the region name minus its last character
# (the original note says this strips the trailing "市").
data_pop = pd.read_csv('六普常住人口數(shù).csv', engine='python')
data_pop['城市'] = data_pop['地區(qū)'].str[:-1]

# Inner-join the per-city counts (indexed by city) with the population table,
# keeping only the columns needed for the density computation.
data_md = data_city_num.merge(
    data_pop,
    how='inner',
    left_index=True,
    right_on='城市',
)[['知友數(shù)量', '城市', '常住人口']]

# Density = users per resident.
data_md['知友密度'] = data_md['知友數(shù)量'] / data_md['常住人口']
為了方便比對數量和密度之間的相對情況,這裡創建一個標準化的函數對數據進行標準化。
def standard(df, *cols):
    """Min-max scale the given columns of ``df`` onto a 0-100 range.

    For each name in ``cols`` a new column ``<name>_std`` is written into
    ``df`` (the frame is modified in place) holding
    ``(value - min) / (max - min) * 100``.

    Parameters:
        df: pandas DataFrame containing the columns to scale.
        *cols: names of the columns to scale.

    Returns:
        A tuple ``(df, colnames)``: the same DataFrame and the list of the
        newly written ``*_std`` column names, in the order of ``cols``.
    """
    colnames = []
    for col in cols:
        low = df[col].min()
        span = df[col].max() - low
        colname = col + '_std'
        if span == 0:
            # Constant column: the usual formula would be 0/0 and turn every
            # row into NaN. Map it to 0 instead; multiplying by 0.0 keeps
            # rows that were already missing as NaN.
            df[colname] = (df[col] - low) * 0.0
        else:
            df[colname] = (df[col] - low) / span * 100
        colnames.append(colname)
    return df, colnames
# Standardize density and count once and unpack both results. standard()
# returns (dataframe, new column names), so a single call provides both
# instead of recomputing the columns a second time.
data_md_std, data_colnames = standard(data_md, '知友密度', '知友數(shù)量')
data_md_std.head()

# Top 20 cities by standardized density and by standardized user count.
data_top20_md = data_md_std.sort_values('知友密度_std', ascending=False)[['城市', '知友密度_std']].iloc[:20]
data_top20_sl = data_md_std.sort_values('知友數(shù)量_std', ascending=False)[['城市', '知友數(shù)量_std']].iloc[:20]
然後製作圖表,更直觀地查看數據。
def _plot_top_bars(fig_num, values, labels, title, color, show_grid=False):
    """Draw one ranked bar chart and return its figure.

    Parameters:
        fig_num: matplotlib figure number to create or activate.
        values: bar heights (a pandas Series of standardized scores).
        labels: tick labels, one per bar.
        title: chart title.
        color: bar face colour.
        show_grid: when True, draw dashed horizontal grid lines.

    Returns:
        The matplotlib Figure the bars were drawn on.
    """
    fig = plt.figure(num=fig_num, figsize=(12, 4))
    # Size the x positions from the data rather than assuming 20 bars, so
    # the chart still draws when fewer rows are available.
    positions = range(len(values))
    plt.bar(positions,
            values,
            width=1,
            facecolor=color,
            edgecolor='k',
            tick_label=labels)
    plt.title(title)
    if show_grid:
        plt.grid(True, linestyle='--', color='grey', axis='y')
    # Print each bar's value near the bottom of the bar.
    for pos, value in zip(positions, values):
        plt.text(pos - 0.5, 2, '%.1f' % value, color='k', fontsize=12)
    return fig


# Figure 1: top cities by standardized user count.
y1 = data_top20_sl['知友數(shù)量_std']
fig1 = _plot_top_bars(1, y1, data_top20_sl['城市'], '知友數(shù)量TOP20\n',
                      'yellowgreen', show_grid=True)

# Figure 2: top cities by standardized user density.
y2 = data_top20_md['知友密度_std']
fig2 = _plot_top_bars(2, y2, data_top20_md['城市'], '知友密度TOP20\n',
                      'lightblue')
知友數量TOP20
知友密度TOP20
第二個維度分析
根據關注者和被關注的情況進行分組
# Total "following" and "followers" per education-history entry. The two
# columns are selected before summing so unrelated columns (including text
# columns) are not aggregated.
data_gz = data0.groupby('教育經(jīng)歷')[['關(guān)注', '關(guān)注者']].sum()

# Education-history values that are not real schools (placeholder for missing
# data and joke/free-text entries found when inspecting the data).
_NOT_SCHOOLS = ['缺失數(shù)據(jù)', '醫(yī)學(xué)', '我的老師,是山川和大地', '重慶第一工程尸培養(yǎng)基地',
                '五道口男子職業(yè)技術(shù)學(xué)院', '為往圣繼絕學(xué)', '本科']

# Rank by follower count, remove the non-school entries and keep the first 20.
# errors='ignore' keeps this working when one of the labels above does not
# occur in the data.
data_top20_fans = (
    data_gz.sort_values('關(guān)注者', ascending=False)
    .drop(_NOT_SCHOOLS, errors='ignore')[:20]
)
data_top20_fans
關注人數高校top20
然後做數據的可視化,結果就一目了然了。
plt.figure(figsize=(16, 10))

# x axis: following totals; y axis: follower totals.
x = data_top20_fans['關(guān)注']
y = data_top20_fans['關(guān)注者']
x_mean = x.mean()
y_mean = y.mean()

# One dot per school: marker area scales with followers, colour with following.
plt.scatter(x, y,
            s=y / 1000,
            c=x,
            cmap='Blues',
            marker='.',
            label='學(xué)校')

# Dashed reference lines at the mean of each axis.
plt.axvline(x_mean, color='r', linestyle="--", alpha=0.8,
            label="平均關(guān)注人數(shù):%i人" % x_mean)
plt.axhline(y_mean, color='g', linestyle="--", alpha=0.8,
            label="平均粉絲數(shù):%i人" % y_mean)

plt.legend(loc='upper left')
plt.grid()

# Label every point with its school name, nudged away from the marker.
for school, following, followers in zip(data_top20_fans.index, x, y):
    plt.text(following + 500, followers + 500, school)
關注人數高校top20
我們可以清楚地看到浙江大學在關注人數和被關注人數上都是遙遙領先,其次是北京大學。北京電影學院的粉絲數量較多但關注的人數較少...
這裡還介紹一種數據可視化的方法bokeh,大家可以參考這個代碼試試。
# Interactive version of the top-20 scatter plot, drawn with bokeh.
from bokeh.plotting import figure, show, output_file
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool
# Work on a copy so the renames below do not touch data_top20_fans.
df2 = data_top20_fans.copy()
# Short ASCII names: gz = following, gzz = followers (relies on the frame
# holding exactly these two columns, in this order).
df2.columns = ['gz','gzz']
df2.index.name = 'xx'  # the index (school name) is referenced as @xx in the tooltips
df2['size'] = df2['gzz'] / 20000  # marker size derived from the follower total
source = ColumnDataSource(df2)
# Tooltip rows shown on hover: school, following count, follower count.
hover = HoverTool(tooltips = [
('學(xué)校','@xx'),
('關(guān)注人數(shù)','@gz'),
('被關(guān)注人數(shù)','@gzz')
])
# NOTE(review): plot_width / plot_height appear to have been replaced by
# width / height in Bokeh 3.x - confirm against the installed version.
p = figure(plot_width = 800,plot_height = 600,title = 'top20',tools=[hover,'box_select,reset,wheel_zoom,pan,crosshair'])
# NOTE(review): newer Bokeh releases reportedly prefer scatter() over circle() with a size - confirm.
p.circle(x = 'gz',y = 'gzz',source = source,size = 'size',fill_color = 'blue',fill_alpha = 0.6)
show(p)  # output_file is imported but never called in the code shown here