本文采編自寒小陽老師上課講義
案例:自行車租賃數據分析與可視化 {#案例:自行車租賃數據分析與可視化}
導入數據,做簡單的數據處理 {#步驟1:導入數據,做簡單的數據處理}
import os  # portable path joining
import pandas as pd  # load the CSV into a DataFrame
import urllib  # kept from the original import list
import tempfile  # throw-away working directory
import shutil  # remove the working directory afterwards
import zipfile  # unpack the downloaded archive

# urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve

# Location of the bike-sharing archive on the web.
data_source = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip'
# Columns the analysis below does not look at.
drop_list = ['instant', 'season', 'yr', 'mnth', 'holiday', 'workingday', 'weathersit', 'atemp', 'hum']

temp_dir = tempfile.mkdtemp()  # scratch directory for the download
try:
    zipname = os.path.join(temp_dir, 'Bike-Sharing-Dataset.zip')
    urlretrieve(data_source, zipname)  # download the archive
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(zipname, 'r') as zip_ref:
        zip_ref.extractall(temp_dir)
    # Read the file that was just extracted, so the download is actually used.
    # NOTE(review): assumes the archive stores day.csv at its top level - confirm.
    daily_path = os.path.join(temp_dir, 'day.csv')
    daily_data = pd.read_csv(daily_path)
finally:
    # Always remove the scratch directory, including when a step above raised.
    shutil.rmtree(temp_dir)

# Turn the date strings into datetime values.
daily_data['dteday'] = pd.to_datetime(daily_data['dteday'])
# inplace=True modifies daily_data itself rather than returning a copy.
daily_data.drop(drop_list, inplace=True, axis=1)
daily_data.head()  # preview the data
配置全局參數 {#步驟2:配置參數}
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
# 在notebook中顯示繪圖結(jié)果
%matplotlib inline
# 設(shè)置一些全局的資源參數(shù)衅金,可以進(jìn)行個(gè)性化修改
import matplotlib
# 設(shè)置圖片尺寸 14" x 7"
# rc: resource configuration
matplotlib.rc('figure', figsize = (14, 7))
# 設(shè)置字體 14
matplotlib.rc('font', size = 14)
# 不顯示頂部和右側(cè)的坐標(biāo)線(xiàn)
matplotlib.rc('axes.spines', top = False, right = False)
# 不顯示網(wǎng)格
matplotlib.rc('axes', grid = False)
# 設(shè)置背景顏色是白色
matplotlib.rc('axes', facecolor = 'white')
關聯分析 {#步驟3:關聯分析}
# Reusable wrapper around a scatter plot.
def scatterplot(x_data, y_data, x_label, y_label, title):
    """Draw a scatter plot of ``y_data`` against ``x_data`` on a new figure.

    Points use a fixed size, colour and transparency; ``title``, ``x_label``
    and ``y_label`` annotate the axes.  Nothing is returned.
    """
    # Point size, colour and transparency.
    point_style = {'s': 10, 'color': '#539caf', 'alpha': 0.75}
    fig, ax = plt.subplots()
    ax.scatter(x_data, y_data, **point_style)
    # Axis captions and title.
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
# Scatter plot of check-outs against temperature.
scatterplot(
    x_data=daily_data['temp'],
    y_data=daily_data['cnt'],
    x_label='Normalized temperature (C)',
    y_label='Check outs',
    title='Number of Check Outs vs Temperature',
)
# Linear regression of check-outs on temperature.
import statsmodels.api as sm  # least squares
from statsmodels.stats.outliers_influence import summary_table  # per-observation summary

y = daily_data['cnt']
# add_constant prepends a column of ones to the series, which serves as the
# constant (intercept) term of the regression.
x = sm.add_constant(daily_data['temp'])
regr = sm.OLS(y, x)  # ordinary least squares model
res = regr.fit()

# Pull the fitted data out of the model (alpha = 0.05): `st` is the summary,
# `data` the detailed values, `ss2` the column names.
st, data, ss2 = summary_table(res, alpha=0.05)
FITTED_COLUMN = 2  # the third column of `data` holds the fitted values
fitted_values = data[:, FITTED_COLUMN]
# Reusable wrapper around a line plot.
def lineplot(x_data, y_data, x_label, y_label, title):
    """Plot ``y_data`` against ``x_data`` as one opaque line on a new figure.

    ``title``, ``x_label`` and ``y_label`` annotate the axes.  Nothing is
    returned.
    """
    ax = plt.subplots()[1]
    # lw is the line width, alpha the transparency.
    line_style = {'lw': 2, 'color': '#539caf', 'alpha': 1}
    ax.plot(x_data, y_data, **line_style)
    # Axis captions and title.
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
# Plot the fitted values against temperature.
lineplot(
    x_data=daily_data['temp'],
    y_data=fitted_values,
    x_label='Normalized temperature (C)',
    y_label='Check outs',
    title='Line of Best Fit for Number of Check Outs vs Temperature',
)
帶置信區間的曲線圖 {#帶置信區間的曲線圖}
- 評估曲線擬合結果
# Lower and upper bounds of the confidence interval: columns 4 and 5 of the
# summary data (drawn further down as the 95% CI band).
ci_bounds = data[:, 4:6]
predict_mean_ci_low = ci_bounds[:, 0]
predict_mean_ci_upp = ci_bounds[:, 1]
# Gather x values with their bounds in one DataFrame ...
CI_df = pd.DataFrame(
    {
        'x_data': daily_data['temp'],
        'low_CI': predict_mean_ci_low,
        'upper_CI': predict_mean_ci_upp,
    },
    columns=['x_data', 'low_CI', 'upper_CI'],
)
# ... and order the rows by x_data.
CI_df = CI_df.sort_values('x_data')
# Line plot with a shaded confidence band.
def lineplotCI(x_data, y_data, sorted_x, low_CI, upper_CI, x_label, y_label, title):
    """Plot a fitted line together with a shaded confidence band.

    ``x_data``/``y_data`` give the line (legend entry 'Fit').  The band
    (legend entry '95% CI') is filled between ``low_CI`` and ``upper_CI``
    over ``sorted_x``; the three band arguments are presumably ordered
    alike, which the caller has to ensure.  Nothing is returned.
    """
    shade = '#539caf'
    ax = plt.subplots()[1]
    # Prediction line.
    ax.plot(x_data, y_data, label='Fit', color=shade, lw=1, alpha=1)
    # Confidence band, filled in the order given by sorted_x.
    ax.fill_between(sorted_x, low_CI, upper_CI, label='95% CI', color=shade, alpha=0.4)
    # Axis captions and title.
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
    # The legend uses the label arguments above; loc='best' picks the position automatically.
    ax.legend(loc='best')
# Draw the fitted line with its confidence band.
lineplotCI(
    x_data=daily_data['temp'],
    y_data=fitted_values,
    sorted_x=CI_df['x_data'],
    low_CI=CI_df['low_CI'],
    upper_CI=CI_df['upper_CI'],
    x_label='Normalized temperature (C)',
    y_label='Check outs',
    title='Line of Best Fit for Number of Check Outs vs Temperature',
)
雙坐標曲線圖 {#雙坐標曲線圖}
- 曲線擬合不滿足置信閾值時，考慮增加獨立變量
- 分析不同尺度多變量的關系
# Line plot with two vertical axes.
def lineplot2y(x_data, x_label, y1_data, y1_color, y1_label, y2_data, y2_color, y2_label, title):
    """Plot two series against the same x values, each with its own y axis.

    ``y1_data`` is drawn on the left axis and ``y2_data`` on a twin axis on
    the right; each axis label takes the colour of its line.  Nothing is
    returned.
    """
    left_ax = plt.subplots()[1]
    left_ax.plot(x_data, y1_data, color=y1_color)
    # Captions and title for the left-hand axes.
    left_ax.set_title(title)
    left_ax.set_xlabel(x_label)
    left_ax.set_ylabel(y1_label, color=y1_color)
    # The twin axes share the horizontal axis with the first one.
    right_ax = left_ax.twinx()
    right_ax.plot(x_data, y2_data, color=y2_color)
    right_ax.set_ylabel(y2_label, color=y2_color)
    # Make the right-hand axis line visible.
    right_ax.spines['right'].set_visible(True)
# Check-outs and wind speed over time on two y axes.
lineplot2y(
    x_data=daily_data['dteday'],
    x_label='Day',
    y1_data=daily_data['cnt'],
    y1_color='#539caf',
    y1_label='Check outs',
    y2_data=daily_data['windspeed'],
    y2_color='#7663b0',
    y2_label='Normalized windspeed',
    title='Check Outs and Windspeed Over Time',
)
分布分析 {#步驟4:分布分析}
直方圖(灰度圖) {#灰度圖}
- 粗略區間計數
# Histogram wrapper.
def histogram(data, x_label, y_label, title):
    """Draw a 10-bin histogram of ``data`` on a new figure.

    Returns whatever ``ax.hist`` returns; the caller indexes it as
    ``[0]`` for the bin values and ``[1]`` for the bin boundaries.
    """
    ax = plt.subplots()[1]
    hist_result = ax.hist(data, bins=10, color='#539caf')  # bins sets the number of bins
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    return hist_result
# Histogram of registered check-outs; keep the result for inspection.
res = histogram(
    data=daily_data['registered'],
    x_label='Check outs',
    y_label='Frequency',
    title='Distribution of Registered Check Outs',
)
res[0]  # value of bins
res[1]  # boundary of bins
堆疊直方圖
- 比較兩個分布
# Overlaid histograms of two samples.
def overlaid_histogram(data1, data1_name, data1_color, data2, data2_name, data2_color, x_label, y_label, title, max_nbins=10):
    """Overlay the histograms of two samples on one set of axes.

    Both histograms use the same equal-width bin edges, spanning the
    combined range of the two samples, so their bars line up.

    Args:
        data1, data2: the two samples (non-empty iterables of numbers).
        data1_name, data2_name: legend labels.
        data1_color, data2_color: bar colours; ``data2`` is drawn on top,
            slightly transparent.
        x_label, y_label, title: axis captions and plot title.
        max_nbins: number of bins shared by both histograms (default 10).

    Nothing is returned.
    """
    lowest = min(min(data1), min(data2))
    highest = max(max(data1), max(data2))
    if lowest == highest:
        # All values identical: widen the range so the bins have a width.
        lowest, highest = lowest - 0.5, highest + 0.5
    # linspace always yields max_nbins + 1 edges ending exactly at `highest`.
    # np.arange with a fractional step can produce one edge too many because
    # of rounding, and its step would be 0 for a zero-width range.
    bins = np.linspace(lowest, highest, max_nbins + 1)
    # Create the plot.
    _, ax = plt.subplots()
    ax.hist(data1, bins=bins, color=data1_color, alpha=1, label=data1_name)
    ax.hist(data2, bins=bins, color=data2_color, alpha=0.75, label=data2_name)
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
    ax.legend(loc='best')
# Compare the distributions of registered and casual check-outs.
overlaid_histogram(
    data1=daily_data['registered'],
    data1_name='Registered',
    data1_color='#539caf',
    data2=daily_data['casual'],
    data2_name='Casual',
    data2_color='#7663b0',
    x_label='Check outs',
    y_label='Frequency',
    title='Distribution of Check Outs By Type',
)
registered:注冊的分布，正態分布，why
casual:偶然的分布，疑似指數分布，why
以上兩個問題均可以查詢這兩個概念得出結論
密度圖 {#密度圖}
- 精細刻畫概率分布
KDE: kernel density estimate
$$\hat{f}_h(x) = \frac{1}{n}\sum\limits_{i=1}^n K_h(x-x_i) = \frac{1}{nh}\sum\limits_{i=1}^n K\left(\frac{x-x_i}{h}\right)$$
# Estimate the probability density of registered check-outs.
from scipy.stats import gaussian_kde
data = daily_data['registered']
# Kernel density estimate: https://en.wikipedia.org/wiki/Kernel_density_estimation
# bw_method controls the smoothness (the larger the value, the smoother the
# curve).  Passing the scalar here replaces overriding covariance_factor and
# calling the private _compute_covariance() by hand.
density_est = gaussian_kde(data, bw_method=0.3)
# x values, 200 apart, at which the density is evaluated for plotting.
x_data = np.arange(min(data), max(data), 200)
# Density-curve wrapper.
def densityplot(x_data, density_est, x_label, y_label, title):
    """Plot the curve ``density_est(x_data)`` against ``x_data`` on a new figure.

    ``density_est`` is called once with ``x_data``; ``title``, ``x_label``
    and ``y_label`` annotate the axes.  Nothing is returned.
    """
    ax = plt.subplots()[1]
    density_values = density_est(x_data)
    ax.plot(x_data, density_values, lw=2, color='#539caf')
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
# Draw the estimated density of registered check-outs.
densityplot(
    x_data=x_data,
    density_est=density_est,
    x_label='Check outs',
    y_label='Frequency',
    title='Distribution of Registered Check Outs',
)
組間分析 {#步驟5:組間分析}
- 組間定量比較
- 分組粒度
- 組間聚類
柱狀圖 {#柱狀圖}
- 一級類間均值方差比較
# Mean and standard deviation of total check-outs for each day of the week.
# The aggregations are given by name: passing np.mean / np.std to agg() is
# deprecated in recent pandas, and the names keep the resulting 'mean' and
# 'std' columns unchanged.
mean_total_co_day = daily_data[['weekday', 'cnt']].groupby('weekday').agg(['mean', 'std'])
# Drop the outer column level so the columns are addressed as 'mean' and 'std'.
mean_total_co_day.columns = mean_total_co_day.columns.droplevel()
# Bar-chart wrapper.
def barplot(x_data, y_data, error_data, x_label, y_label, title):
    """Draw a bar chart with vertical error bars on a new figure.

    Bars of height ``y_data`` are centred on ``x_data``; ``error_data``
    gives the error-bar size for each bar.  Nothing is returned.
    """
    ax = plt.subplots()[1]
    # The bars themselves.
    ax.bar(x_data, y_data, align='center', color='#539caf')
    # Error bars; ls='none' suppresses the line that would join neighbouring bars.
    ax.errorbar(x_data, y_data, yerr=error_data, ls='none', lw=5, color='#297083')
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
# Mean total check-outs per weekday, with the standard deviation as error bars.
barplot(
    x_data=mean_total_co_day.index.values,
    y_data=mean_total_co_day['mean'],
    error_data=mean_total_co_day['std'],
    x_label='Day of week',
    y_label='Check outs',
    title='Total Check Outs By Day of Week (0 = Sunday)',
)
mean_total_co_day.columns
# The same aggregation with its column hierarchy left intact.  Aggregations
# are named by string because passing np.mean / np.std to agg() is deprecated
# in recent pandas.
daily_data[['weekday', 'cnt']].groupby('weekday').agg(['mean', 'std'])
cnt
mean std
weekday
0 4228.828571 1872.496462
1 4338.123810 1793.073897
2 4510.663462 1826.911602
3 4548.538462 2038.095680
4 4667.259615 1939.433165
5 4690.288462 1874.624762
6 4550.542857 2196.692969
# Average registered and casual check-outs for each day of the week.
weekday_usage = daily_data[['weekday', 'registered', 'casual']]
mean_by_reg_co_day = weekday_usage.groupby('weekday').mean()
# Share of each rider type in the day's total.
mean_by_reg_co_day['total'] = mean_by_reg_co_day['registered'] + mean_by_reg_co_day['casual']
for rider_type, share_column in (('registered', 'reg_prop'), ('casual', 'casual_prop')):
    mean_by_reg_co_day[share_column] = mean_by_reg_co_day[rider_type] / mean_by_reg_co_day['total']
# Stacked bar chart.
def stackedbarplot(x_data, y_data_list, y_data_names, colors, x_label, y_label, title):
    """Draw the series in ``y_data_list`` as bars stacked on top of each other.

    Args:
        x_data: bar positions, one per category.
        y_data_list: sequence of series, each with one height per position.
            If the series are proportions that sum to 1 per position, the
            finished stack reaches 1.
        y_data_names: legend label for each series.
        colors: bar colour for each series.
        x_label, y_label, title: axis captions and plot title.

    Nothing is returned.
    """
    _, ax = plt.subplots()
    # Running top of the stack at each position.  Every series starts where
    # ALL earlier series end; using only the previous series as the bottom is
    # wrong as soon as there are three or more series.
    stack_top = np.zeros(len(x_data))
    for i, heights in enumerate(y_data_list):
        heights = np.asarray(heights, dtype=float)
        ax.bar(x_data, heights, color=colors[i], bottom=stack_top, align='center', label=y_data_names[i])
        stack_top = stack_top + heights
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
    ax.legend(loc='upper right')  # legend position
# Proportion of registered and casual check-outs per weekday.
stackedbarplot(
    x_data=mean_by_reg_co_day.index.values,
    y_data_list=[mean_by_reg_co_day['reg_prop'], mean_by_reg_co_day['casual_prop']],
    y_data_names=['Registered', 'Casual'],
    colors=['#539caf', '#7663b0'],
    x_label='Day of week',
    y_label='Proportion of check outs',
    title='Check Outs By Registration Status and Day of Week (0 = Sunday)',
)
從這幅圖你看出了什么？工作日 VS 節假日
為什么會有這樣的差別？
分組柱狀圖 {#分組柱狀圖}
- 多級類間絕對數值比較
# Grouped bar chart.
def groupedbarplot(x_data, y_data_list, y_data_names, colors, x_label, y_label, title):
    """Draw the series in ``y_data_list`` as side-by-side bars within each group.

    Each position in ``x_data`` gets one group of total width 0.8, split
    evenly between the series.  ``x_data`` must support adding a number
    (the bar offset) to it.  Nothing is returned.
    """
    ax = plt.subplots()[1]
    total_width = 0.8                                  # width of one whole group
    ind_width = total_width / len(y_data_list)         # width of a single bar
    half_group, half_bar = total_width / 2, ind_width / 2
    # Offset of each bar's centre from the group position.
    alteration = np.arange(-half_group + half_bar, half_group + half_bar, ind_width)
    # Draw each series shifted sideways by its own offset.
    for idx, heights in enumerate(y_data_list):
        ax.bar(
            x_data + alteration[idx],
            heights,
            width=ind_width,
            color=colors[idx],
            label=y_data_names[idx],
        )
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.legend(loc='upper right')
# Registered and casual check-outs per weekday, side by side.
groupedbarplot(
    x_data=mean_by_reg_co_day.index.values,
    y_data_list=[mean_by_reg_co_day['registered'], mean_by_reg_co_day['casual']],
    y_data_names=['Registered', 'Casual'],
    colors=['#539caf', '#7663b0'],
    x_label='Day of week',
    y_label='Check outs',
    title='Check Outs By Registration Status and Day of Week (0 = Sunday)',
)
偏移前:ind_width/2
偏移后:total_width/2
偏移量:total_width/2-ind_width/2
箱式圖 {#箱式圖}
- 多級類間數據分布比較
- 柱狀圖 + 堆疊灰度圖
# Group the total check-outs by weekday: one array per day, which is all the
# box plot needs to draw one box per group.
days = np.unique(daily_data['weekday'])
bp_data = [daily_data[daily_data['weekday'] == day]['cnt'].values for day in days]
# Box-plot wrapper.
def boxplot(x_data, y_data, base_color, median_color, x_label, y_label, title):
    """Draw one box per entry of ``y_data`` on a new figure.

    ``x_data`` supplies the tick label for each box.  Nothing is returned.

    NOTE(review): the median line is drawn in ``base_color`` while the box
    fill and whiskers use ``median_color``; the parameter names suggest the
    opposite - confirm this is intended.
    """
    ax = plt.subplots()[1]
    box_style = dict(
        patch_artist=True,                                            # fill the boxes with colour
        medianprops={'color': base_color},                            # median line
        boxprops={'color': base_color, 'facecolor': median_color},    # box edge / box fill
        whiskerprops={'color': median_color},                         # whiskers
        capprops={'color': base_color},                               # whisker caps
    )
    ax.boxplot(y_data, **box_style)
    # Label the boxes with the values of x_data.
    ax.set_xticklabels(x_data)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
# Distribution of total check-outs for each weekday.
boxplot(
    x_data=days,
    y_data=bp_data,
    base_color='b',
    median_color='r',
    x_label='Day of week',
    y_label='Check outs',
    title='Total Check Outs By Day of Week (0 = Sunday)',
)
簡單總結 {#簡單總結}
- 關聯分析、數值比較:散點圖、曲線圖
- 分布分析:灰度圖、密度圖
- 涉及分類的分析:柱狀圖、箱式圖