因為工作的原因,編寫了許多用于差異檢驗的統計分析工具,使分析流程更加高效。鑒于差異檢驗的原理較為基礎,因此本文不對其進行介紹,只展示編寫好的相應分析代碼。
1. 多變量均值與理論均值差異的單樣本t檢驗
def multiVar_onesamp_ttest(df, varlist):
    """Run a one-sample t-test for each listed column against its range midpoint.

    For every column in ``varlist`` the reference ("theoretical") mean is
    ``(max + min) / 2`` of the observed values.  The function prints the
    column name, the mean, the sample standard deviation, the reference
    mean, the t statistic and p-value from ``scipy.stats.ttest_1samp``, and
    Cohen's d computed as ``|mean - reference| / sd``.

    Args:
        df: pandas DataFrame holding the data.
        varlist: iterable of column labels in ``df`` to analyse.

    Returns:
        None.  All results are printed.
    """
    # Local import keeps the helper usable even when the module does not
    # import scipy at top level.
    from scipy import stats

    try:
        # Iterate the ``varlist`` argument (the original looped over an
        # undefined name, so every call fell into the error branch).
        for col in varlist:
            values = df.loc[:, col]
            d_mean = values.mean()
            d_theory_mean = (values.max() + values.min()) / 2
            t, p = stats.ttest_1samp(values, d_theory_mean)
            sd = values.std()
            # A constant column has sd == 0; report NaN rather than divide by zero.
            cohen_d = abs(d_mean - d_theory_mean) / sd if sd else float('nan')
            print('題目名稱:{}'.format(values.name))
            print('均值為{0:.3f},標(biāo)準(zhǔn)差為:{4:.3f}髓介,理論均值為{1:.3f}惕鼓,均值與理論中值的差異檢驗(yàn),t={2:.3f},p={3:.3f}'
                  .format(d_mean, d_theory_mean, t, p, sd))
            print('效應(yīng)量為{:.3f}'.format(cohen_d))
    except Exception as err:
        print('參數(shù)輸入錯(cuò)誤唐础,請(qǐng)重新輸入箱歧。')
        # Show the underlying cause so real bugs are not hidden behind the
        # generic message.
        print('{}: {}'.format(type(err).__name__, err))
2. 方差分析
def anova(df, classes, varia):
    """Run a one-way ANOVA with a Tukey HSD post-hoc comparison and print the results.

    Rows with a missing value in either column are dropped first.  For each
    level of the grouping column the function prints the group size, mean
    and sample standard deviation, then the ANOVA table from
    ``statsmodels`` and the Tukey HSD summary (alpha = 0.05).

    Args:
        df: pandas DataFrame holding the data.
        classes: column key (as accepted by ``df[...]``) of the categorical
            grouping variable.
        varia: column key (as accepted by ``df[...]``) of the continuous
            dependent variable.

    Returns:
        None.  All results are printed.

    Raises:
        ImportError: if statsmodels is not installed.
    """
    # Imported outside the try block so that a missing statsmodels install
    # surfaces as ImportError instead of a misleading "bad parameter" message.
    from statsmodels.formula.api import ols
    from statsmodels.stats.anova import anova_lm
    from statsmodels.stats.multicomp import pairwise_tukeyhsd

    try:
        # Work on a two-column frame with fixed names so the model formula is
        # valid whatever the original column labels are.
        data = df[[classes, varia]].dropna()
        data.columns = ['fenlei', 'lianxu']
        for level in data['fenlei'].unique():
            scores = data.loc[data['fenlei'] == level, 'lianxu']
            # The third figure is a standard deviation (sample SD, ddof=1);
            # the label now says so instead of "variance".
            print('組別為“{0}”,n={3}一膨,該組均值為{1:.3f}呀邢,該組標準差為{2:.3f}'.format(
                level, scores.mean(), scores.std(), len(scores)))
        data_anova = anova_lm(ols('lianxu~C(fenlei)', data).fit())
        print('方差分析結(jié)果如下:')
        print(data_anova)
        tukey = pairwise_tukeyhsd(data['lianxu'], data['fenlei'], alpha=0.05)
        print('事后比較結(jié)果如下:')
        print(tukey)
    except Exception as err:
        print('參數(shù)輸入錯(cuò)誤,請(qǐng)重新輸入豹绪。')
        # Show the underlying cause so real bugs are not hidden behind the
        # generic message.
        print('{}: {}'.format(type(err).__name__, err))
3. 獨立樣本t檢驗
def ind_ttest(df, cate, iv):
    """Run an independent-samples t-test between the two levels of a grouping column.

    Rows with a missing value in either column are ignored.  When the
    grouping column has exactly two levels, the function prints Levene's
    test for equal variances, each group's n / mean / sample standard
    deviation, the Student t-test result with the absolute mean difference,
    and Cohen's d (mean difference divided by the root mean of the two
    group variances).  Otherwise it prints a message and returns.

    Args:
        df: pandas DataFrame holding the data.
        cate: column key of the grouping variable.
        iv: column key of the continuous outcome variable.

    Returns:
        None.  All results are printed.
    """
    from scipy import stats

    groups = df[cate]
    values = df[iv]
    # Keep complete cases only, so a NaN group label is not counted as a
    # third category and NaN outcomes do not turn the statistics into NaN.
    complete = groups.notna() & values.notna()
    groups = groups[complete]
    values = values[complete]

    categories = groups.unique()
    if len(categories) != 2:
        print('分類變量不是二分類输钩,無(wú)法執(zhí)行獨(dú)立樣本t檢驗(yàn)。')
        return

    x1, x2 = categories
    class_1 = values[groups == x1]
    class_2 = values[groups == x2]
    stat, p_lev = stats.levene(class_1, class_2)
    # NOTE: Levene's result is reported only; the t-test below always assumes
    # equal variances (scipy's default), as in the original implementation.
    t, p = stats.ttest_ind(class_1, class_2)

    # Keep full precision for the computations; rounding happens only when
    # formatting.  Series.std() is the sample SD (ddof=1), consistent with
    # the t-test itself.
    mean_1, mean_2 = class_1.mean(), class_2.mean()
    std_1, std_2 = class_1.std(), class_2.std()
    DV = abs(mean_1 - mean_2)
    co_std = ((std_1 ** 2 + std_2 ** 2) / 2) ** 0.5
    # Two constant groups give a zero pooled SD; report NaN instead of dividing by zero.
    cohen_d = DV / co_std if co_std else float('nan')

    print('方差齊性檢驗(yàn)的統(tǒng)計(jì)量為{0:.3f}森篷,p = {1:.2f}'.format(stat, p_lev))
    print('類別為“{0}”的個(gè)案數(shù)n = {1}, 平均數(shù)M = {2:.3f}, 標(biāo)準(zhǔn)差std = {3:.3f}.'.format(x1, len(class_1), mean_1, std_1))
    print('類別為“{0}”的個(gè)案數(shù)n = {1}, 平均數(shù)M = {2:.3f}, 標(biāo)準(zhǔn)差std = {3:.3f}.'.format(x2, len(class_2), mean_2, std_2))
    print('獨(dú)立樣本t檢驗(yàn)的結(jié)果為 t = {0:.3f}, p = {1:.3f}, 兩組平均值的差值 = {2:.3f}'.format(t, p, DV))
    print('效應(yīng)量 d = {0:.3f}'.format(cohen_d))
4. 卡方檢驗
def chi(df, v1, v2):
    """Run a chi-square test of independence between two categorical columns.

    A contingency table of the two columns is built from the rows where
    both values are present, then passed to
    ``scipy.stats.chi2_contingency`` without Yates' continuity correction.
    The chi-square statistic, p-value and degrees of freedom are printed.

    Args:
        df: pandas DataFrame holding the data.
        v1: column key of the first categorical variable.
        v2: column key of the second categorical variable.

    Returns:
        None.  All results are printed.
    """
    import pandas as pd
    from scipy.stats import chi2_contingency

    try:
        first = df[v1]
        second = df[v2]
        # Drop incomplete pairs explicitly: a NaN level would otherwise add
        # an all-zero row/column and make the expected frequencies invalid.
        complete = first.notna() & second.notna()
        # crosstab counts every (v1, v2) combination in a single pass.
        table = pd.crosstab(first[complete], second[complete])
        chi2, p, dof, _expected = chi2_contingency(table, correction=False)
        print('卡方計(jì)算結(jié)果為 = {0:.3f}, p = {1:.3f}'.format(chi2, p))
        print('自由度為 {}'.format(dof))
    except Exception as err:
        print('參數(shù)輸入有誤,請(qǐng)重新寫入钓辆。')
        # Show the underlying cause so real bugs are not hidden behind the
        # generic message.
        print('{}: {}'.format(type(err).__name__, err))
5. 前后測單項李克特五點計分題目比較
def m_ttest_percent_compare(df, before, after):
    """Compare a pre-test and a post-test item and print a summary.

    Prints both column names, both means (rounded to 3 decimals), the
    absolute mean difference, the change as a percentage of the pre-test
    mean, the paired-samples t-test from ``scipy.stats.ttest_rel``, and the
    frequency and percentage of each distinct score in each column.

    Args:
        df: pandas DataFrame holding the data.
        before: column label of the pre-test scores.
        after: column label of the post-test scores.

    Returns:
        None.  All results are printed.
    """
    # Local import keeps the helper usable even when the module does not
    # import scipy at top level.
    from scipy import stats

    try:
        before_score = df.loc[:, before]
        after_score = df.loc[:, after]
        print('輸入的前測(cè)變量名為:{}'.format(before_score.name))
        print('輸入的后測(cè)變量名為:{}'.format(after_score.name))

        # Compute on unrounded means; rounding is applied only for display so
        # it cannot distort the difference or the percentage.
        mean_b = float(before_score.mean())
        mean_a = float(after_score.mean())
        diff = abs(mean_a - mean_b)
        # A zero baseline makes the relative change undefined; report NaN.
        percent = round(diff / mean_b * 100, 3) if mean_b else float('nan')
        t, p = stats.ttest_rel(before_score, after_score)

        print('前測(cè)平均分為:{}'.format(round(mean_b, 3)))
        print('后測(cè)平均分為:{}'.format(round(mean_a, 3)))
        print('前后測(cè)分?jǐn)?shù)差:{:.3f}'.format(diff))
        print('前后測(cè)分?jǐn)?shù)變化比率(變化絕對(duì)值除以前測(cè)分?jǐn)?shù)):{}%'.format(percent))
        print('前后測(cè)相關(guān)樣本t檢驗(yàn)的結(jié)果,t={0:.3f},p={1:.3f}'.format(t, p))

        print('前測(cè)得分統(tǒng)計(jì):')
        n_before = len(before_score)
        for score, count in before_score.value_counts().items():
            print('分類名稱:{0}; 總數(shù):{1}; 占比:{2:.3f}%'.format(score, count, count / n_before * 100))

        print('后測(cè)得分統(tǒng)計(jì):')
        n_after = len(after_score)
        for score, count in after_score.value_counts().items():
            print('得分:{0}; 總數(shù):{1}; 占比:{2:.3f}%'.format(score, count, count / n_after * 100))
    except Exception as err:
        print('輸入的參數(shù)不正確,請(qǐng)重新輸入前联。')
        # Show the underlying cause so real bugs are not hidden behind the
        # generic message.
        print('{}: {}'.format(type(err).__name__, err))
目前本人常用到的工具就是這些,后續將繼續完善參數與非參數檢驗的常見統計分析方法的python實現。