原理和公式參考:
- http://www.reibang.com/p/c7ba1aa543fe
- wikipedia: https://en.wikipedia.org/wiki/Tajima%27s_D
- https://www.google.com.hk/search?q=pvalue+for+tajima%27s+D&newwindow=1&client=safari&ei=URDsYZy_OZavoATIyJSYAg&ved=0ahUKEwjcq5ngxMX1AhWWF4gKHUgkBSMQ4dUDCA0&uact=5&oq=pvalue+for+tajima%27s+D&gs_lcp=Cgdnd3Mtd2l6EAMyBwgAEEcQsAMyBwgAEEcQsAMyBwgAEEcQsAMyBwgAEEcQsAMyBwgAEEcQsAMyBwgAEEcQsAMyBwgAEEcQsAMyBwgAEEcQsAMyBwgAEEcQsANKBAhBGABKBQhAEgExSgQIRhgAUABYAGCb0QRoAXABeACAAQCIAQCSAQCYAQDIAQnAAQE&sclient=gws-wiz
- https://zhuanlan.zhihu.com/p/52064863
1. 加載數(shù)據(jù)
##### load data
data<-read.table("mydata.genotype",header=T) #### change this
data<-data[data$Chr=="chr18"] #### select car
data1<-data[,4:dim(data)[2]]
2. 設(shè)定參數(shù)和兩兩個體的排列組合
nindiv<-dim(data1)[2]
nsnp<-dim(data1)[1]
pos_data<-data[,1:2] #### change this
pop1_num<-1:9 ###### change this (前9個是population1缤骨,后面是population2,
######這個只用于計算Fst尺借,對tajima‘s D沒有用)
pop2_num<-10:17 ##### change this
between_or_within_pop<-c()
for(i in 1:(nindiv-1)){
for(k in (i+1):nindiv){
print(paste0(i," and ",k))
if((i %in% pop1_num & k %in% pop2_num) | (i %in% pop2_num & k %in% pop1_num)){
between_or_within_pop<-c(between_or_within_pop,"between")
}else{between_or_within_pop<-c(between_or_within_pop,"within")}}
}
#### 17個個體绊起,兩兩組合類型一共136種。
#### 寫清楚哪些是群體間的燎斩,哪些是群體內(nèi)的(Fst計算要用)
#### save between pop pairs num and within pop pair num
nbetween<-sum(between_or_within_pop=="between")
nwithin<-sum(between_or_within_pop=="within")
3. 定義計算函數(shù)
##### define function
tajima_and_fst<-function(input_data){
theta_matrix<-matrix(nrow=dim(input_data)[1],ncol=choose(nindiv,2),0)
for(snp in 1:dim(input_data)[1]){
array<-input_data[snp,]
pair_count=0
for(i in 1:(nindiv-1)){
for(k in (i+1):nindiv){
pair_count=pair_count+1
if(array[i][1,1]!=array[k][1,1]){
if (array[i][1,1]=="-"|array[k][1,1]=="-"){
theta_matrix[snp,pair_count]=NA
}else{theta_matrix[snp,pair_count]=1}
}
}
}
}
theta_matrix1<-theta_matrix
theta_matrix1[is.na(theta_matrix)]=0
#### theta_pi是考慮到NA值的虱歪,使用了總位點數(shù)/非gap位點數(shù) 作為標(biāo)準(zhǔn)化常數(shù)
nromalized_factor<-dim(theta_matrix1)[1]/apply(!is.na(theta_matrix),2,sum)
nromalized_factor[nromalized_factor==Inf]<-1
a1<-sum(1/1:(dim(input_data)[1]-1))
a2<-sum(1/(1:(dim(input_data)[1]-1))**2)
c1<-b1-1/a1
c2<-b2-((nindiv+2)/(a1*nindiv))+(a2/a1**2)
e1<-c1/a1
e2<-c2/(a1**2+a2)
S<-sum(apply(theta_matrix1,1,sum)!=0)
theta_pi<-sum(apply(theta_matrix1,2,sum)*nromalized_factor)/
(combination_times-sum(apply(!is.na(theta_matrix),2,sum)==0)
)
theta_w<-S/a1
var_d<-(e1*S+e2*S*(S-1))**(1/2)
tajimasd<-(theta_pi-theta_w)/var_d
theta_pi_between<-
sum(apply(theta_matrix1[,between_or_within_pop=="between"],2,sum)*nromalized_factor)/nbetween
theta_pi_within<-
sum(apply(theta_matrix1[,between_or_within_pop=="within"],2,sum)*nromalized_factor)/nbetween
fst<-(theta_pi_between-theta_pi_within)/theta_pi_between
return(list(tajimasd,fst))
}
4. 定義掃描基因組的sliding window的參數(shù)
##### define sliding window parameters
window_size=20 #### change this
step_wise=1 #### change this
combination_times<-choose(nindiv,2)
b1<-nindiv+1/3*(nindiv-1)
b2<-(2*(nindiv**2+nindiv+3))/(9*nindiv*(nindiv-1))
total_step=(nsnp-window_size)/step_wise
tajimad_list<-c()
fst_list<-c()
window_position<-matrix(nrow=total_step,ncol=2)
5. 啟動 window sliding
##### run sliding windows
for(pace in 1:total_step){
print(pace/total_step)
start<-1+step_wise*(pace-1)
end<-1+step_wise*(pace-1)+window_size
res<-tajima_and_fst(data1[start:end,]) #### using the function
new_d<-res[[1]] ### saving result of D
new_fst<-res[[2]] ### saving result of fst
tajimad_list<-c(tajimad_list,new_d)
fst_list<-c(fst_list,new_fst)
window_position[pace,]<-c(start,end)
}
6. 儲存結(jié)果,畫圖
#### concate output result
out<-data.frame(chr=pos_data[,1][1],
snp_position_start=pos_data[,2][window_position[,1]],
snp_position_end=pos_data[,2][window_position[,2]],
relative_start=window_position[,1],
relative_end=window_position[,2],
tajimasd=(tajimad_list-mean(tajimad_list))/sd(tajimad_list), #### 這里標(biāo)準(zhǔn)化栅表、中心化了
fst=(fst_list-mean(fst_list))/sd(fst_list)) #### 這里標(biāo)準(zhǔn)化笋鄙、中心化了
write.table(out,"chr18.tajimasd.fst.txt",sep="\t",quote = F,row.names = F)
head(out)
7. 畫圖
library(ggplot2)
ggplot(data=out)+
geom_point(aes(x=(snp_position_start+snp_position_end)/2,y=fst,color="fst"))
ggplot(data=out)+
geom_point(aes(x=(snp_position_start+snp_position_end)/2,y=tajimasd,color="tajimasd2"))
暫時還沒搞懂怎么計算P值...好像要序列模擬,比較麻煩怪瓶,后面再思考萧落。或者套個正態(tài)分布看離群值?但這樣有點武斷找岖,而且沒有假設(shè)檢驗陨倡。
Tajima原文中Tajimas' D和計算機序列模擬的比對結(jié)果。因此絕對值大于2應(yīng)該睡rule of thumb许布,應(yīng)該就算顯著兴革。