用多基因序列構建進化樹,要先將每個基因序列進行比對吆你,拼接后構樹。比對可以用muscle或mafft符喝,當數(shù)據(jù)量大時muscle就會因內(nèi)存不足而報錯谤祖,這時只能用mafft婿滓。怎么進行序列拼接呢?自己寫程序吧粥喜。
# 讀序列文件的函數(shù)
#' build sequence database with raw fasta files
#'
#' @param dat_path
#' give the raw file path
#' @param type dna or prot or other types
#' @return
#' @export
#'
#' @examples
db_build <- function(dat_path, type) {
require(tidyverse)
dat <- readLines(dat_path)
indx <- dat %>% str_detect(">") %>% c(1:length(dat))[.]
gen=dat[indx] %>% str_remove_all("^>") %>% str_remove_all("-RA.*$")
seqs=rep(NA, length(indx))
for (i in 1:(length(indx)-1)) {
seqs[i] <- dat[(indx[i]+1):(indx[i+1]-1)] %>% paste0(collapse = "")
}
seqs[length(indx)] <- dat[(indx[length(indx)]+1):length(dat)] %>% paste0(collapse = "")
attr(seqs, "type") <- type
db <- list(gen=gen, seq=seqs)
return(db)
}
讀序列
fil_nam_f=list.files("D:/Single_Copy_Orthologue_Sequences/",full.names = T)
fxj=lapply(fil_nam_f, function(x){db_build(x,"prot")})#蛋白序列
#轉(zhuǎn)成一個df凸主,每列一個基因
fxj_df=data.frame(gen=fxj[[1]]$gen,seq1=fxj[[1]]$seq)
for (i in 2:length(fxj)) {
fxj_df[,i+1] =fxj[[i]]$seq
}
#把每列中的序列粘成一條
fxj_df2=fxj_df[,-1]
temp_fun=function(vec){
paste0(vec,collapse = "")
}
#再轉(zhuǎn)成fasta
temp_fun=function(vec){
paste0(vec,collapse = "")
}
fxj_df=data.frame(gen=fxj_df$gen,seq=apply(fxj_df[,-1],1,FUN=temp_fun))
fxj_seq <- c()
for (i in 1:nrow(fxj_df)) {
hit_seq_temp <- c(paste0(">", fxj_df$gen[i]), fxj_df$seq[i])
fxj_seq <- c(fxj_seq, hit_seq_temp)
}
write(fxj_seq, file = "fxj_single_cop_seq.fa")