isoform usage是常見的RNA seq下游分析流程求厕,常見的包也是DEXseq2。但是用它跑自己的數(shù)據(jù)一直報錯,跑示例數(shù)據(jù)是能夠work的翔始。
names(x) <- value: 'names' atrribute [9] must be the same length as the vector [1]
網(wǎng)上很多教程都是對示例數(shù)據(jù)進(jìn)行講解心俗,解決不了我的問題举畸『溃看報錯信息感覺是gff文件格式對不上俄周,看了源代碼,它的版本很久了疚顷,要求gff文件里“exon_number”為“exonic_part_number”武契、“transcript_id”為“transcripts”、第三列類型為“exonic_part”荡含。要改的地方太多了,所以我自己改了它的源代碼來跑届垫。
要求輸入文件有:
- 以exon為單元的reads count文件
- gff文件
獲得reads count文件——HTseq
# 安裝
conda install -c bioconda htseq
htseq-count -f bam -c all.exon.count.csv -n 40 --append-output -t exon -i gene_id -i exon_number \
/data/workdir/FRAL190036196.sort.bam \
/data/workdir/FRAL190036198.sort.bam \
/data/workdir/FRAL190036258.sort.bam.........\
/data/workdir/information/hg38/gencode_new.v40.gtf
# -f 輸入文件格式
# -c 輸出文件名
# -n 線程數(shù)
# --append-output 所有結(jié)果合并輸出一個結(jié)果文件释液,每列為一個樣本
# -t 識別gtf第三列的exon為計算reads count的單元
# -i 結(jié)果文件第一列為id,為gene_id:exon_number的格式
装处!非常耗時误债,建議用多點線程跑浸船。一個樣本單線程差不多一個半小時
在這我多輸出了一列g(shù)ene name,完全沒必要
分隔各列為一個文件的腳本
# samples.txt為樣本名寝蹈,每行一個樣本
sf="samples.txt"
for i in {2..469};
do
s_num=$((i-1))
s=$(sed -n "${s_num}p" "${sf}")
awk -v col="$((i+1))" '{OFS = "\t"} {print $1,$col}' all.exon.count.tsv > "split_sample_exon_count/${s}.txt"
done
接下來用R跑
# DEXseq分析
library("DEXSeq")
samples_label <- read.delim("D:/data/465_sample_group.txt")
countFiles <- list.files("D:/data/extData_exonnumber/", pattern=".txt$", full.names=TRUE)
gffFile <- list.files("D:/data/DEXseq", pattern="gff$", full.names=TRUE)
sampleTable <- data.frame(row.names=samples_label$Sample,
condition=samples_label$group)
sample_names <- samples_label$Sample
countFileNames <- sub("\\.txt$", "", basename(countFiles))
# 根據(jù)樣本名稱排序 countFiles
sorted_countFiles <- countFiles[order(match(countFileNames, sample_names))]
# 大部分都沒有動李命,只做了細(xì)微改變
modify_DEXSeqDataSetFromHTSeq = function (sorted_countFiles, sampleTable, design = ~sample + exon +
condition:exon, gffFile = NULL){
if (!all(sapply(sorted_countFiles, class) == "character")) {
stop("The countfiles parameter must be a character vector")
}
lf <- lapply(sorted_countFiles, function(x) read.table(x, header = FALSE,
stringsAsFactors = FALSE))
if (!all(sapply(lf[-1], function(x) all(x$V1 == lf[1]$V1))))
stop("Count files have differing gene ID column.")
dcounts <- sapply(lf, `[[`, "V2")
rownames(dcounts) <- gsub("_PAR_Y","",lf[[1]][, 1])
dcounts <- dcounts[substr(rownames(dcounts), 1, 1) != "_",]
rownames(dcounts) <- sub(":", ":E", rownames(dcounts))
colnames(dcounts) <- sorted_countFiles
splitted <- strsplit(rownames(dcounts), ":")
exons <- sapply(splitted, "[[", 2)
genesrle <- sapply(splitted, "[[", 1)
if (!is.null(gffFile)) {
aggregates <- read.delim(gffFile, stringsAsFactors = FALSE, header = FALSE)
colnames(aggregates) <- c("chr", "source", "class",
"start", "end", "ex", "strand", "ex2", "attr")
aggregates$strand <- gsub("\\.", "*", aggregates$strand)
aggregates <- aggregates[which(aggregates$class == "exon"),]
aggregates$attr <- gsub("\"|=|;", " ", aggregates$attr)
aggregates$gene_id <- sub(".*gene_id\\s(\\S+).*", "\\1", aggregates$attr)
transcripts <- gsub(".*transcript_id\\s(\\S+).*", "\\1", aggregates$attr)
exonids <- gsub(".*exon_number\\s(\\S+).*", "\\1", aggregates$attr)
exoninfo <- GRanges(as.character(aggregates$chr), IRanges(start = aggregates$start,
end = aggregates$end), strand = aggregates$strand)
names(exoninfo) <- paste(aggregates$gene_id, exonids, sep = ":E")
names(transcripts) <- rownames(exoninfo)
if (!all(rownames(dcounts) %in% names(exoninfo))) {
stop("Count files do not correspond to the flattened annotation file")
}
matching <- match(rownames(dcounts), names(exoninfo))
stopifnot(all(names(exoninfo[matching]) == rownames(dcounts)))
stopifnot(all(names(transcripts[matching]) == rownames(dcounts)))
dxd <- DEXSeqDataSet(dcounts, sampleTable, design, exons,
genesrle, exoninfo[matching], transcripts[matching])
return(dxd)
}
else {
dxd <- DEXSeqDataSet(dcounts, sampleTable, design, exons,
genesrle)
return(dxd)
}
}
dxd <- modify_DEXSeqDataSetFromHTSeq(
sorted_countFiles,
sampleData=sampleTable,
design= ~sample + exon + condition:exon,
flattenedfile=gffFile)
# 差異分析
dxr <- DEXSeq(dxd)
沒跑完,實在是太慢了箫老,數(shù)據(jù)量太大封字,R直接死機(jī)了