DEseq預(yù)熱
主要就是這幾個(gè)步驟了蛛株。
#準(zhǔn)備count tables
cnts <- matrix(rnbinom(n=1000, mu=100, size=1/0.5),ncol = 10) #構(gòu)建滿足隨機(jī)負(fù)二項(xiàng)分布的一組數(shù)
cond <- factor(rep(1:2,each=5))
#創(chuàng)建DEseqDataSet
dds2 <- DESeqDataSetFromMatrix(cnts,DataFrame(cond), ~cond)
#標(biāo)準(zhǔn)分析
dds2 <- DESeq(dds2)
res <- results(dds2)
##另一種檢驗(yàn)积糯,LRT,似然率檢驗(yàn)
#ddsLRT <- DESeq(dds2,test="LRT",reduced=~1)
#resLRT <- results(ddsLRT)
#moderate lfc
resultsNames(dds2)
resLFC <- lfcShrink(dds2,coef = 2,type = "apeglm")
#提取結(jié)果
summary()、subset()
1. counts: 查看每個(gè)樣本在每個(gè)基因上的counts數(shù)
library(DESeq2)
> dds <- makeExampleDESeqDataSet(m=4)
> head(counts(dds))
sample1 sample2 sample3 sample4
gene1 0 0 2 1
gene2 14 56 50 47
gene3 5 3 3 8
gene4 70 105 116 125
gene5 5 2 2 12
gene6 153 70 70 53
> dds1 <- estimateSizeFactors(dds)
> head(counts(dds1,normalized=TRUE)) #通過估計(jì)size對(duì)counts數(shù)進(jìn)行標(biāo)準(zhǔn)化
sample1 sample2 sample3 sample4
gene1 0.000000 0.000000 1.883002 0.9839948
gene2 14.041418 53.289007 47.075041 46.2477573
gene3 5.014792 2.854768 2.824502 7.8719587
gene4 70.207091 99.916889 109.214095 122.9993545
gene5 5.014792 1.903179 1.883002 11.8079380
gene6 153.452641 66.611259 65.905057 52.1517263
2. 構(gòu)建DESeqDataSet
> #from matrix:從數(shù)值矩陣得到,比如用featurecount統(tǒng)計(jì)得到的數(shù)值矩陣
> a <- as.matrix(read.table("6sample_count_matrix.txt",sep="\t",header = T,row.names=1))
> head(a)
WT1 WT2 WT3 acc1.51 acc1.52 acc1.53
AT1G01010 226 183 247 514 353 383
AT1G01020 386 364 453 369 483 443
AT1G01030 77 65 87 52 62 74
AT1G01040 1322 1258 1621 1333 1526 1641
AT1G01050 1531 1518 2029 1573 1888 1987
AT1G01060 124 151 823 146 197 297
> b <- read.table("coldata.txt",header = T,row.names = 1)
> b
condition
WT1 WT
WT2 WT
WT3 WT
acc1.51 acc1.5
acc1.52 acc1.5
acc1.53 acc1.5
> all(rownames(b) == colnames(a)) #make sure that it's true
[1] TRUE
> dds3 <- DESeqDataSetFromMatrix(countData = a,
+ DataFrame(b),
+ design = ~ condition)
> #from htseq-count:由單個(gè)htseq-count結(jié)果文件合并得到
> c <- read.table("sampletable_for_htseq.txt.txt",sep = "\t", header = F)
> c
V1 V2 V3
1 WT1 SRR3286802.count2 WT
2 WT2 SRR3286803.count2 WT
3 WT3 SRR3286804.count2 WT
4 acc1.51 SRR3286805.count2 acc1.5
5 acc1.52 SRR3286806.count2 acc1.5
6 acc1.53 SRR3286807.count2 acc1.5
> colnames(c) <- c("samplename","filename","condition")
> dds4 <- DESeqDataSetFromHTSeqCount(c, directory = ".", design = ~condition)
3. run DESeq() & results()——主要分析步驟
#是否需要初步過濾一下
#keep <- rowSums(counts(dds4)) >= 20 # 對(duì)counts數(shù)進(jìn)行初步的過濾
#dds4 <- dds4[keep,]
> dds4 <- DESeq(dds4) #三步走:文庫(kù)大小估計(jì);離散程度估計(jì)蜂科;統(tǒng)計(jì)檢驗(yàn)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
> res4 <- results(dds4) #結(jié)果呈現(xiàn)
> res4
log2 fold change (MLE): condition WT vs acc1.5
Wald test p-value: condition WT vs acc1.5
DataFrame with 37884 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
AT1G01010 308.89691311743 -0.873161860576389 0.235593150198139 -3.70622770586514 0.000210369178701959 0.00230248659971188
AT1G01020 392.486827906655 -0.014533822556904 0.140844859602549 -0.103190294611512 0.91781194256941 0.95926533475846
AT1G01030 63.2288140763626 0.399739741186851 0.255254339106486 1.56604484212152 0.117338119589006 0.275094993904629
......
4. LFC校正
resultsNames(dds4) #顯示誰(shuí)和誰(shuí)比較
res.shr <- lfcShrink(dds4,coef = 2) #2表示第2列顽决,即LFC這一列
res.shr
res.shr2 <- lfcShrink(dds4,contrast = c("condition","acc1.5","WT")) #contrast和coef不能同時(shí)存在
res.shr2
#更換校正模型
res.ape <- lfcShrink(dds4,coef = 2, type = "apeglm")
res.ash <- lfcShrink(dds4, coef = 2, type = "ashr")
res.ash2 <- lfcShrink(dds4, contrast = c("condition","acc1.5","WT"), type = "ashr")
#這一步只會(huì)對(duì)LFC的值產(chǎn)生影響,p值是沒有改變的
5. 簡(jiǎn)單作圖
plotCounts: 用以檢驗(yàn)單個(gè)基因在組間的reads數(shù)有多少
plotCounts(dds4,"AT1G01010")
plotMA: 所有樣本normalized counts的平均值和LFC的關(guān)系
從整體上來看gene的上調(diào)下調(diào)情況导匣,注意用resultsNames()看一下是誰(shuí)和誰(shuí)比較
res.shr2 <- lfcShrink(dds4,coef = 2,type = "apeglm") #校正LFC
plotMA(res.shr2,alpha=0.01,main="apeglm") #alpha=0.01才菠,定義顯著水平
6. results()——從DESeq分析中提取結(jié)果
> res4 <- results(dds4,contrast = c("condition","acc1.5","WT"),alpha = 0.001,tidy = F)
> # dds4經(jīng)過了DESeq()分析
> # contrast: 定義誰(shuí)和誰(shuí)比較
> # lfcThreshold: 定義了lfc閾值,從MA圖中可以看到紅點(diǎn)減少贡定,此外results()結(jié)果中不滿足lfc閾值的gene的p值都是1
> # alpha: 定義顯著水平
> # tidy: 是否將結(jié)果輸出為data.frame格式
> summary(res4,alpha=0.001)
out of 27432 with nonzero total read count
adjusted p-value < 0.001
LFC > 0 (up) : 1198, 4.4%
LFC < 0 (down) : 554, 2%
outliers [1] : 63, 0.23%
low counts [2] : 6254, 23%
(mean count < 7)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
> # res4經(jīng)過了results()操作
> # alpha: 校正p值的閾值赋访,如果沒有設(shè)置,會(huì)沿用results()中的alpha參數(shù)
#按照自定義的閾值提取差異基因并導(dǎo)出
#diff_gene_deseq2 <-subset(res, padj < 0.001 & abs(log2FoldChange) > 1)
#write.csv(diff_gene_deseq2,file= "DEG_acc1.5_vs_WT.csv")