References:
1.RNA-seq(4):Hisat2+FeatureCounts+DESeq2流程+作圖坐梯!
https://pzweuj.github.io/2018/07/18/rna-seq-4.html
2.一個(gè)植物轉(zhuǎn)錄組項(xiàng)目的實(shí)戰(zhàn)
http://www.bio-info-trainee.com/2809.html
3.RNA_seq(1)植物轉(zhuǎn)錄組差異基因分析簡單練習(xí)
http://www.reibang.com/p/7146d5c41294
Part I II 基本是照著做的徽诲,腳本會(huì)有一丟丟修改,之后吃飽了再弄吧
Part III:Featurecounts —>構(gòu)建dds object 吵血,獲得expression matrix
featureCounts完成生物學(xué)定量
featureCounts是一款可以進(jìn)行生物學(xué)定量的軟件谎替,它集成在subread的package里了
需要提供gtf格式的注釋或者SAF格式的注釋;
>gff3='/public/study/mRNAseq/tair/Arabidopsis_thaliana.TAIR10.28.gff3.gz'
>gtf='/public/study/mRNAseq/tair/Arabidopsis_thaliana.TAIR10.28.gtf.gz'
>featureCounts='/trainee/home/yjxiang/tools/subread-1.6.2-source/bin/featureCounts'
>nohup$featureCounts-T 5 -p -t exon -g gene_id -a$gtf-o /trainee/home/yjxiang/practice/subread_workflow/counts_out/counts_id.txt/trainee/home/yjxiang/practice/subread_workflow/mapping_out/*.bam & #掛在后臺(tái)即便網(wǎng)絡(luò)不穩(wěn)也可執(zhí)行,在提交程序和前臺(tái)運(yùn)行之間的選擇蹋辅!
DESeq2差異基因分析
First step:獲取表達(dá)矩陣和分組信息
> library(DESeq2)
## 數(shù)據(jù)預(yù)處理
> sampleNames<-c('ERR1698194','ERR1698195','ERR1698196','ERR1698197')# 抽取前四列sample
>data <- read.table("count_id.txt", header=TRUE, quote="\t", skip=1)
>names(data)<-c('Geneid','Chr','Start','End','Strand','Length','ERR1698194','ERR1698195','ERR1698196','ERR1698197','ERR1698198','ERR1698199','ERR1698200','ERR1698201','ERR1698202','ERR1698203','ERR1698204','ERR1698205','ERR1698206','ERR1698207','ERR1698208','ERR1698209')#對(duì) 第一行重命名
# 前六列分別是Geneid Chr Start End Strand Length# 我們要的是count數(shù)秩命,所以從第七列開始
>names(data)[7:10] <- sampleNames
>countData<-as.matrix(data[7:10])#第七列開始是每個(gè)sample對(duì)應(yīng)的feature的counts,[前處sampleName命令有誤,與此提取數(shù)據(jù)不match]
#將數(shù)據(jù)轉(zhuǎn)換為矩陣格式
用featureCounts定量得到的counts_id.txt 褒傅,經(jīng)過格式處理之后得到表達(dá)矩陣:countdata:第一列是基因ID弃锐,之后的列都是樣本ID
每一行代表不同的基因在不同樣本中的表達(dá)量.
> rownames(countData)<-data$Geneid
> database <- data.frame(name=sampleNames)
#database設(shè)置分組信息
>database <- data.frame(name=sampleNames, condition=c('a','a','b','b'))
>rownames(database) <- sampleNames#database是一個(gè)二維矩陣,賦予列名
>colnames(countData)<-NULL
##Second step: 構(gòu)建dds對(duì)象
>dds <- DESeqDataSetFromMatrix(countData, colData=database, design= ~ condition)
> dds
class: DESeqDataSet?
dim: 33602 4?
metadata(1): version
assays(1): counts
rownames(33602): AT1G01010 AT1G01020 ...
? ATCG01300 ATCG01310
rowData names(0):
colnames(4): ERR1698194 ERR1698197
? ERR1698204 ERR1698207
colData names(2): name condition
> dds <- dds[ rowSums(counts(dds)) > 1, ]
## 使用DESeq函數(shù)估計(jì)離散度樊卓,然后差異分析獲得res對(duì)象
> dds<-DESeq(dds)#對(duì)原始的dds進(jìn)行標(biāo)準(zhǔn)化
>resultsNames(dds)#查看結(jié)果名稱
>res <- results(dds)#用results函數(shù)提取結(jié)果拿愧,并賦值給res變量
> summary(res) #查看結(jié)果
out of 21129 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up)? ? ? ?: 76, 0.36%
LFC < 0 (down)? ? ?: 115, 0.54%
outliers [1]? ? ? ?: 0, 0%
low counts [2]? ? ?: 3687, 17%
(mean count < 12)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
>write.csv(res, "res_des_output.csv")
>resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)),by="row.names",sort=FALSE)
>write.csv(resdata, "all_des_output.csv", row.names=FALSE)
##Third step:提取結(jié)果并繪制火山圖
Part VI: Drawing
> #提取差異基因!!!
> res <- res[order(res$padj),]
> resdata <-merge(as.data.frame(res),as.data.frame(counts(dds,normalize=TRUE)),by="row.names",sort=FALSE)
> deseq_res<-data.frame(resdata)
> up_diff_result<-subset(deseq_res,padj < 0.05 & (log2FoldChange > 1)) #提取上調(diào)差異表達(dá)基因
> down_diff_result<-subset(deseq_res,padj < 0.05 & (log2FoldChange < -1)) #提取下調(diào)差異表達(dá)基因
> write.csv(up_diff_result,"D:\\R.3.5.3\\上調(diào)_diff_results.csv") #輸出上調(diào)基因
> write.csv(down_diff_result,"D:\\R.3.5.3\\下調(diào)_diff_results.csv") #輸出下調(diào)基因
1.Valcano?火山圖 可以非常直觀且合理地篩選出在兩樣本間發(fā)生差異表達(dá)的基因
R script:
> library(ggplot2)
> # 這里的resdata也可以用res_des_output.csv這個(gè)結(jié)果重新導(dǎo)入哦杠河。
> # 現(xiàn)在就是用的前面做DESeq的時(shí)候的resdata碌尔。
> resdata$change <- as.factor(
+? ? ?ifelse(
+? ? ? ? ?resdata$padj<0.01 & abs(resdata$log2FoldChange)>1,
+? ? ? ? ?ifelse(resdata$log2FoldChange>1, "Up", "Down"),
+? ? ? ? ?"NoDiff"
+? ? ?)
+ )#確定統(tǒng)計(jì)學(xué)顯著性
> valcano <- ggplot(data=resdata, aes(x=log2FoldChange, y=-log10(padj), color=change)) +?
+? ? ?geom_point(alpha=0.8, size=1) +?
+? ? ?theme_bw(base_size=15) +?
+? ? ?theme(
+? ? ? ? ?panel.grid.minor=element_blank(),
+? ? ? ? ?panel.grid.major=element_blank()
+? ? ?) +?
+? ? ?ggtitle("DESeq2 Valcano") +?
+? ? ?scale_color_manual(name="", values=c("red", "green", "black"), limits=c("Up", "Down", "NoDiff")) +?
+? ? ?geom_vline(xintercept=c(-1, 1), lty=5, col="gray", lwd=0.5) +?
+? ? ?geom_hline(yintercept=-log10(0.01), lty=5, col="gray", lwd=0.5)
> valcano
2.PCA 主成分分析,把數(shù)據(jù)降維后進(jìn)行分析券敌,pc1和pc2是對(duì)整個(gè)數(shù)據(jù)集解釋程度貢獻(xiàn)率第一和第二的cluster唾戚,主成分。
R script:
> # library(ggplot2)
> rld <- rlog(dds)
> pcaData <- plotPCA(rld, intgroup=c("condition", "name"), returnData=T)
> percentVar <- round(100*attr(pcaData, "percentVar"))
> pca <- ggplot(pcaData, aes(PC1, PC2, color=condition, shape=name)) +?
+? ? ?geom_point(size=3) +?
+? ? ?ggtitle("DESeq2 PCA") +?
+? ? ?xlab(paste0("PC1: ", percentVar[1], "% variance")) +?
+? ? ?ylab(paste0("PC2: ", percentVar[2], "% variance"))
> pca
3.heatmap:熱圖實(shí)現(xiàn)基因表達(dá)模式可視化的需求待诅。 4個(gè)樣本的表達(dá)差異并不明顯叹坦,因?yàn)槲沂菑耐慌沃须S機(jī)抽取的無差別處理的4個(gè)samples。
Rscript:
>?library(pheatmap)
> select <- order(rowMeans(counts(dds, normalized=T)), decreasing=T)[1:1000]
> nt <- normTransform(dds)
> log2.norm.counts <- assay(nt)[select,]
> df <- as.data.frame(colData(dds)[, c("name", "condition")])
> pheatmap(log2.norm.counts, cluster_rows=T, show_rownames=F, cluster_cols=T, annotation_col=df, fontsize=6)
>