GEO數(shù)據(jù)庫 mRNA芯片分析全流程
我們演示的數(shù)據(jù)集是GSE19136,樣本可分為三組(對照組,支架組驼鞭,紫杉醇組)畦幢,具體實驗設(shè)計情況請查看鏈接。本篇文章主要對對照組和支架組 做差異分析判没。分析流程主要包括基于limma包的差異分析、pheatmap包的熱圖和ggplot2的火山圖(包括對基因的批量標(biāo)記)。
1. 差異分析
- 數(shù)據(jù)下載
setwd("D:\\R\\wjh\\GSE19136")
library(GEOquery) #用GEOquery包獲取GEO數(shù)據(jù)
eSet<-getGEO('GSE19136',destdir='./',getGPL=F) #下載數(shù)據(jù)扶踊,構(gòu)建eSet對象
eset<-exprs(eSet[[1]])[,c(1,2,4,5,7,8,10,11)] #提取基因表達(dá)矩陣
metadata<-pData(eSet[[1]]) #可以查看芯片的描述性信息,為后面分組做準(zhǔn)備
x<-metadata$title[c(1,2,4,5,7,8,10,11)] #查看芯片的標(biāo)題郎任,可以選擇用于分組的字段
group_list<-factor(unlist(lapply(x,function(x) strsplit(as.character(x),"_")[[1]][5]))) #我們發(fā)現(xiàn)取第5個字段作為分組信息最好
- ID轉(zhuǎn)換
gpl <- getGEO('GPL570', destdir=".") #下載并獲取GSE19136對應(yīng)平臺的注釋信息
anno1<-Table(gpl)[,c(1,11)] #提取探針I(yè)D秧耗,gene symbol
eset<-as.data.frame(eset) #矩陣轉(zhuǎn)換成數(shù)據(jù)框,便于后面添加非數(shù)值型的列
eset$ID <- rownames(eset) #添加一列(基因名)
merg<-merge(eset,anno1,by="ID") #合并表達(dá)矩陣和基因名舶治,通過探針號索引
y<-merg$`Gene Symbol`
gene<-unlist(lapply(y,function(y) strsplit(as.character(y)," /// ")[[1]][1]))
merg$gene <- gene #一個探針對應(yīng)多個基因的時候分井,取第一個基因
aggr<-aggregate(merg[,2:9],by=list(merg$gene),mean) #多個探針對應(yīng)一個基因的時候,取平均值
- 差異分析
eset <- aggr[,2:9]
rownames(eset)<-aggr[,1] #得到一個行名為unique的gene symbol矩陣
design<-model.matrix(~-1+group_list) #構(gòu)建實驗設(shè)計矩陣
library(limma)
contrast.matrix<-makeContrasts(contrasts = "group_listbare-group_listcontrol", levels = design) #構(gòu)建對比模型霉猛,比較兩個實驗條件下表達(dá)數(shù)據(jù)尺锚,根據(jù)上面的group_list修改bare和control
fit <- lmFit(log2(eset),design) #線性模型擬合
fit1 <- contrasts.fit(fit, contrast.matrix) #根據(jù)對比模型進(jìn)行差值計算
fit2 <- eBayes(fit1) #貝葉斯檢驗
tempOutput = topTable(fit2, coef="group_listbare-group_listcontrol", n=nrow(fit2),lfc=log2(1)) #生成所有基因的檢驗結(jié)果報表
dif<-tempOutput[tempOutput[,"P.Value"]<0.05,] #根據(jù)P值篩選全部差異表達(dá)基因(圖)
dim(dif)
head(dif)
write.csv(dif,file = "dif.csv",quote = F)
2. 畫熱圖
rt <- read.table("dif.csv",header = T,sep = ",",stringsAsFactors = F)
heat<-eset[rownames(eset) %in% c(head(rownames(subset(dif,dif$logFC>0)),15),head(rownames(subset(dif,dif$logFC<0)),15)),] #對前15個上調(diào)基因和下調(diào)基因做熱圖
library(pheatmap)
x <- t(scale(t(heat))) #事實證明用這個做歸一化,效果最好
pheatmap(x)
3. 火山圖
##################### 火山圖 #################################
tempOutput1 = topTable(fit2, coef="group_listbare-group_listcontrol", n=nrow(fit2),lfc=log2(1)) #生成所有基因的檢驗結(jié)果報表
data<-tempOutput1[tempOutput1[,"P.Value"]<=1,]
data$gene<-rownames(data)
data$sig[data$P.Value>=0.05 | abs(data$logFC) < 1] <- "black"
data$sig[data$P.Value<0.05 & data$logFC >= 1] <- "red"
data$sig[data$P.Value<0.05 & data$logFC <= -1] <- "green"
library(ggplot2)
library(ggrepel)
library(dplyr)
ggplot(data=data, aes(x=logFC, y =-log10(P.Value),color =sig)) +
geom_point() +theme_set(theme_bw())+theme(panel.grid=element_blank(),strip.text = element_blank())+
scale_color_manual(values = c("black","green","red"))+
geom_hline(yintercept = -log10(0.05),lty=4,lwd=0.6,alpha=0.8)+
geom_vline(xintercept = c(log2(2),-log2(2)),lty=4,lwd=0.6,alpha=0.8)
4. 基因標(biāo)記的火山圖
input <- data
Gene<-as.data.frame(row.names(heat))
volc = ggplot(input, aes(logFC, -log10(P.Value))) +
geom_point(aes(col=sig)) + scale_color_manual(values=c("black","green", "red")) +
ggtitle("Your title here")+geom_point(data=input[input$gene %in% Gene[,1],], aes(logFC, -log10(P.Value)), colour="blue", size=2) +
geom_hline(yintercept = -log10(0.05),lty=4,lwd=0.6,alpha=0.8)+
geom_vline(xintercept = c(log2(2),-log2(2)),lty=4,lwd=0.6,alpha=0.8)
volc+geom_text_repel(data=input[input$gene %in% Gene[,1],], aes(label=gene))
5. GO和KEGG
library(clusterProfiler)
library(pathview)
rt <- read.table("dif.csv",header = T,sep = ",",stringsAsFactors = F)
colnames(rt)[1] <- "SYMBOL"
GeneSymbol <- rt$SYMBOL
gene.symbol.eg<- id2eg(ids=GeneSymbol, category='SYMBOL', org='Hs',na.rm = F) #如果是小鼠惜浅,就是Mm
merg<-merge(gene.symbol.eg,rt,by="SYMBOL")
mer <- subset(merg,merg$ENTREZID != "NA")
geneFC <- mer$logFC
gene <- mer$ENTREZID
names(geneFC) <- gene
kkd <- enrichGO(gene = gene,"org.Hs.eg.db",ont = "BP",qvalueCutoff = 0.05) #如果是小鼠瘫辩,就是Mm
write.table(kkd@result,file = "GO.xls",sep = "\t",quote = F,row.names = F)
barplot(kkd,drop=T,showCategory = 12)
cnetplot(kkd,categorySize = "geneNum",foldChange = geneFC)
kk <- enrichKEGG(gene = gene,organism = "human",qvalueCutoff = 0.05) #如果是小鼠,就是mouse
write.table(kk@result,file = "KEGG.xls",sep = "\t",quote = F,row.names = F)
barplot(kk,drop=T,showCategory = 12)
cnetplot(kk,categorySize = "geneNum",foldChange = geneFC)
setwd("./pathview") #新建pathview文件夾坛悉,存放KEGG關(guān)系圖
keggxls<-read.table("KEGG.xls",sep = "\t",header = T)
keggxls<-subset(keggxls,keggxls$p.adjust<0.05)
for (i in keggxls$ID){
pv.out <- pathview(gene.data = geneFC, pathway.id = i, species = "hsa", out.suffix = "pathview") #如果是小鼠mmu杭朱,看GSE70410
}
6. PPI
ppi<-read.table("string_interactions0.5.tsv",header = T,sep = "\t",stringsAsFactors = F)
ppi_attr1 <- rt[rt$SYMBOL %in% unique(c(ppi[,1],ppi[,2])),] # rt即是dif
ppi_attr <- merge(ppi_attr1,as.data.frame(table(c(ppi[,1],ppi[,2]))),by.x="SYMBOL",by.y="Var1")
write.csv(ppi_attr,file = "ppi_attrabute.csv",quote = F,row.names = F)
write.table(ppi_attr,file = "ppi_attrabute.txt",quote = F,sep = "\t",row.names = F)