GEO數(shù)據(jù)的獲取(全文用的是GSE17708這個數(shù)據(jù))
這里面是芯片數(shù)據(jù)的處理
library(GEOquery)
gset <- getGEO("GSE17708",destdir = ".",AnnotGPL=F,getGPL=F)
save(gset,file = "./Rdata/GSE17708_gset.Rdata")
讀取下載好的文件
rm(list=ls())
load(file="./Rdata/GSE17708_gset.Rdata")
exprSet <- exprs(gset[[1]])#我們需要的測序信息
phe <- pData(gset[[1]])#描述樣本的信息
生成group_list以便后面使用
library(stringr)
group_list <- str_split(as.character(phe$title)," ",simplify = T)[,11]#這個地方之所以能用11可是自己數(shù)出來確定的哦魂那,數(shù)值不固定,這點要注意哦
group_list <- paste0("group_",group_list,"h")#這個group_list在后文會用到哦琅翻,暫時先放在這里,注意不要自行添加空格隐锭,不然會報錯渐扮,用短下劃線最保險
保存重要的賦值數(shù)據(jù)生成一個Rdata:
save(exprSet,group_list,phe,
file = "./Rdata/GSE17708_exprSet.Rdata")
重新加載Rdata:
rm(list = ls())
load("./Rdata/GSE17708_exprSet.Rdata")
ID轉換大全
#查看所用的芯片平臺
phe$platform_id
#查看到是GPL570论悴,然后 → http://www.reibang.com/p/f6906ba703a0 網(wǎng)站上查找所對應的R包,那就安裝“hgu133plus2.db”這個包
#install.packages("hgu133plus2.db")
library(hgu133plus2.db)
ls("package:hgu133plus2.db") # 這里可以窮舉R包里面的項目信息
ids <- toTable(hgu133plus2SYMBOL)#將包中的數(shù)據(jù)轉換為數(shù)據(jù)框儲存在eg2probe
#查看信息,進行核對:
length(unique(ids$symbol))#查看包含了多少種不同的基因
tail(sort(table(ids$symbol)))
table(sort(table(ids$symbol)))
table(table(ids$symbol));plot(table(table(ids$symbol)))#匯總查看一下
ids[ids$symbol=="TRAF4",]#查看一下symbol等于TRAF4這個基因的有哪些行~
head(exprSet)#查看一下表達矩陣
#重點來啦墓律,先分析一波膀估,有個數(shù)
table(rownames(exprSet)%in% ids$probe_id)#查看到exprSet中有41922能夠和R包提供的ids匹配上(無序匹配)
dim(exprSet)#但是exprSet中有54675個id,說明還有12753個匹配不上
#所以去除那些匹配不上的是必然了:
exprSet <- exprSet[rownames(exprSet)%in% ids$probe_id,]
#去除無法使用的那些行耻讽!它是在exprSet的行名稱里面判斷TURE FALSE
dim(exprSet)#可以發(fā)現(xiàn)減到了41922
##下面的這些代碼可以用id_transfr()函數(shù)來代替
#在這之后需要知道有exprSet中有多行是對應同一個基因的察纯,所以就需要排除那些重復的行
temp <- by(exprSet,
ids$symbol,
function(x) rownames(x)[which.max(rowMeans(x))])
useful_probes <- as.character(temp)#得到行名(就是有用的probe_ids)
exprSet <- exprSet[rownames(exprSet) %in% useful_probes ,]
#再次根據(jù)有用的probe_ids減除重復的不必的probes_ids行,可以看到,從41922減少到了20174,減掉了21748U敕省饼记!很多了!慰枕!
rm(temp); rm(useful_probes)
rownames(exprSet) <- ids[match(rownames(exprSet),ids$probe_id),2]#然后這一步是真正地把rownames(exprSet)改成基因名稱了具则,用到的是match函數(shù),相當于excel表格中的vlookup具帮,返回的是相匹配的行名
head(ids); head(exprSet)#可以看到是對應起來的
new_exprSet <- exprSet
#總的來講博肋,可以用這樣一個id_transfr函數(shù)來表示:
if(F){
source("./Functions/Function_id_transfr.R")#加載這個函數(shù)
new_exprSet <- id_transfr(exprSet,ids)
}
保存以及加載Rdata
save(new_exprSet,group_list,ids,phe,
file='./Rdata/GSE17708_new_exprSet.Rdata')
加載Rdata
rm(list=ls())
load(file='./Rdata/GSE17708_new_exprSet.Rdata')
exprSet <- new_exprSet
rm(new_exprSet)
了解你的表達矩陣
##第一個檢驗
exprSet["GAPDH",]#檢查內(nèi)參表達量
exprSet["ACTB",]#檢查內(nèi)參表達量
boxplot(exprSet[,1:26])
## 第二個檢驗:ggplot2繪圖檢驗
library(reshape2)
exprSet_melt <- melt(exprSet)#就是將每個單元格提取出來了,注意低斋,它是一列一列melt的~
colnames(exprSet_melt) <- c("gene","sample","value")
exprSet_melt$group <- rep(group_list,each =nrow(exprSet))#添加了each = 表示每個值先重復那么多遍,然后再重復下一個值匪凡,然后再下一個……
library(ggplot2)
#箱式圖
p=ggplot(exprSet_melt,aes(x=sample,y=value,fill=group))+geom_boxplot()
print(p)
#x小提琴圖
p=ggplot(exprSet_melt,aes(x=sample,y=value,fill=group))+geom_violin()
print(p)
#包裝圖
p=ggplot(exprSet_melt,aes(value,fill=group))+geom_histogram(bins = 200)+facet_wrap(~sample, nrow = 4)
print(p)
p=ggplot(exprSet_melt,aes(value,col=group))+geom_density()+facet_wrap(~sample, nrow = 4)
print(p)
#密度圖
p=ggplot(exprSet_melt,aes(value,col=group))+geom_density()
print(p)
#圖形的修飾
p=p+stat_summary(fun.y="mean",geom="point",shape=23,size=3,fill="red")
p=p+theme_set(theme_set(theme_bw(base_size=20)))
p=p+theme(text=element_text(face='bold'),axis.text.x=element_text(angle=30,hjust=1),axis.title=element_blank())
print(p)
###第三個檢驗
# hclust
exprSet_clust <- exprSet
colnames(exprSet_clust) <- paste(group_list,1:26)#先處理一下列名
colnames(exprSet_clust)#看看效果
plot(hclust(dist(t(exprSet_clust))))
#PCA圖
# BiocManager::install('ggfortify')
library(ggfortify)
df=as.data.frame(t(exprSet_clust))
df$group=group_list
autoplot(prcomp( df[,1:(ncol(df)-1)] ), data=df,colour = 'group')
加載Rdata
rm(list=ls())
load(file='./Rdata/GSE17708_new_exprSet.Rdata')
exprSet <- new_exprSet
rm(new_exprSet)
差異分析
dim(exprSet)
group_list
#從下面兩行可以看出膊畴,limma包只能兩兩分組比較,無法進行批量操作
exprSet_0_72 <- exprSet[,c(1:3,24:26)]
group_list_0_72 <- group_list[c(1:3,24:26)]
suppressMessages(library(limma)) #加載limma包
design_0_72 <- model.matrix(~0+factor(group_list_0_72))
colnames(design_0_72) <- levels(factor(group_list_0_72))
rownames(design_0_72) <- colnames(exprSet_0_72)
table(design_0_72)
#作比較矩陣
cntrst.mtrix_0_72<-makeContrasts(paste0(unique(group_list_0_72),collapse = "-"),
levels = design_0_72)
cntrst.mtrix_0_72 <- makeContrasts("group_72h-group_h",
levels = design_0_72)
cntrst.mtrix_0_72
##這個矩陣聲明病游,我們要把72h處理組和未處理的細胞系進行差異分析比較
#這之后可以用一個函數(shù)來代替唇跨,簡化流程
##step1
fit <- lmFit(exprSet_0_72,design_0_72)
##step2
fit2 <- contrasts.fit(fit, cntrst.mtrix_0_72) ##這一步很重要,大家可以自行看看效果
fit2 <- eBayes(fit2) ## default no trend !!!
##eBayes() with trend=TRUE
##step3
tempOutput <- topTable(fit2, coef=1, n=Inf)
nrDEG <- na.omit(tempOutput)
#write.csv(nrDEG2,"limma_notrend.results.csv",quote = F)
head(nrDEG)
#以上可以寫一個function來代替它:
if(F){
source("./Functions/Function_diff_genes.R")#加載這個函數(shù)
DEG <- diff_genes(exprSet_0_72,design_0_72,cntrst.mtrix_0_72)
}
head(DEG)#這個DEG就是我們想要的了,可以查看一下
#劃定何為上調衬衬,何為下調基因:
logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs( logFC)) )
# logFC_cutoff=1
DEG$change <- as.factor(ifelse(DEG$P.Value < 0.05 & abs(DEG$logFC) > logFC_cutoff,
ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT'))
#存儲為csv格式买猖,方便用excel打開或者傳送給老板
作圖觀察
## heatmap
library(pheatmap)
choose_gene=head(rownames(nrDEG),50) ## 50 maybe better
choose_matrix=exprSet[choose_gene,]
choose_matrix=t(scale(t(choose_matrix)))
pheatmap(choose_matrix,filename = 'DEG_top50_heatmap.png')
## volcano plot
library(ggplot2)
colnames(nrDEG)
plot(nrDEG$logFC,-log10(nrDEG$P.Value))
DEG <- nrDEG
logFC_cutoff <- with(DEG,mean(abs( logFC)) + 2*sd(abs( logFC)) )
# logFC_cutoff=1
the_title <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
'\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,
'\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',])
)
g = ggplot(data=DEG,
aes(x=logFC, y=-log10(P.Value),
color=change)) +
geom_point(alpha=0.4, size=1.75) +
theme_set(theme_set(theme_bw(base_size=20)))+
xlab("log2 fold change") + ylab("-log10 p-value") +
ggtitle( the_title ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
scale_colour_manual(values = c('blue','black','red')) ## corresponding to the levels(res$change)
print(g)
ggsave(g,filename = 'volcano.png')
存儲好這次差異分析的數(shù)據(jù)
save(exprSet,group_list,DEG,ids,phe,
file='./Rdata/GSE17708_DEG.Rdata')
讀取數(shù)據(jù)
rm(list=ls())
load('./Rdata/GSE17708_DEG.Rdata')
富集分析
source('./Functions/Function_kegg_plot.R')
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
Sybl_to_Entrz <- bitr(rownames(DEG), fromType = "SYMBOL",
toType = c( "ENTREZID"),
OrgDb = org.Hs.eg.db)#這里面用到的函數(shù)我估計是bio-transfer函數(shù),專門用于id轉換的佣耐,這里面的“OrgDb”是指Organism database
head(Sybl_to_Entrz); head(DEG)
DEG$SYMBOL <- rownames(DEG)
merged_DEG <- merge(DEG,Sybl_to_Entrz,by='SYMBOL')#這里面是將DEG和Sybl_to_Entrz按照SYMBOL這一列進行匹配+合并政勃,估計最終的排序是和DEG一致的
head(merged_DEG)
gene_up <- merged_DEG[merged_DEG$change == 'UP','ENTREZID']
gene_down <- merged_DEG[merged_DEG$change == 'DOWN','ENTREZID']
gene_diff <- c(gene_up,gene_down)
gene_all <- as.character(merged_DEG[,'ENTREZID'])
data(geneList, package="DOSE")
# head(geneList)
# boxplot(geneList)
# boxplot(merged_DEG$logFC)
geneList <- merged_DEG$logFC
names(geneList) <- merged_DEG$ENTREZID
geneList <- sort(geneList,decreasing = T)
# save(gene_up,gene_all,gene_down,gene_diff,file = "c:/Users/Administrator/Desktop/test_bug.Rdata")
# load("c:/Users/Administrator/Desktop/test_bug.Rdata")
# source('./Functions/Function_kegg_plot.R')
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
### over-representation test
kk.up <- enrichKEGG(gene=gene_up,organism = 'hsa',universe= gene_all,pvalueCutoff = 0.9,qvalueCutoff =0.9)
kk.down <- enrichKEGG(gene=gene_down,organism='hsa',universe=gene_all,pvalueCutoff = 0.99)
head(kk.down)[,1:6]
kk.diff <- enrichKEGG(gene = gene_diff,
organism = 'hsa',
pvalueCutoff = 0.99)
head(kk.diff)[,1:6]
kegg_diff_dt <- as.data.frame(kk.diff)
kegg_down_dt <- as.data.frame(kk.down)
kegg_up_dt <- as.data.frame(kk.up)
down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.05,]; down_kegg$group <- -1
up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.05,]; up_kegg$group <- 1
g_kegg <- kegg_plot(up_kegg,down_kegg)
print(g_kegg)
ggsave(g_kegg,filename = './Exported_files/kegg_up_down.png')
### GO database analysis
g_list=list(gene_up=gene_up,
gene_down=gene_down,
gene_diff=gene_diff)
if(T){
go_enrich_results <- lapply( g_list , function(gene) {
lapply( c('BP','MF','CC') , function(ont) {
cat(paste('Now process ',ont ))
ego <- enrichGO(gene = gene,
universe = gene_all,
OrgDb = org.Hs.eg.db,
ont = ont ,
pAdjustMethod = "BH",
pvalueCutoff = 0.99,
qvalueCutoff = 0.99,
readable = TRUE)
print( head(ego) )
return(ego)
})
})
}
save(go_enrich_results,down_kegg,up_kegg,file = './Rdata/GSE17708_KEGG_GO.Rdata')
rm(list = ls())
load(file = './Rdata/GSE17708_KEGG_GO.Rdata')
#匯總輸出GO氣泡圖
n1= c('gene_up','gene_down','gene_diff')
n2= c('BP','MF','CC')
for (i in 1:3){
for (j in 1:3){
fn=paste0('./Exported_files/dotplot_',n1[i],'_',n2[j],'.png')
cat(paste0(fn,'\n'))
png(fn,res=150,width = 1080)
print(dotplot(go_enrich_results[[i]][[j]]))
dev.off()
}
}