# 芯片分析中經常會遇到Affymetrix Human Transcriptome Array 2.0芯片，由於目前還沒有現成的R包可以用，因此分析方法也不統一。見生信技能樹Jimmy老師：HTA2.0芯片比較麻煩。其實這類常見的有3個平臺，3種類型：
# GPL17586 [HTA-2_0] Affymetrix Human Transcriptome Array 2.0 [transcript (gene) version]
# GPL19251 [HuGene-2_0-st] Affymetrix Human Gene 2.0 ST Array [probe set (exon) version]
# GPL16686 [HuGene-2_0-st] Affymetrix Human Gene 2.0 ST Array [transcript (gene) version]
# 對於這三種平臺，可以去Affymetrix的官網查看其區別，也可以去NCBI查看：
# GPL17586
# GPL19251
# GPL16686
# 我按照芯片分析的一般流程進行分析，以GPL16686平臺、GSE77532為例。
# 1、讀入soft文件：手動下載GSE77532對應的soft文件。實際應該下載GPL16686對應的GPL16686.soft文件，但由於網速原因，只能退而求其次。
# NOTE(review): clearing the global environment is kept for parity with the
# original walkthrough; consider dropping it when this is sourced from a
# longer session.
rm(list = ls())
# Use the FALSE literal: `F` is an ordinary variable that can be reassigned.
options(stringsAsFactors = FALSE)
# Read the GSE77532 family SOFT file from the working directory.
library(GEOquery)
gse77532 <- getGEO(filename = "GSE77532_family.soft.gz", destdir = ".")
dim(gse77532)
# Annotation table of platform GPL16686 stored inside the parsed SOFT object.
y <- gse77532@gpls$GPL16686@dataTable@table
dim(y)
head(y)
y[1:4, 1:8]
# 2、id轉換
#### ID conversion: GenBank accession -> SYMBOL / ENSEMBL / ENTREZID
library(clusterProfiler)
# OrgDb is passed by package name so that bitr() loads org.Hs.eg.db itself;
# the bare symbol only resolves if that package was attached beforehand,
# which this script never does.
# The accession column is selected by name (GB_ACC), the same column the
# probe filter further down compares against ENTREZID$ACCNUM.
ENTREZID <- bitr(
  y$GB_ACC,
  fromType = "ACCNUM",
  toType = c("SYMBOL", "ENSEMBL", "ENTREZID"),
  OrgDb = "org.Hs.eg.db"
)
# List the objects exported by clusterProfiler (quoted search-path name
# avoids the "converted to character string" warning).
ls("package:clusterProfiler")
dim(ENTREZID)
ENTREZID[1:5, 1:4]
save(y, ENTREZID, file = "ids.Rdata")
# Keep only the probes whose GenBank accession was mapped by bitr().
table(y$GB_ACC %in% ENTREZID$ACCNUM)
y1 <- y[y$GB_ACC %in% ENTREZID$ACCNUM, ]
y1[1:5, 1:8]
# Columns 1 and 6 are renamed to probe_id and ACCNUM for the join below.
y2 <- y1[, c(1, 6)]
names(y2) <- c("probe_id", "ACCNUM")
# Inner join of the probe table with the bitr() result on ACCNUM
# (FALSE literal instead of the reassignable `F`).
ids <- merge(y2, ENTREZID, by = "ACCNUM", all = FALSE)
ids[1:5, 1:5]
dim(ids)
## Load the saved expression matrix; the .Rdata file is expected to provide
## an object named `exprSet2`.
load("GSE77532_exprSet.Rdata")
exprSet <- exprSet2
exprSet[1:5, 1:6]
# Restrict the expression matrix to probes that have an annotation row.
exprSet <- exprSet[is.element(rownames(exprSet), ids$probe_id), ]
dim(exprSet)
exprSet[1:5, 1:5]
# Reorder the annotation so that row i of `ids` describes row i of `exprSet`
# (first match per probe).
ids <- ids[match(rownames(exprSet), ids[["probe_id"]]), ]
dim(ids)
ids[1:2, 1:5]
# Keep only the second and third annotation columns.
ids <- ids[, 2:3]
dim(ids)
ids[1:2, 1:2]
# Collapse a probe-level expression table to one row per gene symbol.
#
# For every symbol the probe with the highest mean expression across samples
# is kept, and the surviving rows are renamed from probe IDs to symbols.
#
# Args:
#   exprSet: matrix or data frame of expression values whose row names are
#            probe IDs.
#   ids:     data frame with columns `probe_id` and `SYMBOL`, one row per row
#            of `exprSet` and in the same order (rows are grouped by position).
#
# Returns:
#   `exprSet` restricted to the selected probes, with symbols as row names.
#   The dimensions before and after filtering are printed as a side effect.
idcombine <- function(exprSet, ids) {
  if (nrow(exprSet) != nrow(ids)) {
    stop("`exprSet` and `ids` must have the same number of rows",
         call. = FALSE)
  }
  # One probe ID per symbol: the row with the largest mean across samples.
  best_probe <- by(
    exprSet,
    ids$SYMBOL,
    function(x) rownames(x)[which.max(rowMeans(x))]
  )
  probes <- as.character(best_probe)
  print(dim(exprSet))
  # drop = FALSE keeps a single-sample input two-dimensional so that row
  # names can still be assigned below.
  exprSet <- exprSet[rownames(exprSet) %in% probes, , drop = FALSE]
  print(dim(exprSet))
  # Look the symbol up by column name, matching the grouping key used above,
  # rather than assuming it sits in column 2.
  rownames(exprSet) <- ids$SYMBOL[match(rownames(exprSet), ids$probe_id)]
  exprSet
}
# Collapse probes to gene symbols and preview the top-left corner.
new_exprSet <- idcombine(exprSet = exprSet, ids = ids)
new_exprSet[1:4, 1:6]
# id轉換用biomaRt包更方便一些，只是網速支持不下來。
# GPL17586平臺芯片
#
# NOTE(review): this wipes every object created by the GPL16686 section above.
rm(list = ls())
# Use the FALSE literal: `F` is an ordinary variable that can be reassigned.
options(stringsAsFactors = FALSE)
# Load packages
library(GEOquery)
# Read the GSE110359 family SOFT file from the working directory.
GSE110359 <- getGEO(filename = "GSE110359_family.soft.gz", destdir = ".")
dim(GSE110359)
# Annotation table of platform GPL17586 stored inside the parsed SOFT object.
y <- GSE110359@gpls$GPL17586@dataTable@table
dim(y)
head(y)
y[1:4, 1:15]
View(head(y)) ## inspect the table to decide which columns are needed
# Columns 2 and 8 of the platform table; the code below reads them as
# `probeset_id` and `gene_assignment`.
probe2gene <- y[, c(2, 8)]
library(stringr)
# The gene symbol is taken as the second "//"-separated field of
# gene_assignment, with surrounding whitespace removed.
probe2gene$symbol <- trimws(
  str_split(probe2gene$gene_assignment, "//", simplify = TRUE)[, 2]
)
# Distribution of how many probe sets share one symbol.
plot(table(table(probe2gene$symbol)), xlim = c(1, 50))
head(probe2gene)
dim(probe2gene)
View(head(probe2gene))
ids2 <- probe2gene[, c(1, 3)]
# Preview the table just built. The original viewed `ids`, which does not
# exist here because the workspace is cleared at the start of this section.
View(head(ids2))
ids2[1:20, 1:2] # contains missing values
save(ids2, probe2gene, file = "GSE110359-probe2gene.Rdata")
load("GSE110359-probe2gene.Rdata")
#####
View(head(probe2gene))
dim(probe2gene)
# ID 轉換
library(biomaRt)
x <- probe2gene$probeset_id
value <- x
attr <- c("affy_hta_2_0", "hgnc_symbol")
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
# Query Ensembl for the HGNC symbol of every probe set ID, bypassing the
# local biomaRt cache (FALSE literal instead of the reassignable `F`).
ids <- getBM(
  attributes = attr,
  filters = "affy_hta_2_0",
  values = value,
  mart = ensembl,
  useCache = FALSE
)
dim(ids) # [1] 1041 2  (value noted by the original author)
View(head(ids))
save(ids, file = "GPL17586_ids.Rdata")
# Number of distinct symbols after de-duplication. The original wrapped
# unique() in table(), which prints a count of 1 per symbol instead of the
# total.
length(unique(ids$hgnc_symbol)) # 28262  (value noted by the original author)
attributes <- listAttributes(ensembl)
View(attributes) # browse the attribute names available for conversion
save(ids, ensembl, y, file = "ensembl.Rdata")