作業(yè)1
請根據(jù)R包org.Hs.eg.db找到下面ensembl 基因ID 對(duì)應(yīng)的基因名(symbol)
ENSG00000000003.13
ENSG00000000005.5
ENSG00000000419.11
ENSG00000000457.12
ENSG00000000460.15
ENSG00000000938.11
library(org.Hs.eg.db)
library(clusterProfiler)
ensembl <- c("ENSG00000000003.13","ENSG00000000005.5","ENSG00000000419.11","ENSG00000000457.12","ENSG00000000460.15","ENSG00000000938.11")
ensembl_sub <- str_sub(ensembl,1,15)
gene_sym <- bitr(
geneID = ensembl_sub,
fromType = "ENSEMBL",
toType = "SYMBOL",
OrgDb = org.Hs.eg.db
)
>gene_sym
ENSEMBL SYMBOL
1 ENSG00000000003 TSPAN6
2 ENSG00000000005 TNMD
3 ENSG00000000419 DPM1
4 ENSG00000000457 SCYL3
5 ENSG00000000460 C1orf112
6 ENSG00000000938 FGR
第二種方法
g2s <- toTable(org.Hs.egSYMBOL);head(g2s)
g2e <- toTable(org.Hs.egENSEMBL);head(g2e)
ensembl <- tibble(
ensembl=c("ENSG00000000003.13","ENSG00000000005.5","ENSG00000000419.11","ENSG00000000457.12","ENSG00000000460.15","ENSG00000000938.11")
)
for (i in 1:nrow(ensembl)){
ensembl[i,]=str_sub(ensembl[i,],1,15)
}
tmp1 <- inner_join(
ensembl,g2e,by="ensembl_id"
)
tmp2 <- inner_join(
tmp1,g2s,by="gene_id"
)
tmp2
# A tibble: 6 x 3
ensembl_id gene_id symbol
<chr> <chr> <chr>
1 ENSG00000000003 7105 TSPAN6
2 ENSG00000000005 64102 TNMD
3 ENSG00000000419 8813 DPM1
4 ENSG00000000457 57147 SCYL3
5 ENSG00000000460 55732 C1orf112
6 ENSG00000000938 2268 FGR
作業(yè)2
根據(jù)R包hgu133a.db找到下面探針對(duì)應(yīng)的基因名(symbol)
1053_at
117_at
121_at
1255_g_at
1316_at
1320_at
1405_i_at
1431_at
1438_at
1487_at
1494_f_at
1598_g_at
160020_at
1729_at
177_at
library(hgu133a.db)
columns(hgu133a.db)
probe_id_c <- c("1053_at","117_at","121_at","1255_g_at","1316_at",
"1320_at","1405_i_at","1431_at","1438_at","1487_at",
"1494_f_at","1598_g_at","160020_at","1729_at","177_at")
probe_id <- tibble(
probe_id=probe_id_c
)
g2s <- toTable(hgu133aSYMBOL)
tmp <- inner_join(
probe_id,g2s,by="probe_id"
)
tmp
# A tibble: 15 x 2
probe_id symbol
<chr> <chr>
1 1053_at RFC2
2 117_at HSPA6
3 121_at PAX8
4 1255_g_at GUCA1A
5 1316_at THRA
6 1320_at PTPN21
7 1405_i_at CCL5
8 1431_at CYP2E1
9 1438_at EPHB3
10 1487_at ESRRA
11 1494_f_at CYP2A6
12 1598_g_at GAS6
13 160020_at MMP14
14 1729_at TRADD
15 177_at PLD1
作業(yè)3
找到R包CLL內(nèi)置的數(shù)據(jù)集的表達(dá)矩陣?yán)锩娴腡P53基因的表達(dá)量揣钦,并且繪制在 progres.-stable分組的boxplot圖
library(CLL)
data(sCLLex)
exp <- exprs(sCLLex)
pd <- pData(sCLLex)
library(hgu95av2.db)
g2s <- toTable(hgu95av2SYMBOL)
g2s %>%
filter(symbol=="TP53")
TP53_probe_id=g2s %>%
filter(symbol=="TP53") %>%
select(probe_id)
TP53_probe_id=as.character(TP53_probe_id$probe_id)
par(mfrow=c(1,3))
boxplot(exp["1939_at",]~pd$Disease)
boxplot(exp["1974_s_at",]~pd$Disease)
boxplot(exp["31618_at",]~pd$Disease)
作業(yè)4
找到BRCA1基因在TCGA數(shù)據(jù)庫的乳腺癌數(shù)據(jù)集(Breast Invasive Carcinoma (TCGA, PanCancer Atlas))的表達(dá)情況
參考:找到TP53基因在TCGA數(shù)據(jù)庫的肝癌數(shù)據(jù)集的表達(dá)情況 - 簡書 (jianshu.com)
作業(yè)5
找到TP53基因在TCGA數(shù)據(jù)庫的乳腺癌數(shù)據(jù)集的表達(dá)量分組看其是否影響生存
參考:找到TP53基因在TCGA數(shù)據(jù)庫的乳腺癌數(shù)據(jù)集的表達(dá)量分組看其是否影響生存 - 簡書 (jianshu.com)
作業(yè)6
下載數(shù)據(jù)集GSE17215的表達(dá)矩陣并且提取下面的基因畫熱圖
ACTR3B ANLN BAG1 BCL2 BIRC5 BLVRA CCNB1 CCNE1 CDC20 CDC6 CDCA1 CDH3 CENPF CEP55 CXXC5 EGFR ERBB2 ESR1 EXO1 FGFR4 FOXA1 FOXC1 GPR160 GRB7 KIF2C KNTC2 KRT14 KRT17 KRT5 MAPT MDM2 MELK MIA MKI67 MLPH MMP11 MYBL2 MYC NAT1 ORC6L PGR PHGDH PTTG1 RRM2 SFRP1 SLC39A6 TMEM45B TYMS UBE2C UBE2T
load("D:/genetic_r/R-practise/GSE17215_eSet.Rdata")
gset1 <- gset[[1]]
exp <- exprs(gset1)
exp <- as.data.frame(exp)
pd <- pData(gset1)
library(hgu133a.db)
ids <- toTable(hgu133aSYMBOL)
exp=exp %>%
mutate(probe_id=rownames(exp))
exp=exp %>%
inner_join(ids,by="probe_id")
exp=exp %>%
select(-probe_id)
exp=exp %>%
select(symbol,everything())
exp=exp %>%
mutate(rowMean=rowMeans(.[,-1])) %>%
arrange(desc(rowMean)) %>%
distinct(symbol,.keep_all = T) %>%
select(-rowMean) %>%
column_to_rownames("symbol")
ng='ACTR3B ANLN BAG1 BCL2 BIRC5 BLVRA CCNB1 CCNE1 CDC20 CDC6 CDCA1 CDH3 CENPF CEP55 CXXC5 EGFR ERBB2 ESR1 EXO1 FGFR4 FOXA1 FOXC1 GPR160 GRB7 KIF2C KNTC2 KRT14 KRT17 KRT5 MAPT MDM2 MELK MIA MKI67 MLPH MMP11 MYBL2 MYC NAT1 ORC6L PGR PHGDH PTTG1 RRM2 SFRP1 SLC39A6 TMEM45B TYMS UBE2C UBE2T'
ng=str_split(ng,' ')
ng=unlist(ng)
table(ng%in%rownames(exp))
ng=ng[ng%in%rownames(exp)]
dat=exp[ng%in%rownames(exp),]
# 清洗掉不存在的ng,注意這一步存在排序(連同下一步理解)
ng=ng[ng %in% rownames(exp)]
dat=exp[ng,]
# 畫圖
dat=log2(dat)
pheatmap::pheatmap(dat,scale = 'row')
作業(yè)7
下載數(shù)據(jù)集GSE24673的表達(dá)矩陣計(jì)算樣本的相關(guān)性并且繪制熱圖执泰,需要標(biāo)記上樣本分組信息
rm(list = ls())
options(stringsAsFactors = F)
gset <- getGEO( 'GSE24673', getGPL = F )
library(GEOquery)
gset <- gset [[1]]
exp <- exprs(gset)
pd <- pData(gset)
exp <- as.data.frame(exp)
group_list=c('rbc','rbc','rbc',
'rbn','rbn','rbn',
'rbc','rbc','rbc',
'normal','normal')
exp[1:4,1:4]
#相關(guān)性分析
M=cor(exp)
pheatmap::pheatmap(M)
tmp=data.frame(g=group_list)
rownames(tmp) <- colnames(M)
pheatmap::pheatmap(M,annotation_col = tmp)
作業(yè)8
找到 GPL6244 platform of Affymetrix Human Gene 1.0 ST Array 對(duì)應(yīng)的R的bioconductor注釋包,并且安裝它风响!
可在此搜索
用R獲取芯片探針與基因的對(duì)應(yīng)關(guān)系三部曲-bioconductor | 生信菜鳥團(tuán) (bio-info-trainee.com)
platformMap <- data.table::fread("resource/platformMap.txt",data.table = F)
[1] "hugene10sttranscriptcluster.db" #可獲得對(duì)應(yīng)的注釋包
## 平臺(tái)的名稱如何知道?
index <- "GPL6244"
## 數(shù)據(jù)儲(chǔ)存在bioc_package這一列中
paste0(platformMap$bioc_package[grep(index,platformMap$gpl)],".db")
## 安裝R包,可以直接安裝慈缔,這里用了判斷
if(!requireNamespace("hugene10sttranscriptcluster.db")){
options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
BiocManager::install("hugene10sttranscriptcluster.db",update = F,ask = F)
}
## 加載R包
library(hugene10sttranscriptcluster.db)
作業(yè)9
下載數(shù)據(jù)集GSE42872的表達(dá)矩陣趴梢,并且分別挑選出 所有樣本的(平均表達(dá)量/sd/mad/)最大的探針噩峦,并且找到它們對(duì)應(yīng)的基因薪捍。
rm(list = ls())
options(stringsAsFactors = F)
gset <- getGEO( 'GSE42872', getGPL = F )
library(GEOquery)
gset <- gset [[1]]
exp <- exprs(gset)
pd <- pData(gset)
exp <- as.data.frame(exp)
#由平臺(tái) "GPL6244"可得出注釋包為hugene10sttranscriptcluster.db
#加載注釋包
library(hugene10sttranscriptcluster.db)
ids <- toTable(hugene10sttranscriptclusterSYMBOL)
exp=exp %>%
rownames_to_column("probe_id")
exp <- inner_join(exp,ids,by="probe_id")
exp=exp %>%
select(probe_id,symbol,everything())
#平均表達(dá)量
exp=exp %>%
mutate(rowmean=rowMeans(.[,-c(1,2)]))
#找最大
exp %>%
arrange(-rowmean) %>%
head(1)
#7953385 GAPDH
#mad
exp=exp %>%
mutate(md=apply(exp[,3:8],1,median))
exp %>%
arrange(-md) %>%
head(1)
#7953385 GAPDH
#sd
exp=exp %>%
mutate(sd=apply(exp[,3:8],1,sd))
exp %>%
arrange(-sd) %>%
head(1)
#8133876 CD36
作業(yè)10
下載數(shù)據(jù)集GSE42872的表達(dá)矩陣噪舀,并且根據(jù)分組使用limma做差異分析魁淳,得到差異結(jié)果矩陣
rm(list = ls())
options(stringsAsFactors = F)
gset <- getGEO( 'GSE42872', getGPL = F )
library(GEOquery)
gset <- gset [[1]]
exp <- exprs(gset)
exp <- as.data.frame(exp)
pd <- pData(gset)
group_list=character(6)
for (i in 1:nrow(pd)){
group_list[i]=strsplit(pd$title[i],' ')[[1]][4]
}
exprSet=exp
# 用limma包做差異表達(dá)分析
#差異分析重點(diǎn)在于做好表達(dá)矩陣和分組信息,具體原理可以不用理解
suppressMessages(library(limma))
design <- model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
rownames(design)=colnames(exprSet)
design
contrast.matrix<-makeContrasts(paste0(unique(group_list),collapse = "-"),levels = design)
contrast.matrix
##step1
fit <- lmFit(exprSet,design)
##step2
fit2 <- contrasts.fit(fit, contrast.matrix) ##這一步很重要与倡,大家可以自行看看效果
fit2 <- eBayes(fit2) ## default no trend !!!
##eBayes() with trend=TRUE
##step3
tempOutput = topTable(fit2, coef=1, n=Inf)
nrDEG = na.omit(tempOutput)
#write.csv(nrDEG2,"limma_notrend.results.csv",quote = F)
head(nrDEG)
參考:盤一盤 生信技能樹R語言小作業(yè)(中級(jí)) - 簡書 (jianshu.com)