作業(yè) 1
根據(jù)R包org.Hs.eg.db找到下面ensembl 基因ID 對應(yīng)的基因名(symbol)
A<- read.table("xl.txt")
library(org.Hs.eg.db)
g2s=toTable(org.Hs.egSYMBOL)
g2e=toTable(org.Hs.egENSEMBL)
a<-A$V1
for (i in 1:6) {
A$V1[i]<-strsplit(a,'[.]')[[i]][1]}
names(A)[1]<-"ensembl_id"
A<-merge(A,g2e,by=("ensembl_id"))
A<-merge(A,g2s,by=("gene_id"))
運行結(jié)果
作業(yè)2
根據(jù)R包hgu133a.db找到下面探針對應(yīng)的基因名(symbol)
B<-read.table("e2.txt")
library(hgu133a.db)
ids=toTable(hgu133aSYMBOL)
head(ids)
names(B)[1]<-"probe_id"
merge(B,ids,by=("probe_id"))
運行結(jié)果
probe_id symbol
1 1053_at RFC2
2 117_at HSPA6
3 121_at PAX8
4 1255_g_at GUCA1A
5 1316_at THRA
6 1320_at PTPN21
7 1405_i_at CCL5
8 1431_at CYP2E1
9 1438_at EPHB3
10 1487_at ESRRA
11 1494_f_at CYP2A6
12 1598_g_at GAS6
13 160020_at MMP14
14 1729_at TRADD
15 177_at PLD1
作業(yè)3
找到R包CLL內(nèi)置的數(shù)據(jù)集的表達矩陣?yán)锩娴腡P53基因的表達量价匠,并且繪制在 progres.-stable分組的boxplot圖
suppressPackageStartupMessages(library(CLL))
data(sCLLex)
sCLLex
exprSet=exprs(sCLLex)
pd=pData(sCLLex)
library(hgu95av2.db)
ids=toTable(hgu95av2SYMBOL)
boxplot(exprSet['1939_at',]~pd$Disease)
boxplot(exprSet['1974_s_at',]~pd$Disease)
boxplot(exprSet['31618_at',]~pd$Disease)
作業(yè)4
找到BRCA1基因在TCGA數(shù)據(jù)庫的乳腺癌數(shù)據(jù)集(Breast Invasive Carcinoma (TCGA, PanCancer Atlas))的表達情況
a<-read.table('plot.txt',sep = '\t',fill = T,header = T)
colnames(a)=c('id','subtype','expression','mut')
dat=a
library(ggplot2)
ggsave('plot-again-BRCA1-TCGA-BRCA-cbioportal.png')
作業(yè)5
找到TP53基因在TCGA數(shù)據(jù)庫的乳腺癌數(shù)據(jù)集的表達量分組看其是否影響生存
a=read.table('BRCA_7157_50_50.csv',sep = ',',fill = T,header = T)
tmp=a
library(ggplot2)
library(survival)
library(survminer)
tmp$Status=ifelse(tmp$Status=='Dead',1,0)
sfit <- surv_fit(Surv(Days,Status)~Group,data=tmp)
summary(sfit)
ggsurvplot(sfit,conf.int = F,pval = TRUE)
ggsave('survival_TP53_in_BRCA_TCGA.png')
作業(yè)6
下載數(shù)據(jù)集GSE17215的表達矩陣并且提取下面的基因畫熱圖
ACTR3B ANLN BAG1 BCL2 BIRC5 BLVRA CCNB1 CCNE1 CDC20 CDC6 CDCA1 CDH3 CENPF CEP55 CXXC5 EGFR ERBB2 ESR1 EXO1 FGFR4 FOXA1 FOXC1 GPR160 GRB7 KIF2C KNTC2 KRT14 KRT17 KRT5 MAPT MDM2 MELK MIA MKI67 MLPH MMP11 MYBL2 MYC NAT1 ORC6L PGR PHGDH PTTG1 RRM2 SFRP1 SLC39A6 TMEM45B TYMS UBE2C UBE2T
if (!file.exists(f)) {
gset <- getGEO('GSE17215',destdir=".",AnnotGPL = F,
getGPL =F)
save(gset,file = f)
}
a=gset[[1]]
load('GSE17215_eSet.Rdata')
dat=exprs(a)
library(hgu133a.db)
ids=toTable(hgu133aSYMBOL)
dat=dat[ids$probe_id,]
ids$median=apply(dat,1,median)
ids=ids[order(ids$symbol,ids$median,decreasing = T),]
ids=ids[!duplicated(ids$symbol),]
dat=dat[ids$probe_id,]
rownames(dat)=ids$symbol
ng='ACTR3B ANLN BAG1 BCL2 BIRC5 BLVRA CCNB1 CCNE1 CDC20 CDC6 CDCA1 CDH3 CENPF CEP55 CXXC5 EGFR ERBB2 ESR1 EXO1 FGFR4 FOXA1 FOXC1 GPR160 GRB7 KIF2C KNTC2 KRT14 KRT17 KRT5 MAPT MDM2 MELK MIA MKI67 MLPH MMP11 MYBL2 MYC NAT1 ORC6L PGR PHGDH PTTG1 RRM2 SFRP1 SLC39A6 TMEM45B TYMS UBE2C UBE2T'
ng=strsplit(ng,' ')[[1]]
ng=ng[ng%in% rownames(dat)]
dat[ng,]
dat=log2(dat)
pheatmap::pheatmap(dat)
作業(yè)7
下載數(shù)據(jù)集GSE24673的表達矩陣計算樣本的相關(guān)性并且繪制熱圖冕杠,需要標(biāo)記上樣本分組信息
if (!file.exists(f)) {
gset <- getGEO('GSE17215',destdir=".",AnnotGPL = F,
getGPL =F)
save(gset,file = f)
}
a=gset[[1]]
load('GSE17215_eSet.Rdata')
dat=exprs(a)
library(hgu133a.db)
ids=toTable(hgu133aSYMBOL)
dat=dat[ids$probe_id,]
ids$median=apply(dat,1,median)
ids=ids[order(ids$symbol,ids$median,decreasing = T),]
ids=ids[!duplicated(ids$symbol),]
dat=dat[ids$probe_id,]
rownames(dat)=ids$symbol
ng='ACTR3B ANLN BAG1 BCL2 BIRC5 BLVRA CCNB1 CCNE1 CDC20 CDC6 CDCA1 CDH3 CENPF CEP55 CXXC5 EGFR ERBB2 ESR1 EXO1 FGFR4 FOXA1 FOXC1 GPR160 GRB7 KIF2C KNTC2 KRT14 KRT17 KRT5 MAPT MDM2 MELK MIA MKI67 MLPH MMP11 MYBL2 MYC NAT1 ORC6L PGR PHGDH PTTG1 RRM2 SFRP1 SLC39A6 TMEM45B TYMS UBE2C UBE2T'
ng=strsplit(ng,' ')[[1]]
ng=ng[ng%in% rownames(dat)]
dat[ng,]
dat=log2(dat)
pheatmap::pheatmap(dat)
作業(yè)8
找到 GPL6244 platform of Affymetrix Human Gene 1.0 ST Array 對應(yīng)的R的bioconductor注釋包掂铐,并且安裝它!
BiocManager::install("hugene10sttranscriptcluster.db",ask = F,update = F)
作業(yè)9
下載數(shù)據(jù)集GSE42872的表達矩陣绣溜,并且分別挑選出 所有樣本的(平均表達量/sd/mad/)最大的探針霹肝,并且找到它們對應(yīng)的基因
options(stringsAsFactors = F)
f='GSE42872_eSet.Rdata'
library(GEOquery)
if (!file.exists(f)) {
gset <- getGEO('GSE42872',destdir=".",AnnotGPL = F,
getGPL =F)
save(gset,file = f)
}
a=gset[[1]]
load('GSE42872_eSet.Rdata')
dat=exprs(a)
sort(apply(dat,1,mean),decreasing = T)[1]
運行結(jié)果
7917645
14.185
作業(yè)10
下載數(shù)據(jù)集GSE42872的表達矩陣穆趴,并且根據(jù)分組使用limma做差異分析饿凛,得到差異結(jié)果矩陣
options(stringsAsFactors = F)
f='GSE42872_eSet.Rdata'
library(GEOquery)
if (!file.exists(f)) {
gset <- getGEO('GSE42872',destdir=".",AnnotGPL = F,
getGPL =F)
save(gset,file = f)
}
a=gset[[1]]
load('GSE42872_eSet.Rdata')
dat=exprs(a)
pd=pData(a)
group_list=unlist(lapply(pd$title, function(x){
strsplit(x,' ')[[1]][4]
}))
experSet=dat
#差異分析
suppressMessages(library(limma))
design<-model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
row.names(design)=colnames(experSet)
design
contrast.matrix<-makeContrasts(paste0(unique(group_list),collapse = "-"),levels = design)
fit<-lmFit(experSet,design)
fit2<-contrasts.fit(fit,contrast.matrix)
fit2<-eBayes(fit2)
tempOutput=topTable(fit2,coef = 1,n=Inf)
nrDEG=na.omit(tempOutput)