本文采用GSE7014數(shù)據(jù)集作為示例數(shù)據(jù)集
刪除了35-44樣本组底,僅保留26個(gè)樣本進(jìn)行計(jì)算
分為T2DM和Control組
首先下載CEl文件和準(zhǔn)備Targets文件
截屏2021-06-03 下午9.16.06.png
第一步提取
### 讀入Targets
setwd("/Users/apple/Desktop/練手T2DM/T2DM/GSE7014/")
library(limma)
Targets <- readTargets("Targets_GSE7104.txt") # 不指定行名所在的列
### 讀入CEL,使用readaffy
library(affy)
GSE7014_cel <- ReadAffy(filenames = Targets$FileName,
celfile.path = "GSE7014_RAW/",
phenoData = Targets)
cel <- GSE7014_cel
# 表達(dá)矩陣 exprs查看
head(exprs(cel))[, 1:5]
# 樣本信息 pData
GSE7014_targets <- pData(cel)
# 樣本名稱 sampeNames
sampleNames(cel)
# ScanDate 查看樣本掃描日期
cel@protocolData@data$ScanDate
# expression matrix 表達(dá)矩陣
GSE7014_expr_cel <- exprs(cel)
### 保存數(shù)據(jù),備下一步分析用
save(GSE7014_cel, GSE7014_expr_cel, GSE7014_targets, file = "./GSE7014_cel.Rdata")
第二步
setwd("/Users/apple/Desktop/練手T2DM/T2DM/GSE7014/芯片質(zhì)量評(píng)估/")
# 加載Affymetrix芯片GSE7014數(shù)據(jù)(第一步變量)
load_input <- load("/Users/apple/Desktop/練手T2DM/T2DM/GSE7014/GSE7014_cel.Rdata")
load_input
library(affy)
GSE7014_expr_cel <- exprs(GSE7014_cel)#提取表達(dá)矩陣
head(GSE7014_expr_cel)[, 1:5]
GSE7014_targets <- pData(GSE7014_cel) #提取targets(樣本分組)數(shù)據(jù),在phenoData中的data
第三步芯片數(shù)據(jù)質(zhì)量評(píng)估 (比較簡(jiǎn)單秸侣,得到一份報(bào)告)
# 基于arrayQulitMetrics
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("arrayQualityMetrics")
library(arrayQualityMetrics)
arrayQualityMetrics(GSE7014_cel,
outdir = "./芯片數(shù)據(jù)質(zhì)量評(píng)估 基于arrayQulitMetrics/", # 通過該文件夾下的index來看結(jié)果
force = TRUE, # 如果本身有個(gè)文件夾的話就把文件夾刷新了
intgroup = "group", # 直接讀取原始數(shù)據(jù)GSE7014_cel@phenoData@data[["group"]]
do.logtransform = TRUE)
dev.off() # dev.off報(bào)錯(cuò)了也無妨笼平,就是為了讓它報(bào)錯(cuò)
第四步芯片數(shù)據(jù)標(biāo)準(zhǔn)化(基于RMA和gcRMA兩種方法)
RMA
# RMA
bgcorrect.methods() # 可以查看背景校正方法
GSE7014_bgc <- affy::bg.correct(GSE7014_cel, method = "rma") # 最常用rma方法
# 背景矯正后作圖觀察樣本
# boxplot
par(mfrow = c(1, 1))
par(mar = c(9, 5, 3, 3))
par(cex = 0.8)
boxplot(GSE7014_cel,
las = 2,
col = rep(c("blue", "red"), each = 20), # 20個(gè)實(shí)驗(yàn)組
ylab = "Log intensity")
# 繪制PCA
# PCA_new函數(shù)
PCA_new <- function(expr, ntop = 500, group, show_name = F){
library(ggplot2)
library(ggrepel)
rv <- genefilter::rowVars(expr)
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
# 主成分計(jì)算函數(shù)
pca <- prcomp(t(expr[select, ])) # 最核心的函數(shù)代碼
percentVar <- pca$sdev^2/sum(pca$sdev^2)
d <- data.frame(PC1 = pca$x[, 1],
PC2 = pca$x[, 2],
group = group,
name = colnames(expr))
attr(d, "percentVar") <- percentVar[1:2]
if (show_name) {
# 作圖函數(shù)ggplot
ggplot(data = d, aes_string(x = "PC1", y = "PC2", color = "group")) +
geom_point(size = 2) +
xlab(paste0("PC1: ", round(percentVar[1] * 100), "% variance")) +
ylab(paste0("PC2: ", round(percentVar[2] * 100), "% variance")) +
geom_text_repel(aes(label = name),
size = 3,
segment.color = "black",
show.legend = FALSE )
} else {
ggplot(data = d, aes_string(x = "PC1", y = "PC2",color = "group")) +
geom_point(size = 2) +
xlab(paste0("PC1: ", round(percentVar[1] * 100), "% variance")) +
ylab(paste0("PC2: ", round(percentVar[2] * 100), "% variance"))
}
}
PCA_new(log2(exprs(GSE7014_bgc)),
nrow(GSE7014_bgc),
group = GSE7014_targets$group,
show_name = T)
#可以看到PCA圖還是沒什么改善
gcRMA
# gcRMA
library(gcrma)
GSE7014_gcrma <- gcrma(GSE7014_cel)
GSE7014_expr_gcrma <- exprs(GSE7014_gcrma) # 提取原始數(shù)據(jù)中的表達(dá)矩陣
# 運(yùn)行以后再做箱線圖觀察
dev.new()
par(mfrow = c(1, 1))
par(mar = c(9, 5, 3, 3))
par(cex = 0.8)
boxplot(GSE7014_gcrma, # 表達(dá)矩陣
las = 2,
col = rep(c("blue", "red"), each = 20), # 20個(gè)實(shí)驗(yàn)組
ylab = "Log intensity")
# dev.off()
# 可以看到GSM161970這個(gè)樣本離群值非常明顯
PCA_new(GSE7014_expr_gcrma,
nrow(GSE7014_gcrma),
group = GSE7014_targets$group,
show_name = T)
# 可以看到970樣本多重分析下來,可以考慮刪除這個(gè)樣本
##之后也可以再重復(fù)一次上一步arrayQualityMetrics進(jìn)行質(zhì)量評(píng)估师逸,驗(yàn)證這一步
第五步缺失值的補(bǔ)充(采用KNN法),選取gcRMA標(biāo)準(zhǔn)化進(jìn)行(rma也可)
#首先看一下有沒有缺失值
sum(is.na(exprs(GSE7014_gcrma))) # 可以使用RMA函數(shù)的結(jié)果也可以用gcRMA的結(jié)果
# 發(fā)現(xiàn)沒有缺失值
# 如果有缺失值那么運(yùn)行下面代碼
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("impute")
library(impute)
# KNN法impute函數(shù)計(jì)算
GSE7014_gcrma <- impute.knn(GSE7014_gcrma, maxp = 30000) # maxp默認(rèn)的是1500豆混,好像會(huì)經(jīng)常報(bào)錯(cuò)篓像,可以嘗試調(diào)大到30000
sum(is.na(GSE7014_gcrma)) # 再次查看是否有缺失值
第六步標(biāo)準(zhǔn)化結(jié)果可視化
# 再次用boxplot和PCA進(jìn)行可視化查看
# 運(yùn)行以后再做箱線圖觀察
dev.new()
par(mfrow = c(1, 1))
par(mar = c(9, 5, 3, 3))
par(cex = 0.8)
boxplot(GSE7014_gcrma, # 表達(dá)矩陣
las = 2,
col = rep(c("blue", "red"), each = 20), # 20個(gè)實(shí)驗(yàn)組
ylab = "Log intensity")
# dev.off()
PCA_new(GSE7014_expr_gcrma,
nrow(GSE7014_gcrma),
group = GSE7014_targets$group,
show_name = T)
## 保存變量后續(xù)使用
save(GSE7014_expr_gcrma, GSE7014_targets,
file = "affymetrix/GSE29450/GSE29450_after_gcrma.Rdata")