芯片數(shù)據(jù)質(zhì)控
1.前后對比箱線圖
## --------------------------------------------數(shù)據(jù)檢驗和聚類------------------------------
exprSet_L <- melt(exprSet)
colnames(exprSet_L) <- c("sample", "value")
exprSet_L$group <- rep(pdata$group, each = nrow(exprSet))
exprSet_L[1:10, ]
## 箱線圖
p21 <- ggplot(exprSet_L, aes(x = sample, y = value, fill = group)) +
geom_boxplot() +
theme(
axis.title.x = element_text(face = "italic"),
axis.text.x = element_text(angle = 45, vjust = 0.5),
legend.position = "top"
)
p21 # 數(shù)據(jù)不太規(guī)整
# #ggsave(p21,filename = ".\\plots\\bar_before.pdf",width = 18,height = 9,units = "cm")
# ##標準化后再圖
# exprSet_1 <- normalizeBetweenArrays(exprSet) %>% data.frame()
# exprSet_L <- melt(exprSet_1)
# colnames(exprSet_L) = c('sample','value')
# exprSet_L$group = rep(group_info$group,each = nrow(exprSet))
# p22 <- ggplot(exprSet_L,aes(x = sample, y = value,fill = group)) +geom_boxplot()+
# theme(axis.title.x = element_text(face = 'italic'),
# axis.text.x = element_text(angle = 45 , vjust = 0.5),
# legend.position = 0)
# p22 #表準化后規(guī)整多了雁歌,但知道效果咋樣
#
# plot_qc <- plot_grid(p21,p22,labels = "AUTO",nrow = 2)
#
# ggsave(plot_qc,filename = ".\\plots\\plot_qc.pdf",width = 18,height = 18,units = "cm")
# 把標準化后的數(shù)據(jù)賦值給原始數(shù)據(jù),暫時不做
# exprSet <- exprSet_1
2.提琴圖
## 小提琴圖
# p1 <- ggplot(exprSet_L,aes(x = sample, y = value,fill = group)) +geom_violin()
# p1
3.密度圖
## 密度圖
# p3 <- ggplot(exprSet_L,aes(value,col=group)) +geom_density() + facet_wrap(~sample,nrow = 2)
# print(p3)
4.聚類圖1
## hclust聚類圖
colnames(exprSet) <- paste(pdata$title1, sep = "") ## 樣本名稱改變
# hc <- hclust(dist(t(exprSet))) #hclust需要用plot畫圖逛拱,此例不用
dist.r <- dist(t(exprSet))
res <- hcut(dist.r, k = 2, stand = TRUE)
# Visualize
cluster_unbatch <- fviz_dend(res,
# 加邊框
rect = TRUE,
# 邊框顏色
rect_border = "cluster",
# 邊框線條類型
rect_lty = 2,
# 邊框線條粗細
lwd = 1.2,
# 邊框填充
rect_fill = T,
# 字體大小
cex = 0.8,
# 字體顏色
color_labels_by_k = T,
# 平行放置
horiz = T,
k_colors = "lancet", #指定色板
labels_track_height = 7,
main = element_blank()
)+theme(text = element_text(size = 8))
cluster_unbatch
# 數(shù)據(jù)不可用,需要校正批次效應(yīng)
# 把樣本名改回來
colnames(exprSet) <- pdata$geo_accession
5.PCA圖1
# 畫主成分分析圖需要加載這兩個包
library(FactoMineR)
library(factoextra)
# pca
dat.pca <- PCA(as.data.frame(t(exprSet)), graph = FALSE)
pca_plot1 <- fviz_pca_ind(dat.pca,
geom.ind = "point", # show points only (nbut not "text")
col.ind = group_list, # color by groups
# palette = c("#00AFBB", "#E7B800"), #修改顏色
addEllipses = TRUE, # Concentration ellipses
# legend.title = "GROUP"
) + theme(
legend.position = c(0.80, 1),
text = element_text(size = 8),
legend.key.size = unit(3, "mm"),
legend.title = element_blank()
)
pca_plot1
6.處理批次效應(yīng)
library(sva)
model <- model.matrix(~ as.factor(pdata$group)) # "group"為分組所在列趋翻,pdata是表型文件(元數(shù)據(jù))
### 重點來了2m镉蕖煌珊!
### 重點來了K蕉拧!救欧!
### 重點來了Kゴ狻!笆怠!
# 方法一(ComBat)
combat_edata <- ComBat(dat = exprSet, batch = pdata$batch, mod = model) # exprSet是表達矩陣铝耻,batch是批次信息列,在表型文件里蹬刷,下同
# 使用sva包中的combat函數(shù)!
# 依次是什么數(shù)據(jù)瓢捉、批次來自于哪、想保留什么信息
7.再次查看聚類圖
# 再次聚類查看
colnames(combat_edata) <- paste(pdata$title1, sep = "") ## 樣本名稱再次改變
hc <- hclust(dist(t(combat_edata)))
dist.r <- dist(t(combat_edata))
res <- hcut(dist.r, k = 2, stand = TRUE)
# Visualize
cluster_batched <- fviz_dend(res,
# 加邊框
rect = TRUE,
# 邊框顏色
rect_border = "cluster",
# 邊框線條類型
rect_lty = 2,
# 邊框線條粗細
lwd = 1.2,
# 邊框填充
rect_fill = T,
# 字體大小
cex = 0.8,
# 字體顏色
color_labels_by_k = T,
# 平行放置
horiz = T,
k_colors = "lancet", #指定色板
labels_track_height = 7,
main = element_blank()
)+theme(text = element_text(size = 8))
cluster_batched
plot_batch <- plot_grid(cluster_unbatch,cluster_batched,labels = "AUTO",label_size = 10)
ggsave(plot_batch,filename = ".\\plots\\cluster_batched.pdf",width = 180,height = 89,units = "mm")
8.再次作圖PCA
# pca
dat.pca <- PCA(as.data.frame(t(combat_edata)), graph = FALSE)
pca_plot2 <- fviz_pca_ind(dat.pca,
geom.ind = "point", # show points only (nbut not "text")
col.ind = group_list, # color by groups
# palette = c("#00AFBB", "#E7B800"), #修改顏色
addEllipses = TRUE, # Concentration ellipses
# legend.title = "GROUP"
) + theme(
legend.position = c(0.80, 1),
text = element_text(size = 8),
legend.key.size = unit(3, "mm"),
legend.title = element_blank()
)
pca_plot2
plot_batch <- plot_grid(pca_plot1,pca_plot2,labels = "AUTO",label_size = 10)
ggsave(plot_batch,filename = ".\\plots\\pca_batched.pdf",width = 180,height = 89,units = "mm")
9.后續(xù)處理
# 把校正批次效應(yīng)后的文件賦給表達矩陣
exprSet <- combat_edata %>% data.frame()
# 再次改回樣本名
colnames(exprSet) <- pdata$geo_accession
# 這樣寫就行办成,不用加其他參數(shù)泡态,供CIBERSORT使用,生成TXT后需要到文件里把第一行向后tab一下,否則報錯迂卢。
write.table(exprSet,paste0(".\\data\\",gse_id,"_exprset_to_cibersort.txt"),row.names = T,quote = F,sep = "\t")
質(zhì)控處理完畢某弦,準備差異表達分析。