初級(jí)題目
正式開始我們的旅程
library(tidyverse)
library(ggpubr)
- 軟件安裝以及R包安裝
參考:http://www.bio-info-trainee.com/3727.html
# # 先注釋掉吉挣,避免在Rmarkdown中運(yùn)行
# rm(list = ls())
# options()$repos
# options()$BioC_mirror
# options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
# options("repos" = c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
# options()$repos
# options()$BioC_mirror
#
# # https://bioconductor.org/packages/release/bioc/html/GEOquery.html
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("KEGG.db",ask = F,update = F)
# BiocManager::install(c("GSEABase","GSVA","clusterProfiler" ),ask = F,update = F)
# BiocManager::install(c("GEOquery","limma","impute" ),ask = F,update = F)
# BiocManager::install(c("genefu","org.Hs.eg.db","hgu133plus2.db" ),ask = F,update = F)
#
# # 下面代碼被我注釋了,意思是這些代碼不需要運(yùn)行废麻,因?yàn)樗^時(shí)了,很多舊教程就忽略
# # source("https://bioconductor.org/biocLite.R")
# # library('BiocInstaller')
# # options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
# # BiocInstaller::biocLite("GEOquery")
# # BiocInstaller::biocLite(c("limma"))
# # BiocInstaller::biocLite(c("impute"))
#
# options()$repos
# install.packages('WGCNA')
# install.packages(c("FactoMineR", "factoextra"))
# install.packages(c("ggplot2", "pheatmap","ggpubr"))
# library("FactoMineR")
# library("factoextra")
#
# library(GSEABase)
# library(GSVA)
# library(clusterProfiler)
# library(genefu)
# library(ggplot2)
# library(ggpubr)
# library(hgu133plus2.db)
# library(limma)
# library(org.Hs.eg.db)
# library(pheatmap)
- 打開 Rstudio 告訴我它的工作目錄
getwd()
3.新建6個(gè)向量模庐,基于不同的原子類型烛愧。(重點(diǎn)是字符串,數(shù)值掂碱,邏輯值)
a1 <- 1:10
a1
class(a1)
a2 <- c('hello','the','world')
class(a2)
a3 <- c(T,T,T,F,F,T,T,F)
a3
class(a3)
a4 <- 607L
a4
class(a4)
a5 <- seq(from = 0, to = 20, by =2)
a5
class(a5)
a6 <- c(13.14,14)
a6
class(a6)
4.新建一些數(shù)據(jù)結(jié)構(gòu)屑彻,比如矩陣,數(shù)組顶吮,數(shù)據(jù)框,列表等重點(diǎn)是數(shù)據(jù)框粪薛,矩陣)
c <- c(1,2,3)
list <- list(c(1,2,3),'3,14')
M = matrix( c('a','a','b','c','b','a'), nrow = 2, ncol = 3, byrow = TRUE)
a <- array(c('a','b'),dim = c(3,3,2))
f <- factor(c)
df <- data.frame(gene=paste0("gene",1:15),
s1=rnorm(n=15),s2=rnorm(n=15),s3=rnorm(n=15),s4=rnorm(n=15),s5=rnorm(n=15))
# 取df的第1悴了,3行,取第4违寿,6列
df[c(1,3),]
df[,c(4,6)]
5.使用data函數(shù)來加載R內(nèi)置數(shù)據(jù)集 rivers湃交,其他數(shù)據(jù)集:
data("rivers")
#Lengths of Major North American Rivers
head(rivers)
tail(rivers)
# 數(shù)據(jù)集的長(zhǎng)
length(rivers)
# structure 顯示對(duì)象內(nèi)部結(jié)構(gòu)
str(rivers)
# 獲取描述性統(tǒng)計(jì)量(最小值/最大值/四分位數(shù)/數(shù)值型變量/因子向量/邏輯型向量)
summary(rivers)
rm(list = ls())
- 下載 https://www.ncbi.nlm.nih.gov/sra?term=SRP133642 里面的 RunInfo Table 文件讀入到R里面,了解這個(gè)數(shù)據(jù)框藤巢,多少列搞莺,每一列都是什么屬性的元素
打開鏈接https://www.ncbi.nlm.nih.gov/sra?term=SRP133642
點(diǎn)擊Send results to Run selector
鏈接
點(diǎn)擊RunInfo Table
按鈕即可下載RunInfo Table
文件
rm(list = ls())
options(stringsAsFactors = F)
Table <- read.table(file = "SraRunTable.txt",header = T,sep = '\t')
class(Table)
dim(Table) # 查看data.frame維度
ncol(Table) #查看data.frame列數(shù)
str(Table)
- 下載 https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE111229 里面的樣本信息sample.csv讀入到R里面,了解這個(gè)數(shù)據(jù)框掂咒,多少列才沧,每一列都是什么屬性的元素
7.1 打開GEO主頁https://www.ncbi.nlm.nih.gov/geo/,點(diǎn)擊Samples
鏈接
7.2 在搜索欄中輸入GSE111229
绍刮,點(diǎn)擊Search
按鈕温圆,然后點(diǎn)擊Export
按鈕
7.3 在彈出的對(duì)話框點(diǎn)擊Export
按鈕,得到sample.csv
# 讀取樣本信息
sample <-read.csv("sample.csv")
colnames(sample)
dim(Table)
## [1] 768 31
dim(sample)
## [1] 768 12
d = merge(Table,sample,by.x = "Sample_Name",by.y = "Accession")
# merge() 函數(shù) 此時(shí)Table的Sample_Name列和sample的Accession列內(nèi)容相同孩革,合并這一列
dim(d)
library(tidyverse)
d = sample %>%
dplyr::rename("Sample_Name" = "Accession") %>%
left_join(Table,by = "Sample_Name")
dim(d)
- 對(duì)前面讀取的 RunInfo Table 文件在R里面探索其MBases列岁歉,包括 箱線圖(boxplot)和五分位數(shù)(fivenum),還有頻數(shù)圖(hist)膝蜈,以及密度圖(density)
# Mbases樣本的堿基數(shù)
# 舉例:
boxplot(Table$MBases, main = "boxplot of MBases")
plot(fivenum(Table$MBases), main = "fivenum of MBases")
hist(Table$MBases, main = "hist of MBases")
plot(density(Table$MBases,na.rm=T), main = "density of MBases")
- 把前面讀取的樣本信息表格的樣本名字根據(jù)下劃線分割看第3列元素的統(tǒng)計(jì)情況锅移。第三列代表該樣本所在的plate
title = sample$Title
plate = unlist(lapply(title,function(x){stringr::str_split(x,"_")[[1]][3]}))
table(plate)
- 根據(jù)plate把關(guān)聯(lián)到的 RunInfo Table 信息的MBases列分組檢驗(yàn)是否有統(tǒng)計(jì)學(xué)顯著的差異
# plate 指384孔PCR板熔掺,編號(hào)分別是48號(hào)和49號(hào)
t.test(Table$MBases~plate)
- 分組繪制箱線圖(boxplot),頻數(shù)圖(hist)非剃,以及密度圖(density)
boxplot(d$MBases~plate)
typeof(plate)
e = d[,c("MBases","Title")]
e$plate = plate
hist(e$MBases,freq = F, breaks = "sturges")
plot(density(e$MBases,na.rm=T))
# 比較簡(jiǎn)陋置逻,可以嘗試用ggplot2和ggpubr畫圖包
- 使用ggplot2把上面的圖進(jìn)行重新繪制
suppressMessages(library(ggplot2))
e$plate = plate
e$num=c(1:768)
colnames(e)
# 箱線圖
ggplot(e,aes(x=plate,y=MBases)) + geom_boxplot()
# 頻數(shù)圖
ggplot(e,aes(x=MBases)) + geom_histogram(fill="lightblue",colour="grey") + facet_grid(plate ~ .) # 頻數(shù)圖
ggplot(e,aes(x=MBases,fill=plate))+geom_histogram()
# 密度圖
ggplot(e,aes(y=MBases,x=num)) + geom_point() + stat_density2d(aes(alpha=..density..),geom = "raster",contour = F)+ facet_grid(plate ~ .)
ggplot(e,aes(x=MBases,fill=plate))+geom_density()
- 使用ggpubr把上面的圖進(jìn)行重新繪制
suppressMessages(library(ggpubr))
ggboxplot(e, x="plate", y="MBases", color = "plate", palette = "aaas",add = "jitter") + stat_compare_means(method = "t.test")
gghistogram(e, x="MBases", fill = "plate",palette = c("#f4424e", "#41a6f4"))
ggdensity(e, x="MBases", fill = "plate", color = "plate", add = "mean",palette = c("#f4424e", "#41a6f4"))
- 隨機(jī)取384個(gè)MBases信息,跟前面的兩個(gè)plate的信息組合成新的數(shù)據(jù)框努潘,第一列是分組诽偷,第二列是MBases,總共是384*3行數(shù)據(jù)
# sample() 函數(shù) 隨機(jī)抽樣
data <- e[sample(nrow(e),384),][,c(3,1,2)]
str(data)
感謝
http://www.reibang.com/p/c07e67e2c757
http://www.reibang.com/p/2e5a5192f219
歡迎關(guān)注生信菜鳥團(tuán)疯坤、生信技能樹1健!压怠!