1、首先是軟件和R package安裝：

rm(list = ls()) 
options()$repos 
options()$BioC_mirror
options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
options("repos" = c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
options()$repos 
options()$BioC_mirror

以上都是鏡像設(shè)置相關(guān)操作媳叨，其中的ustc.edu是設(shè)置成中科大鏡像关顷，下面的tuna.tsinghua.edu.cns是設(shè)置成清華鏡像议双，之所以要設(shè)置鏡像，是因?yàn)橹苯訌?code>CRAN下載R包可能速度較慢汞舱。同樣的宗雇，當(dāng)一個(gè)R包安裝不順利的時(shí)候赔蒲，也可以考慮換個(gè)鏡像去安裝。
關(guān)于鏡像設(shè)置欢际，其實(shí)鼠標(biāo)操作更簡單：
Tools>Global Options>Packages>Change矾兜，選擇中國國內(nèi)鏡像，一共有6個(gè)可選浑槽，一般清華的是最穩(wěn)定的。
這里順便說一下R 包安裝的三種方式，

(1)從CRAN安裝庫：

install.packages("BiocManager")

也可以從Rstudio界面用鼠標(biāo)點(diǎn)擊下載溉旋，效果是一樣的，如圖：

R package install.png

(2)從Bioconductor安裝：

(!requireNamespace("BiocManager", quietly = TRUE)) 
install.packages("BiocManager") 
BiocManager::install("biomaRt", version = "3.8")#安裝biomaRt

(3)github上R包安裝：

devtools::install_github("ggrothendieck/sqldf")#如果沒有devtools需要先安裝devtools,如下：
     install.packages("devtools")
     library(devtools)

大部分包我都已經(jīng)安裝過，只有2個(gè)需要安裝：

library("FactoMineR")
library("factoextra")
library(GSEABase)
library(GSVA)
BiocManager::install('GSVA')
library(clusterProfiler)
library(genefu)#這個(gè)包不太好裝梧油，看網(wǎng)，win7比win10貌似好裝點(diǎn)
if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
BiocManager::install("genefu")

library(ggplot2)
library(ggpubr)
library(hgu133plus2.db)
library(limma)
library(org.Hs.eg.db)
library(pheatmap)

2褪子、下面開始做題：

1骗村、打開 Rstudio 告訴我它的工作目錄。#告訴我在你打開的rstudio里面 getwd() 代碼運(yùn)行后返回的是什么笼痛？
getwd()
2缨伊、新建6個(gè)向量进宝，基于不同的原子類型。（重點(diǎn)是字符串紧唱，數(shù)值隶校，邏輯值）

a1<-c("a","b","c")
a2<-c(1:3)#數(shù)值型向量（integer）  
a2_2<-c(2.1,4.2,5.5,6.7,9.3,10.6)#數(shù)值型向量（小數(shù)）    
a3<-c("hello","R","nihao")#字符串向量
a4<-  a1<a2#通過邏輯運(yùn)算賦值邏輯型向量
a4_2 <- c(T,F,F,F,T,F)#直接賦值邏輯值向量   
a5<-factor(c("stageI","stageII","stageIII"))
a6<-as.numeric(a2)+1
a7 <- c(1,2,3,"yes")#這里是看到別人這么寫的深胳，ps，向量不是只能有一種類型嗎轻庆？
a8 <- rep('e',times=3)
a1
a2
a2_2
a3
a4
a4_2
a5
a6
a7
class(a8)
a8

3、新建一些數(shù)據(jù)結(jié)構(gòu)纷宇，比如矩陣蛾方，數(shù)組，數(shù)據(jù)框拓春，列表等重點(diǎn)是數(shù)據(jù)框亚隅，矩陣）

b1<-matrix(LETTERS[1:24],nrow=4,ncol = 6)
b1
b1_1<- matrix(data=c(1:20),nrow=4,ncol=5) 
b1_1
b2<-data.frame(a1,a2)
b2
b3<-matrix(1:10,nrow=5)
b3
b3_0<-c(rep(a1,2),rep(a2,2))
b3_0
b3<-as.data.frame(matrix(b3_0,ncol = 6))
b3
b4<-matrix(rep(c(a1,a2),3),nrow=3,byrow = F)
b4
b5 <- list(Name <- c("Liming","Xiaofang","Xiaohong","Lihua","Shanshan"),Gender <- c("M","F","M","M"),score <- matrix(1:12,3,4))
b5 #創(chuàng)建列表

4煮纵、在你新建的數(shù)據(jù)框進(jìn)行切片操作，比如首先取第1矾瑰，3行隘擎，然后取第4货葬，6列

b1
b1[1,3]
b1[,c(4,6)]#通過位置索引

colnames(b1)<-c(1:6)
b1
b1[,colnames(b1)>3&colnames(b1)!=5]#通過邏輯值索引

dim(b1)
tmp1 <- c(T,F,T,F)
tmp2 <- c(F,F,F,T,F,T)
b1[tmp1,tmp2]#通過邏輯值索引

colnames(b1)<-paste0("col_",c(1:6))
b1
b1[,c("col_4","col_6")]#通過名字索引
b1

5、使用data函數(shù)來加載R內(nèi)置數(shù)據(jù)集 rivers 描述它震桶。并且可以查看更多的R語言內(nèi)置的數(shù)據(jù)集：
R語言系列：探索R自帶數(shù)據(jù)包(這里列舉的是可能會用到和遇到的數(shù)據(jù)，包括一些以前見過的數(shù)據(jù)包）"https://mp.weixin.qq.com/s/dZPbCXccTzuj0KkOL7R31g

data(rivers)
head(rivers)
rivers
data(euro)#歐元匯率蹲姐，長度為11，每個(gè)元素都有命名
euro
data(mtcars)#32輛汽車在11個(gè)指標(biāo)上的數(shù)據(jù)
data(esoph)   ##法國的一個(gè)食管癌病例對照研究    
esoph    
data(iris)     # #3種鳶尾花形態(tài)數(shù)據(jù)
data(longley)    #強(qiáng)共線性的宏觀經(jīng)濟(jì)數(shù)據(jù)   
data(LifeCycleSavings)    #50個(gè)國家的存款率
data(PlantGrowth)     #三種處理方式對植物產(chǎn)量的影響
data(quakes)     #1000次地震觀測數(shù)據(jù)（震級>4）
data(sleep)     #兩藥物的催眠效果
data(ToothGrowth)     #VC劑量和攝入方式對豚鼠牙齒的影響
data(ChickWeight)    #飲食對雞生長的影響
data(CO2)    #耐寒植物CO2攝取的差異
data(Indometh)    #某藥物的藥物動力學(xué)數(shù)據(jù)
data(Theoph)    #茶堿藥動學(xué)數(shù)據(jù)

#時(shí)間序列數(shù)據(jù)
data(ldeaths)    #1974-1979年英國每月支氣管炎忙厌、肺氣腫和哮喘的死亡率
data(fdeaths)    #前述死亡率的女性部分
data(mdeaths)    #前述死亡率的男性部分
data(USAccDeaths)    #1973-1978美國每月意外死亡人數(shù)
data(EuStockMarkets)    #多變量時(shí)間序列逢净。歐洲股市四個(gè)主要指標(biāo)的每個(gè)工作日記錄爹土，共1860條記錄

6、下載 https://www.ncbi.nlm.nih.gov/sra?term=SRP133642 里面的 RunInfo Table 文件讀入到R里面社露，了解這個(gè)數(shù)據(jù)框琼娘，多少列，每一列都是什么屬性的元素。（參考B站生信小技巧獲取runinfo table）這是一個(gè)單細(xì)胞轉(zhuǎn)錄組項(xiàng)目的數(shù)據(jù)挪拟，共768個(gè)細(xì)胞击你，如果你找不到RunInfo Table 文件，可以點(diǎn)擊下載惯雳，然后讀入你的R里面也可以鸿摇。
下載方法拙吉，NCBI---SRA---搜索SRP133642，點(diǎn)擊“Send results to Run selector"筷黔，進(jìn)來后再點(diǎn)擊”RunInfo Table“即可下載。

sra<-read.table('SraRunTable.txt',header = T,sep = '\t')
dim(sra)
str(sra)
colnames(sra)

7椎例、下載 https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE111229 里面的樣本信息sample.csv讀入到R里面订歪，了解這個(gè)數(shù)據(jù)框肆捕，多少列，每一列都是什么屬性的元素掏秩。（參考 https://mp.weixin.qq.com/s/fbHMNXOdwiQX5BAlci8brA 獲取樣本信息sample.csv）如果你實(shí)在是找不到樣本信息文件sample.csv蒙幻，也可以點(diǎn)擊下載。
下載sample.csv:下載并保存GEO數(shù)據(jù)(第一次下載失敗诈豌，第二次成功)

library(GEOquery)
GSE_name = 'GSE111229'
options( 'download.file.method.GEOquery' = 'libcurl' ) #windows系統(tǒng)
gset <- getGEO( 'GSE111229', destdir = '.',getGPL = F )
save( gset, file = 'gset.Rdata' ) #掌握保存和加載Rdata的命令

由于gset是列表抒和，故將其轉(zhuǎn)為可操作的數(shù)據(jù)結(jié)構(gòu)Gset

load("gset.Rdata")
Gset <- gset[[1]]

用GEOquery里的pdata函數(shù)獲取樣本信息
看一下pdata的結(jié)構(gòu)摧莽，很明顯是數(shù)據(jù)框

pdata<-pData(Gset)
dim(pdata)#766行，47列油够，發(fā)現(xiàn)比手動下載的列數(shù)多
#dim查看行列 colnames查看列名

手動下載征懈，載入數(shù)據(jù)https://www.ncbi.nlm.nih.gov/geo/browse/
兩種方法都可以卖哎，這里我用的是手動下載的數(shù)據(jù)

sample<-read.csv('sample.csv',header = T,sep = ',')
head(sample)
class(sample)
View(sample)
dim(sample)
colnames(sample)

8、把前面兩個(gè)步驟的兩個(gè)表（RunInfo Table 文件厦章，樣本信息sample.csv）關(guān)聯(lián)起來照藻，使用merge函數(shù)。

sra_merge<-merge(sra,sample,by.x='Sample_Name',by.y='Accession')

對前面讀取的 RunInfo Table 文件在R里面探索其MBases列群发，包括箱線圖(boxplot)和五分位數(shù)(fivenum)发乔，還有頻數(shù)圖(hist)，以及密度圖(density) 起愈。

boxplot(sra$MBases)
fivenum(sra$MBases)
hist(sra$MBases)
plot(density(sra$MBases))

9、把前面讀取的樣本信息表格的樣本名字根據(jù)下劃線分割看第3列元素的統(tǒng)計(jì)情況官觅。第三列代表該樣本所在的plate, 根據(jù)plate把關(guān)聯(lián)到的 RunInfo Table 信息的MBases列分組檢驗(yàn)是否有統(tǒng)計(jì)學(xué)顯著的差異

sample
head(sra_merge)
sra1<-sra_merge[,c("MBases","Title")]
head(sra1)
save(sra1,file = 'input.Rdata')
load(file = 'input.Rdata')
class(sra1)
str(sra1)
sra1$Title=as.character(sra1$Title)
sra1

strsplit(sra1$Title,"_")[[1]][3]
sra1

plate <- unlist(lapply(sra1$Title,function(x)
  {  x
  strsplit(x,'_')[[1]][3]
}))
table(plate)
class(plate)
head(plate)

str(plate)

10休涤、根據(jù)plate把關(guān)聯(lián)到的 RunInfo Table 信息的MBases列分組檢驗(yàn)是否有統(tǒng)計(jì)學(xué)顯著的差異功氨。

sra1$plate<-plate
head(sra1)
t.test(sra1[,1] ~ plate)
[1] t = 2.3019, df = 728.18, p-value = 0.02162手幢。有顯著差異。

11跺涤、分組繪制箱線圖(boxplot)管钳，頻數(shù)圖(hist)软舌，以及密度圖(density) 。

其實(shí)基礎(chǔ)作圖都可以不用花費(fèi)太多時(shí)間去學(xué)習(xí)醇滥，練練玩玩即可鸳玩，大部分功能都可以用ggplot來解決演闭。
sra1$plate<-as.factor(plate)
sra1$plate<-as.numeric(plate)#這兩個(gè)變量類型的轉(zhuǎn)化需根據(jù)后面分組作圖的要求確定要不要運(yùn)行。
str(sra1$plate)
hist(sra1[,1],main='boxplot',freq = F, breaks = "sturges")
boxplot(sra1[,1]~plate)

plate0 <- sra1[,c("MBases","plate")]
plate1 <- plate0[1:384,1]
hist(plate1)          #此步為0048的頻數(shù)圖
plate2<- plate0[385:768,1]
hist(plate2)   #此步為0049的頻數(shù)圖

density(plate1)
plot(density(plate1))    #此步為0048的密度圖
plot(density(plate2))   #*此步為0049的密度圖#這里的plate1和plate2賦值原先沒有想到窝革，感覺稍微麻煩了點(diǎn)虐译，暫時(shí)沒想到更好的辦法進(jìn)行分組作圖吴趴，當(dāng)然boxplot不需要賦值即可進(jìn)行分組。*

12厢拭、使用ggplot2把上面的圖進(jìn)行重新繪制。

library(ggplot2)
colnames(sra1)
ggplot(sra1,aes(x=plate,y=MBases))+geom_boxplot()+theme_classic()#我喜歡用這個(gè)主題箭昵，背景比較干凈

頻數(shù)圖：柱狀圖和直方圖是很類似的家制，只是柱狀圖更適合畫分類變量泡一，
直方圖更適合把連續(xù)型的數(shù)據(jù)按照一個(gè)個(gè)等長的分區(qū)（bin）來切分，然后計(jì)數(shù)涵但，畫柱狀圖帖蔓。

sra1$plate<-as.factor(plate)

ggplot(sra1,aes(x=MBases,fill=plate))+geom_histogram()#與下面一句等價(jià)
ggplot(sra1)+geom_histogram(aes(x=MBases,fill=plate))#直方圖最容易，提供一個(gè)變量x澈侠，畫出數(shù)據(jù)分布埋酬，并可以根據(jù)其他變量給他填充顏色。
ggplot(sra1)+geom_histogram(aes(x=MBases,fill=plate),position = "dodge")#并列排列
ggplot(sra1)+geom_histogram(aes(x=MBases,fill=plate),position = "fill")#使用position="fill"拳球，按照相對比例來畫

畫成柱狀圖是這個(gè)樣子：

ggplot(sra1)+geom_bar(aes(x=MBases,y=1), stat="identity")#通過stat參數(shù)祝峻，可以讓geom_bar按指定高度畫圖

密度圖

ggplot(sra1,aes(x=MBases,fill=plate))+geom_density()
ggplot(sra1)+geom_density(aes(x=MBases, colour=plate,fill=plate))#color指定線條顏色扎筒，fill指定圖片顏色填充

其他人的答案（boxplot一樣）：(此時(shí)需要先把plate轉(zhuǎn)換為numeric變量)

sra1$plate<-as.numeric(plate)
ggplot(sra1,aes(x=plate,y=MBases))+geom_histogram()   #geom_histogram() 不完整，會報(bào)錯宋距，需要把（）內(nèi)的部分填充
plate1 <- as.data.frame(plate1)
plate2 <- as.data.frame(plate2)            #必須轉(zhuǎn)化成數(shù)據(jù)框才可以作圖症脂，先以plate1為例淫僻，colname為plate1,其實(shí)這里是全部0048的MBases
colnames(plate1)="MBases"   
colnames(plate2)="MBases" 
ggplot(plate1,aes(x=MBases))+geom_histogram(bins = 60,binwidth=0.01,color="blue")
ggplot(plate1,aes(x=MBases))+geom_histogram(bins = 30,color="blue")

可以前后對比雳灵，發(fā)現(xiàn)前后差別悯辙。

ggplot(plate2,aes(x=MBases))+geom_histogram(bins = 60,binwidth=0.01,color="blue")
ggplot(plate2,aes(x=MBases))+geom_histogram(bins = 30,color="blue")

密度圖

ggplot(plate1,aes(x=MBases))+geom_density()
ggplot(plate2,aes(x=MBases))+geom_density()

13迎吵、使用ggpubr把上面的圖進(jìn)行重新繪制。

ggpubr號稱專做出版級的圖拢蛋，確實(shí)做出來比較好看蔫巩。
library(ggpubr)
#boxplot:
p <- ggboxplot(sra1, x="plate", y="MBases", color = "plate", 
               palette = c("#00AFBB", "#E7B800", "#FC4E07"), 
               add = "jitter", shape="plate")#增加了jitter點(diǎn)，點(diǎn)shape由plate映射
p
#換個(gè)顏色(別人答案)
p <- ggboxplot(sra1, x = "plate", y = "MBases",
               color = "plate", palette = "jco",
               add = "jitter")
# Add p-value
p + stat_compare_means(method = 't.test')
#再換個(gè)顏色
ggboxplot(sra1, x="plate", y="MBases", color = "plate", 
          palette = "aaas",add = "jitter") + stat_compare_means(method = "t.test")

#箱線圖+分組形狀+統(tǒng)計(jì)：
my_comparisons <- list(c("0.5", "1"))
p+stat_compare_means(comparisons = my_comparisons)+ #不同組間的比較 
  stat_compare_means(label.y = 60)

#內(nèi)有箱線圖的小提琴圖+星標(biāo)記：
ggviolin(sra1, x="plate", y="MBases", fill = "plate", 
         palette = c("#00AFBB", "#E7B800", "#FC4E07"), 
         add = "boxplot", add.params = list(fill="white"))+ 
  stat_compare_means(comparisons = my_comparisons, label = "p.signif")+#label這里表示選擇顯著性標(biāo)記（星號） 
  stat_compare_means(label.y = 50)

#直方圖：
sra1$plate<-as.factor(plate)
gghistogram(sra1, x="MBases", fill = "plate",palette = c("#f4424e", "#41a6f4"))
gghistogram(sra1, x="MBases", add = "mean", rug = TRUE, color = "plate", fill = "plate",
            palette = c("#00AFBB", "#E7B800"))#帶有均值線和邊際地毯線的直方圖

#密度圖：
ggdensity(sra1, x="MBases", fill = "plate",  color = "plate", add = "mean",palette = c("#f4424e", "#41a6f4"))
ggdensity(sra1, x="MBases", add = "mean", rug = TRUE, color = "plate", fill = "plate",
          palette = c("#00AFBB", "#E7B800"))#帶有均值線和地毯線的密度圖

其他人：

hist_0048
ggplot(plate0,aes(x=MBases))+geom_histogram(bins = 40,binwidth=0.01,color="blue")

14、隨機(jī)取384個(gè)MBases信息坪郭，跟前面的兩個(gè)plate的信息組合成新的數(shù)據(jù)框，第一列是分組信姓，第二列是MBases绸罗，總共是384*3行數(shù)據(jù)珊蟀。

這個(gè)題目沒太明白意思，主要是練習(xí)sample隨機(jī)取樣函數(shù)的用法育灸。
col_4<-sample(nrow(sra1), 384)#用到隨機(jī)抽樣函數(shù)sample
col_4<-sample(sra1$MBases,384)
sra1$col_4<-col_4
head(sra1)
sra1<-sra1[,c(3,1,4,2)]
sra1

別人的答案一：https://blog.csdn.net/weixin_34240520/article/details/91199550

sra2 <- sample(nrow(sra1), 384)
class(sra2)
sra3 <- sra1[sra2,][, c(3, 2, 1)]
dim(sra3)

別人的答案二：http://www.reibang.com/p/448ec60bdd4a

data <-  sra1[sample(nrow(sra1),384),]
data <- data[,c(3,2,4,1)]
str(data)
dim(data)

merge一般都是都是有相同的列或者行才會用的磅崭。

plate3 <- sample(1:nrow(sra1),384)

colnames(plate1) <- "MBases_0048"
colnames(plate2) <- "MBases_0049"

data1 <- data.frame(plate2,plate1)
data2 <- data.frame(plate3,data1)

guangzhou_biotrainee_taskI_junior初級