查看當(dāng)前位置
getwd()
getwd()
[1] "/media/albert/0E3711AB0E3711AB/Research/TCGA/GSE1323"
使用getwd函數(shù)來顯示當(dāng)前工作目錄,使用setwd函數(shù)改變當(dāng)前目錄
setwd(/media/albert/0E3711AB0E3711AB/Research/TCGA/GSE1323/GSE1323")
列出當(dāng)前文件夾里文件
list.files()
[1] "GSE1323" "gse1323.r"
包的安裝源添加
#ChAMP包的安裝
source("https://bioconductor.org/biocLite.R")
options(BioC_mirror="http://mirrors.ustc.edu.cn/bioc/")
biocLite("ChAMP")
如果是墻內(nèi)通惫,最好加上options選擇中科大鏡像,因?yàn)橛泻枚嘁蕾嚢枰惭b
改變包的源
install.packages(c('europepmc', 'ggplotify', 'ggraph', 'ggridges'))
Installing packages into ‘/home/albert/R/x86_64-pc-linux-gnu-library/3.4’
(as ‘lib’ is unspecified)
Error in readRDS(dest) : error reading from connection
options(repos=structure(c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/")))
shell 環(huán)境下安裝github上的包
Rscript -e 'devtools::install_local("enrichplot")'##已下載的包
R 環(huán)境安裝github上的包
devtools::install_local("enrichplot")##已下載的包
devtools::install_github(c("GuangchuangYu/DOSE", "GuangchuangYu/clusterProfiler"))
流程化安裝包
library(BiocManager)
deps <- c("pathview","clusterProfiler","ELMER", "DO.db","GO.db",
"ComplexHeatmap","EDASeq", "TCGAbiolinks","AnnotationHub",
"gaia","ChIPseeker","minet","BSgenome.Hsapiens.UCSC.hg19",
"MotifDb","MotIV", "rGADEM", "motifStack","RTCGAToolbox")
for(pkg in deps) if (!pkg %in% installed.packages()) install(pkg, dependencies = TRUE)
deps <- c("devtools","DT","pbapply","readr","circlize")
for(pkg in deps) if (!pkg %in% installed.packages()) install.packages(pkg,dependencies = TRUE)
devtools::install_github("BioinformaticsFMRP/TCGAWorkflowData")
devtools::install_github("BioinformaticsFMRP/TCGAWorkflow", dependencies = TRUE)
刪除安裝的某個(gè)包
remove.packages(GEOquery)
Load and Unload Packages in R
You load the fortunes package like this:
library(fortunes)
The detach() function will let you unload a package
detach(package:fortunes)
suppressMessage
不顯示載入包提示信息:
suppressMessages(library(tidyverse))
缺失值
x[is.na(x)] = mean(x[!is.na(x)]) #填空值/補(bǔ)缺失值
矩陣 OR 向量
library(dplyr)
dat <- read.csv("femaleMiceWeights.csv")
control = filter(dat, Diet == "chow") %>% select(Bodyweight) %>% unlist # filter(dat, Diet == "chow") %>% select(Bodyweight) 還是個(gè)矩陣
> dat <- read.csv("femaleMiceWeights.csv")
> dat
Diet Bodyweight
1 chow 21.51
2 chow 28.14
3 chow 24.04
4 chow 23.45
5 chow 23.68
6 chow 19.79
7 chow 28.40
8 chow 20.98
9 chow 22.51
10 chow 20.10
11 chow 26.91
12 chow 26.25
13 hf 25.71
14 hf 26.37
15 hf 22.80
16 hf 25.34
17 hf 24.97
18 hf 28.14
19 hf 29.58
20 hf 30.92
21 hf 34.02
22 hf 21.90
23 hf 31.53
24 hf 20.73
> dat[,2]
[1] 21.51 28.14 24.04 23.45 23.68 19.79 28.40 20.98 22.51 20.10 26.91 26.25
[13] 25.71 26.37 22.80 25.34 24.97 28.14 29.58 30.92 34.02 21.90 31.53 20.73
> dat[,"Bodyweight"]
[1] 21.51 28.14 24.04 23.45 23.68 19.79 28.40 20.98 22.51 20.10 26.91 26.25
[13] 25.71 26.37 22.80 25.34 24.97 28.14 29.58 30.92 34.02 21.90 31.53 20.73
> x=dat[,2]
> class(x)
[1] "numeric"
> control = filter(dat, Diet == "chow") %>% select(Bodyweight)
> control
Bodyweight
1 21.51
2 28.14
3 24.04
4 23.45
5 23.68
6 19.79
7 28.40
8 20.98
9 22.51
10 20.10
11 26.91
12 26.25
> control[,1]
[1] 21.51 28.14 24.04 23.45 23.68 19.79 28.40 20.98 22.51 20.10 26.91 26.25
dplyr
Extract the first, last or nth value from a vector
Description:
These are straightforward wrappers around ‘[[’. The main advantage
is that you can provide an optional secondary vector that defines
the ordering, and provide a default value to use when the input is
shorter than expected.
Usage:
nth(x, n, order_by = NULL, default = default_missing(x))
first(x, order_by = NULL, default = default_missing(x))
last(x, order_by = NULL, default = default_missing(x))
Arguments:
x: A vector
n: For ‘nth_value()’, a single integer specifying the position.
Negative integers index from the end (i.e. ‘-1L’ will return
the last value in the vector).
If a double is supplied, it will be silently truncated.
order_by: An optional vector used to determine the order
default: A default value to use if the position does not exist in the
input. This is guessed by default for base vectors, where a
missing value of the appropriate type is returned, and for
lists, where a ‘NULL’ is return.
For more complicated objects, you'll need to supply this
value. Make sure it is the same type as ‘x’.
Value:
A single value. ‘[[’ is used to do the subsetting.
Examples:
x <- 1:10
y <- 10:1
first(x)
last(y)
nth(x, 1)
nth(x, 5)
nth(x, -2)
nth(x, 11)
last(x)
# Second argument provides optional ordering
last(x, y)
# These functions always return a single value
first(integer())
NAs produced by integer overflow
dat[16,1]dat[16,2]
[1] NA
Warning message:
In dat[16, 1] * dat[16, 2] : NAs produced by integer overflow
as.numeric(dat[16,1]) as.numeric(dat[16,2])
[1] 16030361700
dat[16,1]
[1] 150
dat[16, 2]
[1] 106869078
150 * 106869078
[1] 16030361700
ggplot2
theme_classic
theme_classic(): 無柵格
library("ggplot2")
ggplot(data = mpg, aes(x = displ,y = hwy,color = class)) + geom_point( )
ggplot(data = mpg, aes(x = displ,y = hwy,color = class)) + geom_point( ) + theme_classic()
顯著性添加工具---ggsignif
ggsignif 是發(fā)表在github上的開源包,專門用于在box plot上添加顯著性標(biāo)簽。
https://github.com/const-ae/ggsignif
安裝穩(wěn)定版本:
install.packages("ggsignif")
安裝最新開發(fā)版本:
devtools::install_github("const-ae/ggsignif")
library(ggplot2)
library(ggsignif) #載入ggsignif
#我們使用iris數(shù)據(jù)集作為演示也糊,iris數(shù)據(jù)集Species作為分類標(biāo)簽葫盼,Species有3個(gè)類別("versicolor"、"virginica"叠艳、"setosa")
#Species的三組兩兩分別作差異性檢驗(yàn),提前設(shè)定好配對(duì)分析的list:
compaired_list <- list(c("versicolor", "virginica"),
c("versicolor","setosa"),
c("virginica","setosa"))
#繪制geom_boxplot()易阳,選擇好統(tǒng)計(jì)檢驗(yàn)方法:
p <- ggplot(iris, aes(Species, Sepal.Width, fill = Species)) +
geom_boxplot() +
ylim(1.5, 6.5) +
geom_signif(comparisons = compaired_list,
step_increase = 0.3,
map_signif_level = F,
test = wilcox.test) +
theme_classic()
p
##不顯示p值附较,而顯示***
p <- ggplot(iris, aes(Species, Sepal.Width, fill = Species)) +
geom_boxplot() +
ylim(1.5, 6.5) +
geom_signif(comparisons = compaired_list,
step_increase = 0.3,
map_signif_level = T,
test = wilcox.test) +
theme_classic()
p
topTags: Table of the Top Differentially Expressed Tags
(edgeR)
Description
Extracts the top DE tags in a data frame for a given pair of groups, ranked by p-value or absolute log-fold change.
Usage
topTags(object, n=10L, adjust.method="BH", sort.by="PValue", p.value=1)
# generate raw counts from NB, create list object
y <- matrix(rnbinom(80,size=1,mu=10),nrow=20)
d <- DGEList(counts=y,group=rep(1:2,each=2),lib.size=rep(c(1000:1001),2))
rownames(d$counts) <- paste("gene",1:nrow(d$counts),sep=".")
# estimate common dispersion and find differences in expression
# here we demonstrate the 'exact' methods, but the use of topTags is
# the same for a GLM analysis
d <- estimateCommonDisp(d)
de <- exactTest(d)
# look at top 10
topTags(de)
# Can specify how many genes to view
tp <- topTags(de, n=15)
# Here we view top 15
tp
# Or ** order by fold change instead**
topTags(de,sort.by="logFC")
markdown 數(shù)理化公式
Math and Statistics
Algebra
x = a_0 + \frac{1}{a_1 + \frac{1}{a_2 + \frac{1}{a_3 + a_4}}}
x = a_0 + \frac{1}{\displaystyle a_1 + \frac{1}{\displaystyle a_2 + \frac{1}{\displaystyle a_3 + a_4}}}
Trig
\cos^{-1}\theta
\sin^{-1}\theta
e^{i \theta}
\left(\frac{\pi}{2}-\theta \right )
Geometry
\overleftrightarrow{AB}
\widehat{AB}
Calculus
\frac{\partial y}{\partial x}
\fracrsw7tcj{dx}cn=nx{n-1}
\frac0zjko2e{dx}\ln(x)=\frac{1}{x}
\frac5atbylh{dx}\sin x=\cos x
$\sqrt[3]{\frac xy}$
$$ x = \dfrac{-b \pm \sqrt{b^2 - 4ac}}{2a} $$
Matrices
sets
Statistics
Hypergeometric test: 超幾何檢驗(yàn)
$p=1-\sum_{k=0}^m \frac{\binom{K}{k}\binom{N-K}{n-k}}{\binom{N}{n}}$
-N: total number of genes (20000)
-K: total number of genes annotated with this term
-n: number of genes in the selected list
-k: number of genes in the list annotated with this term
$$t= \frac{\bar x_1 - \bar x_2}{\sqrt{\frac{s_1^2}{n1} + \frac{s_2^2}{n2}}}$$
Phisics
Chemistry
Others
- FPKM
FPKM是Fragments Per Kilobase per Million的縮寫
$FPKM=\frac {10^6 \times n_f}{L \times N}$
其中潦俺, 是比對(duì)至目標(biāo)基因的fragments的數(shù)量拒课。 L是目標(biāo)基因的外顯子長(zhǎng)度之和除以1000,單位是kb事示,不是bp早像;N是總有效比對(duì)至基因組的read數(shù)量。
表格
FDR(False Discovery Rate)
| |# not rejected <br>Not called |# rejected <br>Called |Total
------|----|----|----
# H 0 <br>Two groups similar |U |V |$m_0$
# H 1 <br>Two groups different |T |S |$m_1$
# not rejected Not called |
# rejected Called |
Total | |
---|---|---|---|
# H 0 Two groups similar |
U | V | |
# H 1 Two groups different |
T | S | |
Total | m-R | R | m |
表格單元內(nèi)字換行用<br>
R包的使用與實(shí)用技巧
TCGA數(shù)據(jù)ENSG轉(zhuǎn)換為基因名(Symbol)
TCGA的RNAseq數(shù)據(jù)肖爵,是使用gencode進(jìn)行基因注釋的卢鹦。因此,下載raw_counts后劝堪,基因名稱是"ENSG"開頭的ensemble號(hào)冀自。ensemble號(hào)沒有實(shí)際意義揉稚,而發(fā)表文章常用的是基因名稱,經(jīng)常閱讀文獻(xiàn)可以從基因名稱猜測(cè)基因功能熬粗。因此搀玖,需要將ENSG轉(zhuǎn)換為Symbol(基因名稱)。
將ENSG轉(zhuǎn)換Symbol驻呐,是常規(guī)操作巷怜,于是,首先選擇找找已有的R包暴氏,而不是選擇擼起柚子造輪子延塑。
# 安裝包
source("https://bioconductor.org/biocLite.R")
biocLite("AnnotationDbi")
biocLite("org.Hs.eg.db")
# 加載包
library("AnnotationDbi")
library("org.Hs.eg.db")
# for test
GENEID <- c(1,2,3,9,10)
ENSEMBL <- c("ENSG00000121410","ENSG00000175899","ENSG00000256069","ENSG00000171428","ENSG00000156006")
df <- data.frame(ENSEMBL,GENEID)
# ENSG轉(zhuǎn)換Symbol
df$symbol <- mapIds(org.Hs.eg.db,
keys=ENSEMBL,
column="SYMBOL",
keytype="ENSEMBL",
multiVals="first")
#write.table "quote =F"和“quote =T(缺省)”的區(qū)別:
write.table(filt_set_annotated, file="filt_set_annotated.txt", row.names=F, sep="\t", quote =F)
~~~
ID adj.P.Val P.Value t B logFC SPOT_ID SEQUENCE Alias source chrom strand txStart txEnd circRNA_type best_transcript GeneSymbol
ASCRP002600 1.73691714077242e-11 5.00408280257107e-15 -35.5867037859303 24.2526719407331 -4.02735361814286 hsa_circRNA_102272 CCTAACCCAGGATGACGTCATGATGCCCCAGCGGGTGAGGCTGCAA hsa_circ_0000059 circBase chr1 + 40506473 40506637 intronic NM_001105530 CAP1
~~~
write.table(filt_set_annotated, file="filt_set_annotated.txt", row.names=F, sep="\t", quote =T)
OR
write.table(filt_set_annotated, file="filt_set_annotated.txt", row.names=F, sep="\t")
~~~
"ID" "adj.P.Val" "P.Value" "t" "B" "logFC" "SPOT_ID" "SEQUENCE" "Alias" "source" "chrom" "strand" "txStart" "txEnd" "circRNA_type" "best_transcript" "GeneSymbol"
"ASCRP002600" 1.73691714077242e-11 5.00408280257107e-15 -35.5867037859303 24.2526719407331 -4.02735361814286 "hsa_circRNA_102272" "CCTAACCCAGGATGACGTCATGATGCCCCAGCGGGTGAGGCTGCAA" "hsa_circ_0000059" "circBase" "chr1" "+" "40506473" "40506637" "intronic" "NM_001105530" "CAP1"
~~~