這里是佳奧!了解了大致流程直颅,我們開始正式的分析實戰(zhàn)博个。
本次實戰(zhàn)的數(shù)據(jù)來源是這篇文章:EBV-miR-BART8-3p induces epithelial-mesenchymal transition and promotes metastasis of nasopharyngeal carcinoma cells through activating NF-κB and Erk1/2 pathways
參考推文:
http://www.bio-info-trainee.com/6997.html
1 miRNA-Seq環(huán)境搭建
參考序列文件
在前一篇我們已經(jīng)準(zhǔn)備好了MIRBASE的miRNA參考序列文件。
mkdir -p ~/refer/miRNA
cd ~/refer/miRNA
wget https://www.mirbase.org/ftp/CURRENT/hairpin.fa.gz ## 38589 reads
wget https://www.mirbase.org/ftp/CURRENT/mature.fa.gz ## 48885 reads
wget https://www.mirbase.org/ftp/CURRENT/genomes/hsa.gff3
gzip -d hairpin.fa.gz
gzip -d mature.fa.gz
grep sapiens mature.fa |wc -l # 2656
grep sapiens hairpin.fa |wc # 1917
## Homo sapiens
perl -alne '{if(/^>/){if(/Homo/){$tmp=1}else{$tmp=0}};next if $tmp!=1;s/U/T/g if !/>/;print }' hairpin.fa > hairpin.human.fa
perl -alne '{if(/^>/){if(/Homo/){$tmp=1}else{$tmp=0}};next if $tmp!=1;s/U/T/g if !/>/;print }' mature.fa > mature.human.fa
# 這里值得一提的是miRBase數(shù)據(jù)庫下載的序列功偿,居然都是用U表示的盆佣,也就是說就是miRNA序列,而不是轉(zhuǎn)錄成該miRNA的基因序列械荷,而我們測序的都是基因序列共耍。
配置conda
conda create -n mirna
conda activate mirna
conda install -y -c bioconda hisat2 bowtie samtools fastp bowtie2 fastx_toolkit fastqc multiqc
bowtie構(gòu)建索引
##當(dāng)前refer目錄下的文件
(mirna) root 11:02:43 /home/kaoku/refer/miRNA
$ ls -lh
總用量 11M
hairpin.fa
hairpin.human.fa
hsa.gff3
mature.fa
mature.human.fa
bowtie-build mature.human.fa hsa-mature-bowtie-index
bowtie-build hairpin.human.fa hsa-hairpin-bowtie-index
##得到文件
hsa-hairpin-bowtie-index.1.ebwt
hsa-hairpin-bowtie-index.2.ebwt
hsa-hairpin-bowtie-index.3.ebwt
hsa-hairpin-bowtie-index.4.ebwt
hsa-hairpin-bowtie-index.rev.1.ebwt
hsa-hairpin-bowtie-index.rev.2.ebwt
hsa-mature-bowtie-index.1.ebwt
hsa-mature-bowtie-index.2.ebwt
hsa-mature-bowtie-index.3.ebwt
hsa-mature-bowtie-index.4.ebwt
hsa-mature-bowtie-index.rev.1.ebwt
hsa-mature-bowtie-index.rev.2.ebwt
2 文章數(shù)據(jù)下載
下載SRA文件
vdb-config -i
##進入配置文件后按c,在location of user-repository設(shè)置下載好的文件的位置,設(shè)置完成后按s保存x退出界面
##添加到環(huán)境
export PATH="$PATH:/home/kaoku/biosoft/sratoolkit/sratoolkit.3.0.0-ubuntu64/bin"
##下載第一個項目p1的SRA文件
SRR1542714
SRR1542715
SRR1542716
SRR1542717
SRR1542718
SRR1542719
cat srr.list | while read id; do ( prefetch $id & ); done
mv SRR* /home/kaoku/project/miRNA/p1/
##第二個項目p2的SRA文件
SRR7707744
SRR7707745
SRR7707746
SRR7707747
SRR7707748
SRR7707749
SRR7707750
SRR7707751
SRR7707752
SRR7707753
SRR7707754
cat srr.list | while read id; do ( prefetch $id & ); done
mv SRR* /home/kaoku/project/miRNA/p2/
轉(zhuǎn)fastq文件
##p1目錄
echo {14..19} |sed 's/ /\n/g' |while read id; \
do ( fastq-dump -O ./ --gzip --split-3 SRR15427${id} & ) ; \
done
##p2目錄
echo {44..54} |sed 's/ /\n/g' |while read id; \
do ( fastq-dump -O ./ --gzip --split-3 SRR77077${id} & ) ; \
done
質(zhì)控和清洗測序數(shù)據(jù)
##新建qc目錄,進行質(zhì)量控制
ls ../raw_fq/*gz | xargs fastqc -t 10 -o ./
multiqc ./
##清洗主要是fastx_toolkit修剪
ls *gz |while read id
do
echo $id
# fastqc $id
zcat $id| fastq_quality_filter -v -q 20 -p 80 -Q33 -i - -o tmp ;
fastx_trimmer -v -f 1 -l 27 -m 15 -i tmp -Q33 -z -o ${id%%.*}_clean.fq.gz ;
# fastqc ${id%%.*}_clean.fq.gz
done
##會進行兩步操作养葵,質(zhì)量控制并修剪長度
ls *gz |while read id
do
echo $id
zcat $id| fastq_quality_filter -v -q 20 -p 80 -Q33 -i - -o tmp ;
fastx_trimmer -v -f 1 -l 27 -m 15 -i tmp -Q33 -z -o ${id%%.*}_clean.fq.gz ;
done
#fastq_quality_filter和fastx_trimmer兩個命令征堪,都是來自于fastx_toolkit軟件包:
#fastq_quality_filter 代表根據(jù)質(zhì)量過濾序列
#fastx_trimmer 代表縮短FASTQ或FASTQ文件中的讀數(shù),根據(jù)質(zhì)量修剪(剪切)序列关拒。
##文件大小變化情況
-rw-r--r-- 1 root root 39M 8月 29 16:06 SRR1542714_clean.fq.gz
-rw-r--r-- 1 root root 50M 8月 29 14:50 SRR1542714.fastq.gz
第一種比對策略:bowtie比對 miRBaes數(shù)據(jù)庫下載的序列 +定量
##同時比對mature與hairpin佃蚜,注意hsa-mature-bowtie-index是文件名前綴,不是目錄名
mature=/home/kaoku/refer/miRNA/hsa-mature-bowtie-index
hairpin=/home/kaoku/refer/miRNA/hsa-hairpin-bowtie-index
ls *_clean.fq.gz |while read id
do
echo $id
bowtie -n 0 -m1 --best --strata $mature $id -S ${id}_matrue.sam
bowtie -n 0 -m1 --best --strata $hairpin $id -S ${id}_hairpin.sam
done
##壓縮.sam成.bam并刪除.sam
ls *.sam|while read id ;do (samtools sort -O bam -@ 5 -o $(basename ${id} ".sam").bam ${id});done
rm *.sam
-n 0 不允許錯配着绊,但是結(jié)果來看比對率很低谐算,修改成允許一個錯配的話:
ls *_clean.fq.gz |while read id
do
echo $id
bowtie -n 1 -m1 --best --strata $mature $id -S ${id}_matrue.n1.sam
bowtie -n 1 -m1 --best --strata $hairpin $id -S ${id}_hairpin.n1.sam
done
隨后得到了.bam文件
(mirna) root 16:46:10 /home/kaoku/project/miRNA/p1/align
$ ls
SRR1542714_clean.fq.gz_hairpin.bam SRR1542715_clean.fq.gz_matrue.bam SRR1542717_clean.fq.gz_hairpin.bam SRR1542718_clean.fq.gz_matrue.bam
SRR1542714_clean.fq.gz_matrue.bam SRR1542716_clean.fq.gz_hairpin.bam SRR1542717_clean.fq.gz_matrue.bam SRR1542719_clean.fq.gz_hairpin.bam
SRR1542715_clean.fq.gz_hairpin.bam SRR1542716_clean.fq.gz_matrue.bam SRR1542718_clean.fq.gz_hairpin.bam SRR1542719_clean.fq.gz_matrue.bam
##查看每個基因表達(dá)量
samtools view .bam | cut -f 3 | sort | uniq -c
批量走 samtools idxstats 得到表達(dá)量矩陣:
ls *.bam | xargs -i samtools index {}
ls *.bam | while read id ;do (samtools idxstats ${id} > ${id}.txt );done
# samtools view matrue.bam |cut -f 3 |sort |uniq -c
第二種比對策略:bowtie2比對參考基因組+定量
下載hg38參考基因組:
wget -c https://genome-idx.s3.amazonaws.com/bt/GRCh38_noalt_as.zip
比對參考基因組
##查看一下index目錄
(mirna) root 19:22:44 /home/kaoku/refer/miRNA/hg38
$ ls
GRCh38_noalt_as.zip hg38.1.bt2 hg38.2.bt2 hg38.3.bt2 hg38.4.bt2 hg38.rev.1.bt2 hg38.rev.2.bt2
index=/home/kaoku/refer/miRNA/hg38/hg38
ls *_clean.fq.gz |while read id
do
echo $id
bowtie2 -p 4 -x $index -U $id | samtools sort -@ 4 -o ${id}_genome.bam -
done
##比對結(jié)果
SRR1542714_clean.fq.gz_genome.bam SRR1542716_clean.fq.gz_genome.bam SRR1542718_clean.fq.gz_genome.bam
SRR1542715_clean.fq.gz_genome.bam SRR1542717_clean.fq.gz_genome.bam SRR1542719_clean.fq.gz_genome.bam
定量分析
##確保安裝subread
gtf=/home/kaoku/refer/miRNA/hsa.gff3
featureCounts -T 4 -F gff -M -t miRNA -g Name -a $gtf -o all.counts.mature.txt *genome* 1>counts.mature.log 2>&1
featureCounts -T 4 -F gff -M -t miRNA_primary_transcript -g Name -a $gtf -o all.counts.hairpin.txt *genome* 1>counts.hairpin.log 2>&1
##結(jié)果
all.counts.hairpin.txt
all.counts.hairpin.txt.summary
all.counts.mature.txt
all.counts.mature.txt.summary
counts.hairpin.log
counts.mature.log
3 比較兩種定量分析方法差異
比較miRBaes數(shù)據(jù)庫下載的序列和參考基因組的兩組表達(dá)矩陣。
QQ截圖20220829203444.png
hairpin<-read.table('all.counts.hairpin.txt',header = T)
m1<-hairpin[,7:12]
rownames(m1)<-hairpin[,1]
paste *_clean.fq.gz_hairpin.bam.txt | cut -f 1,3,7,11,15,19,23 > hairpin.txt
m2<-read.table('hairpin.txt',header = F,row.names = 1)
ids<-intersect(rownames(m1),rownames(m2))
##構(gòu)建merge matrix
overM<-cbind(m1[ids,],m2[ids,])
##LOG轉(zhuǎn)換后的COUNTS矩陣
pheatmap::pheatmap(cor(log2(overM+1)))
QQ截圖20220829211943.png
##取CPM后的矩陣
cpmM<-log(edgeR::cpm(overM)+1)
pheatmap::pheatmap(cor(cpmM))
##選取TOP500的MAD基因
highM=cpmM[names(tail(sort(apply(cpmM,1,mad)),500)),]
pheatmap::pheatmap(cor(highM))
##獨立選取各自的top500的mad基因归露,再去交集
hg1=names(tail(sort(apply(cpmM[,1:6],1,mad)),500))
hg2=names(tail(sort(apply(cpmM[,7:12],1,mad)),500))
hg=intersect(hg1,hg2)
hgM=cpmM[hg,]
pheatmap::pheatmap(cor(hgM))
QQ截圖20220829212715.png
流程對于定量分析結(jié)果影響很大洲脂。
##進一步探索兩種方法對于比對數(shù)的差異
colnames(overM)=NULL
換讀取另一個
mature<-read.table('all.counts.mature.txt',header = T)
m1<-mature[,7:12]
rownames(m1)<-mature[,1]
cpmM<-log(edgeR::cpm(overM)+1)
pheatmap::pheatmap(cor(cpmM))
m2<-read.table('mature.txt',header = F,row.names = 1)
ids<-intersect(rownames(m1),rownames(m2))
overM<-cbind(m1[ids,],m2[ids,])
pheatmap::pheatmap(cor(log2(overM+1)))
highM=cpmM[names(tail(sort(apply(cpmM,1,mad)),500)),]
pheatmap::pheatmap(cor(highM))
hg1=names(tail(sort(apply(cpmM[,1:6],1,mad)),500))
hg2=names(tail(sort(apply(cpmM[,7:12],1,mad)),500))
hg=intersect(hg1,hg2)
hgM=cpmM[hg,]
pheatmap::pheatmap(cor(hgM))
colnames(overM)=NULL
paste *_clean.fq.gz_matrue.bam.txt | cut -f 1,3,7,11,15,19,23 > mature.txt
QQ截圖20220829213919.png
4 DEG差異分析
##從mature矩陣開始
rm(list = ls())
options(stringsAsFactors = F)
mature<-read.table('all.counts.mature.txt',header = T)
m1<-mature[,7:12]
rownames(m1)<-mature[,1]
##判斷分組
pheatmap::pheatmap(cor(m1))
gl<-rep(c('control','ET1'),3)
##DEG分析
group_list<-gl
exprSet<-m1
suppressMessages(library(DESeq2))
(colData <- data.frame(row.names=colnames(exprSet), group_list=group_list) )
dds <- DESeqDataSetFromMatrix(countData = exprSet,
colData = colData,
design = ~ group_list)
dds <- DESeq(dds)
res <- results(dds,
contrast=c("group_list","ET1","control"))
resOrdered <- res[order(res$padj),]
head(resOrdered)
DEG=as.data.frame(resOrdered)
DEG <- na.omit(DEG)
##判斷上下調(diào)基因
df=DEG
colnames(df)
df$V<- -log10(df$pvalue)
library(ggpubr)
ggscatter(df, x = "log2FoldChange", y = "V", size=0.5)
df$g<-ifelse(df$pvalue>0.01, 'stable',
ifelse( df$log2FoldChange > 2 , 'up',
ifelse( df$log2FoldChange < - 2, 'down', 'stable')))
table(df$g)
down stable up
43 492 35
5 靶向基因預(yù)測
參考推文:
https://mp.weixin.qq.com/s/n_UncYeGIQFLneTMK2rTXQ
##載入R包
BiocManager::install("multiMiR",ask = F,update = F)
save(df,file = 'deg.R')
rm(list = ls())
options(stringsAsFactors = F)
load(file = 'deg.R')
library(multiMiR)
db.ver = multimir_dbInfoVersions()
db.ver[,1:3]
下面是使用edgeR包,對普通的轉(zhuǎn)錄組counts表達(dá)矩陣(miRNA)做差異分析剧包,并且拿到感興趣的miRNA基因集:
library(edgeR)
library(multiMiR)
# Load data
counts_file <- system.file("extdata", "counts_table.Rds", package = "multiMiR")
strains_file <- system.file("extdata", "strains_factor.Rds", package = "multiMiR")
counts_table <- readRDS(counts_file)
strains_factor <- readRDS(strains_file)
table(strains_factor)
# Standard edgeR differential expression analysis
design <- model.matrix(~ strains_factor)
# Using trended dispersions
dge <- DGEList(counts = counts_table)
dge <- calcNormFactors(dge)
dge$samples$strains <- strains_factor
dge <- estimateGLMCommonDisp(dge, design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)
# Fit GLM model for strain effect
fit <- glmFit(dge, design)
lrt <- glmLRT(fit)
# Table of unadjusted p-values (PValue) and FDR values
p_val_DE_edgeR <- topTags(lrt, adjust.method = 'BH', n = Inf)
# Getting top differentially expressed miRNA's
top_miRNAs <- rownames(p_val_DE_edgeR$table)[1:10]
有了感興趣的miRNA基因集恐锦,就可以查詢它們的靶基因
library(multiMiR)
# Plug miRNA's into multiMiR and getting validated targets
multimir_results <- get_multimir(org = 'mmu',
mirna = top_miRNAs,
table = 'validated',
summary = TRUE)
head(multimir_results@data)
table(multimir_results@data$mature_mirna_id)
dim(multimir_results@data)
相關(guān)的內(nèi)容就到這里,miRNA-Seq 實戰(zhàn)課程就到這里疆液。
我們下一個篇章(單細(xì)胞篇)再見R磺Α!堕油!