# 傳送門:
# RNAseq006 轉錄組入門(6):reads計數
# RNAseq005 轉錄組入門(5):序列比對
# RNAseq004 轉錄組入門(4):參考基因組下載
# RNAseq003 轉錄組入門(3):了解fastq測序數據
# RNAseq002 轉錄組入門(2):數據下載
# RNAseq001 轉錄組入門(1):資源準備
## sratoolkit
## Download and install the prebuilt NCBI SRA Toolkit 2.10.8 (Ubuntu 64-bit).
## http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software
## http://www.ncbi.nlm.nih.gov/books/NBK158900/
# -p keeps the step re-runnable: a plain mkdir fails when the directory already
# exists, which would skip the cd and unpack the archive in the wrong place.
mkdir -p ~/biosoft/sratoolkit && cd ~/biosoft/sratoolkit
wget https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/2.10.8/sratoolkit.2.10.8-ubuntu64.tar.gz
tar xzvf sratoolkit.2.10.8-ubuntu64.tar.gz
# Put the toolkit on PATH. The entry has to name the directory created above
# ("sratoolkit"). $HOME is expanded now so the entry follows whichever home
# directory "~" resolved to; \$PATH stays literal and is evaluated each time
# ~/.bashrc is sourced.
echo "export PATH=\$PATH:$HOME/biosoft/sratoolkit/sratoolkit.2.10.8-ubuntu64/bin" >> ~/.bashrc
source ~/.bashrc
fastq-dump -h
# If fastq-dump stops with the following message:
#   This sra toolkit installation has not been configured.
#   Before continuing, please run: vdb-config --interactive
#   For more information, see https://www.ncbi.nlm.nih.gov/sra/docs/sra-cloud/
# run the command it suggests and simply exit the configuration screen.
vdb-config --interactive
# *************************************************************************************************************************************
# CMake
# CMake is a cross-platform build-configuration tool: it generates makefiles or
# project files and can probe which C++ features the compiler supports, similar
# to automake on UNIX. It sits one level above make.
#
# All self-built tools in these notes are installed under ~/biosoft/mybin.
# $HOME is used instead of a literal user name so the prefix and the PATH entry
# always match the "~" directories created here.
mkdir -p ~/biosoft/mybin
echo "export PATH=\$PATH:$HOME/biosoft/mybin/bin" >> ~/.bashrc
source ~/.bashrc
# -p keeps the step re-runnable (a failing mkdir would skip the cd).
mkdir -p ~/biosoft/cmake-3.3.2 && cd ~/biosoft/cmake-3.3.2
wget http://cmake.org/files/v3.3/cmake-3.3.2.tar.gz
tar xvfz cmake-3.3.2.tar.gz
cd cmake-3.3.2
# A first-time build needs gcc and g++.
sudo apt-get update
# build-essential pulls in g++, libc6-dev, linux-libc-dev, libstdc++-dev and the
# other packages and headers required for compiling.
sudo apt-get install build-essential
# --prefix selects the install location. Without it, executables go to
# /usr/local/bin, libraries to /usr/local/lib, configuration to /usr/local/etc
# and other resources to /usr/local/share, which scatters the installation.
./configure --prefix="$HOME/biosoft/mybin"
make
make install
# *************************************************************************************************************************************
## samtools
## Download and build samtools 1.10 from source.
## http://samtools.sourceforge.net/
## http://www.htslib.org/doc/samtools.html
cd ~/biosoft
mkdir -p samtools && cd samtools
wget https://github.com/samtools/samtools/archive/1.10.tar.gz
tar xzvf 1.10.tar.gz
cd samtools-1.10/
# Install the development headers BEFORE configuring. Each package below fixes
# the build error quoted above it.
# bedidx.c:33:10: fatal error: zlib.h: No such file or directory
sudo apt-get install zlib1g-dev
# bam_tview_curses.c:41:10: fatal error: curses.h: No such file or directory
sudo apt-get install libncurses5-dev
# cram/cram_io.c:53:10: fatal error: bzlib.h: No such file or directory
# (bzlib.h comes from libbz2-dev; libboost-all-dev is kept because the original
# notes installed it at this point as well)
sudo apt-get install libboost-all-dev
sudo apt-get install libbz2-dev
# cram/cram_io.c:57:10: fatal error: lzma.h: No such file or directory
sudo apt-get install liblzma-dev
# hfile_libcurl.c:47:10: fatal error: curl/curl.h: No such file or directory
sudo apt-get install libcurl4-openssl-dev
# Configure with the shared install prefix, then build and install.
./configure --prefix="$HOME/biosoft/mybin"
make
make install
# Alternative to "make install": put the build directory itself on PATH.
# echo "export PATH=\$PATH:$HOME/biosoft/samtools/samtools-1.10" >> ~/.bashrc
# source ~/.bashrc
samtools
# *************************************************************************************************************************************
## FastQC
## Home page: https://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqc
# Check whether Java is already installed
java -version
# Install Java
sudo apt install default-jre
# Verify the installation
java -version
cd ~/biosoft
# -p keeps the step re-runnable (a failing mkdir would skip the cd).
mkdir -p fastqc_v0.11.9 && cd fastqc_v0.11.9
wget https://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v0.11.9.zip
unzip fastqc_v0.11.9.zip
cd FastQC/
chmod u+x fastqc
# The PATH entry must name the directory the archive was unpacked into above
# (~/biosoft/fastqc_v0.11.9/FastQC); $HOME is expanded now, \$PATH later.
echo "export PATH=\$PATH:$HOME/biosoft/fastqc_v0.11.9/FastQC" >> ~/.bashrc
source ~/.bashrc
fastqc -h
# *************************************************************************************************************************************
## multiqc
## Method 1: install from the source tarball
cd ~/biosoft
mkdir -p multiqc-1.9 && cd multiqc-1.9
wget https://files.pythonhosted.org/packages/c8/2d/f0a6be15f861c5d165726d7afecd823ca158dff530b566379623a0e4534b/multiqc-1.9.tar.gz
tar zxvf multiqc-1.9.tar.gz
cd multiqc-1.9
# setup.py imports setuptools. Without it the install aborts with:
#   Traceback (most recent call last):
#   File "setup.py", line 24, in <module>
#   from setuptools import setup, find_packages
#   ImportError: No module named setuptools
# so install setuptools first.
# setuptools for Python 2
sudo apt-get install python-setuptools
# setuptools for Python 3
sudo apt-get install python3-setuptools
python setup.py install
# -------------------------------------------------------------------------------------
## Method 2: install with pip
# https://www.runoob.com/w3cnote/python-pip-install-usage.html
# Dead end, kept for the record only: installing into a chosen directory
# (-t sets the target directory, -i selects the Tsinghua PyPI mirror) ended in a
# chain of errors that could not be resolved.
#   pip install -t ./ -i https://pypi.tuna.tsinghua.edu.cn/simple multiqc
sudo apt-get update
sudo apt-get install python3-pip
# Installing to pip's default location works.
pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple multiqc
echo "export PATH=\$PATH:$HOME/.local/bin" >> ~/.bashrc
source ~/.bashrc
# Show where the multiqc executable was found
command -v multiqc
# *************************************************************************************************************************************
## bcftools
## Download and build bcftools 1.10.2 from the release tarball.
## http://www.htslib.org/download/
cd ~/biosoft
# -p keeps the step re-runnable (a failing mkdir would skip the cd).
mkdir -p bcftools-1.10.2 && cd bcftools-1.10.2
wget https://github.com/samtools/bcftools/releases/download/1.10.2/bcftools-1.10.2.tar.bz2
# tar needs the archive name; "f" without an argument cannot extract anything.
tar jxvf bcftools-1.10.2.tar.bz2
cd bcftools-1.10.2/
./configure --prefix="$HOME/biosoft/mybin"
make
make install
# *************************************************************************************************************************************
## tophat2
# Fetch the prebuilt TopHat 2.1.1 Linux x86_64 bundle and expose it through a
# "current" symlink.
# Home page: https://ccb.jhu.edu/software/tophat/index.shtml
# Manual:    https://ccb.jhu.edu/software/tophat/manual.shtml
tophat_dir=tophat-2.1.1
tophat_pkg=${tophat_dir}.Linux_x86_64
cd ~/biosoft
mkdir -p "${tophat_dir}" && cd "${tophat_dir}"
wget "http://ccb.jhu.edu/software/tophat/downloads/${tophat_pkg}.tar.gz"
tar -xzvf "${tophat_pkg}.tar.gz"
# "current" points at the unpacked release directory
ln -s "${tophat_pkg}" current
# *************************************************************************************************************************************
## hisat2
## Download and install HISAT2 (prebuilt Linux x86_64 bundle).
## https://daehwankimlab.github.io/hisat2/
cd ~/biosoft
# -p keeps the step re-runnable (a failing mkdir would skip the cd).
mkdir -p hisat2-2.0.4 && cd hisat2-2.0.4
#### readme: https://ccb.jhu.edu/software/hisat2/manual.shtml
# The share link ends in ".../download", so wget would store the payload as a
# file named "download". -O saves it under the name the unzip step expects.
# NOTE(review): confirm this share link still serves hisat2-2.0.4-Linux_x86_64.zip.
wget -O hisat2-2.0.4-Linux_x86_64.zip https://cloud.biohpc.swmed.edu/index.php/s/4pMgDq4oAF9QCfA/download
unzip hisat2-2.0.4-Linux_x86_64.zip
# NOTE(review): assumes the archive unpacks into a directory named hisat2-2.0.4.
ln -s hisat2-2.0.4 current
## The executables are then reachable as:
## ~/biosoft/hisat2-2.0.4/current/hisat2-build
## ~/biosoft/hisat2-2.0.4/current/hisat2
# *************************************************************************************************************************************
## HTSeq
cd ~/biosoft
# -p keeps the step re-runnable (a failing mkdir would skip the cd).
mkdir -p HTSeq && cd HTSeq
wget https://files.pythonhosted.org/packages/c4/04/b9b0c5514dcd09e64481e8ebc242aef162646b6de956ffb44595d1de0f69/HTSeq-0.12.4.tar.gz
# (the tarball does not need the executable bit to be extracted)
tar zxvf HTSeq-0.12.4.tar.gz
ls
cd HTSeq-0.12.4/
python setup.py install
# If the install fails like this:
#   symlinking folders for python3
#   Setup script for HTSeq: Failed to import 'numpy'.
#   Please install numpy and then try again to install HTSeq.
# fix: sudo apt-get install build-essential python2.7-dev python-numpy python-matplotlib
# If it still fails, install the dependencies (and HTSeq) through pip, then retry:
pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple Cython pysam matplotlib HTseq
sudo python setup.py install
# Locate htseq-count
command -v htseq-count
# /usr/local/bin/htseq-count
/usr/local/bin/htseq-count --help
# PATH entries are directories, so add the directory that holds htseq-count,
# not the executable itself.
echo "export PATH=\$PATH:/usr/local/bin" >> ~/.bashrc
source ~/.bashrc
htseq-count --help
## ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_mouse/release_M1/
## http://hgdownload-test.cse.ucsc.edu/goldenPath/mm10/liftOver/
## GRCm38/mm10 (Dec, 2011)
## Example counting loops (gene level, then exon level):
## ls *bam |while read id;do ( ~/.local/bin/htseq-count -f bam $id genecode/mm9/gencode.vM1.annotation.gtf.gz 1>${id%%.*}.gene.counts ) ;done
## ls *bam |while read id;do ( ~/.local/bin/htseq-count -f bam -i exon_id $id genecode/mm9/gencode.vM1.annotation.gtf.gz 1>${id%%.*}.exon.counts ) ;done
# *************************************************************************************************************************************
# subread installation (built from source)
mkdir -p ~/biosoft/subread && cd ~/biosoft/subread
wget https://nchc.dl.sourceforge.net/project/subread/subread-2.0.1/subread-2.0.1-source.tar.gz
tar zxvf subread-2.0.1-source.tar.gz
cd subread-2.0.1-source
# Read the bundled instructions
cat ReadMe.txt
cd src
make -f Makefile.Linux
# Smoke-test the freshly built binary from the bin directory
cd ~/biosoft/subread/subread-2.0.1-source/bin
./featureCounts
# $HOME is expanded now so the PATH entry matches the "~" directory used above;
# \$PATH stays literal and is evaluated when ~/.bashrc is sourced.
echo "export PATH=\$PATH:$HOME/biosoft/subread/subread-2.0.1-source/bin" >> ~/.bashrc
source ~/.bashrc
featureCounts
# *************************************************************************************************************************************
# SRA sequencing data download
# My notes: http://www.reibang.com/p/6819a16dee7a
# PMID: 27824034
# Article: https://www.nature.com/articles/ncomms13347
# Data accession: GSE81916
# Obtaining the download address:
# Method 1: resolve the URL with srapath, then fetch it with wget
srapath SRR3589962
# https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos2/sra-pub-run-7/SRR3589962/SRR3589962.1
wget https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos2/sra-pub-run-7/SRR3589962/SRR3589962.1
# Method 2: prefetch
# Build the accession list, one run ID per line
printf '%s\n' SRR35899{56..62} > SRR_Acc_List.txt
# Check that the list was created
cat SRR_Acc_List.txt
# Write a small loop script. A here-document replaces the interactive
# "vim prefetch.sh, press i, type the script" steps, which cannot be replayed
# from a command file. The delimiter is quoted so $id is written literally.
# File descriptors 0, 1 and 2 are standard input, standard output and standard
# error; `1>$id.download.log 2>&1` sends standard OUTPUT to the per-run log and
# then points standard error at the same file.
cat > prefetch.sh <<'EOF'
#!/bin/bash
# Download every run listed in SRR_Acc_List.txt, one log file per run.
while IFS= read -r id; do
  # </dev/null keeps prefetch from consuming the remaining list lines
  prefetch "$id" </dev/null 1>"$id.download.log" 2>&1
done < SRR_Acc_List.txt
EOF
# Run the script in the background, immune to hangups
nohup bash prefetch.sh &
# Method 3: ASCP (Aspera)
# ENA browser: https://www.ebi.ac.uk/ena/browser/view/
# Search keyword: PRJNA323422
# Download the TSV report
# and take the fastq addresses from it.
# ascp is invoked by absolute path. $HOME is used for both the binary and the
# key file so the two always refer to the same user's ~/.aspera installation.
"$HOME/.aspera/connect/bin/ascp" -QT -l 300m -P33001 -i "$HOME/.aspera/connect/etc/asperaweb_id_dsa.openssh" era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR358/002/SRR3589962/SRR3589962_2.fastq.gz .
# Generate the batch download script ascp.command: one ascp call per mate file
# (_1 and _2) for runs SRR3589956 .. SRR3589962.
# The file is truncated (>) rather than appended to (>>), so re-running this
# step does not queue every download twice.
ascp_bin="$HOME/.aspera/connect/bin/ascp"
ascp_key="$HOME/.aspera/connect/etc/asperaweb_id_dsa.openssh"
ena_root='era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR358'
for i in {56..62}; do
  run="SRR35899${i}"
  # Sub-directory is "00" + last digit of the run number, matching the path of
  # the single-file example (002 for SRR3589962).
  bucket="00$(( i % 10 ))"
  for mate in 1 2; do
    # %q shell-quotes the local paths so the generated line stays valid even if
    # the home directory contains unusual characters.
    printf '%q -QT -l 300m -P33001 -i %q %s/%s/%s/%s_%s.fastq.gz .\n' \
      "${ascp_bin}" "${ascp_key}" "${ena_root}" "${bucket}" "${run}" "${run}" "${mate}"
  done
done > ascp.command
# Run the generated download script (ascp.command) in the background via nohup
nohup bash ascp.command &
# Reference genome download notes: http://www.reibang.com/p/9f06c3efb000
# Genome index download
# Every aligner has its own index format. An index can be built from the genome
# sequence or downloaded from the tool's site; mind the genome version.
# HISAT2 index download page: http://daehwankimlab.github.io/hisat2/download/
# The M. musculus link found there is below (a download manager may be faster).
wget https://genome-idx.s3.amazonaws.com/hisat/grcm38_genome.tar.gz
# HISAT2 alignment: paired-end reads of runs SRR3589959 .. SRR3589962
hisat2_index=/mnt/e/Work/bioinfo/public/index/mouse/hisat2/grcm38/genome
fastq_dir=/mnt/e/Work/bioinfo/project/202009_RNAseq/data
sam_dir=/mnt/e/Work/bioinfo/project/202009_RNAseq/result/align/20200910mouse
for i in {59..62}; do
  hisat2 -t -x "${hisat2_index}" \
    -1 "${fastq_dir}/SRR35899${i}_1.fastq.gz" \
    -2 "${fastq_dir}/SRR35899${i}_2.fastq.gz" \
    -S "${sam_dir}/SRR35899${i}.sam"
done
# Convert each SAM file to BAM (runs SRR3589959 .. SRR3589962)
for i in {59..62}; do
  sample="SRR35899${i}"
  samtools view -S "${sample}.sam" -b > "${sample}.bam"
done
# Collect samtools flagstat statistics for every BAM file in this directory.
# (At this point the BAMs are the unsorted ones; sorting happens further below.)
# Each report is named after its BAM with ".bam" replaced by ".flagstat".
# Exploratory commands used to work out the naming:
# ls *.bam | while read id ;do echo basename ${id} ".bam";done
# ls *.bam | while read id ;do echo $(basename ${id} ".bam").flagstat;done
# Iterate with a glob instead of parsing ls, and quote every expansion so file
# names with unusual characters cannot be split or re-globbed.
for bam in *.bam; do
  # With no match the pattern stays literal; skip it instead of calling samtools
  [[ -e "${bam}" ]] || continue
  samtools flagstat -@ 1 "${bam}" > "${bam%.bam}.flagstat"
done
# -p keeps the step re-runnable when the flagstat directory already exists
mkdir -p flagstat && mv -- *.flagstat flagstat && cd flagstat
multiqc ./
# Build the text-processing script that merges all flagstat reports into one
# table (stat.txt). The script is written with a quoted here-document, so it is
# stored verbatim without reading from the terminal and without expanding
# anything at write time.
cat > stat.sh <<'EOF'
#!/bin/bash
# Merge every *.flagstat report in the current directory into stat.txt:
# one row per flagstat line, one column per sample.

shopt -s nullglob
reports=(*.flagstat)
if (( ${#reports[@]} == 0 )); then
  echo "stat.sh: no *.flagstat files found in ${PWD}" >&2
  exit 1
fi

# Intermediate tables live in a private temporary directory that is removed on
# exit, instead of file1..file6 in the working directory cleaned up with a
# broad "rm file*".
workdir=$(mktemp -d) || exit 1
trap 'rm -rf -- "${workdir}"' EXIT

# One line per sample: the sample name followed by the first field of every
# flagstat line. "paste -s" joins however many lines the report has, so the
# line count is not hard-coded. Example of the value columns:
# 77607517 16671207 0 0 75387881 60936310 30468155 30468155 56502696 57494864 1221810 832364 530657
# 134310379 28365145 0 0 130964009 105945234 52972617 52972617 98979648 100621038 1977826 1398380 907493
# 94264829 20737377 0 0 91921243 73527452 36763726 36763726 68525830 69723750 1460116 1023854 644490
# 111681106 24075844 0 0 109169544 87605262 43802631 43802631 82145504 83390620 1703080 1013088 643888
# Example of the sample names (taken from the report file names):
# SRR3589959
# SRR3589960
# SRR3589961
# SRR3589962
for report in "${reports[@]}"; do
  printf '%s\t' "${report%.flagstat}"
  awk '{print $1}' "${report}" | paste -s -
done > "${workdir}/by_sample"

# Transpose rows and columns: afterwards the first line holds the sample names
# and each following line holds one metric across all samples.
awk '{
  for (i = 1; i <= NF; i++) {
    res[i] = (NR == 1) ? $i : res[i] " " $i
  }
}
END {
  for (j = 1; j <= NF; j++) {
    print res[j]
  }
}' "${workdir}/by_sample" > "${workdir}/by_metric"

# Row labels: the header cell "Index", then the description text of each line
# of the first report (the part between the first "+" and the next one, minus
# the leading count). Example:
# in total (QC-passed reads
# secondary
# supplementary
# duplicates
# mapped (97.14% : N/A)
# paired in sequencing
# read1
# read2
# properly paired (92.72% : N/A)
# with itself and mate mapped
# singletons (2.01% : N/A)
# with mate mapped to a different chr
# with mate mapped to a different chr (mapQ>=5)
{
  echo "Index"
  cut -d"+" -f 2 "${reports[0]}" | cut -d" " -f 3-90
} > "${workdir}/labels"

paste "${workdir}/labels" "${workdir}/by_metric" > stat.txt
EOF
# Run the script
bash stat.sh
# Sort each BAM and build its index (runs SRR3589959 .. SRR3589962)
for i in {59..62}; do
  unsorted="SRR35899${i}.bam"
  sorted="SRR35899${i}_sorted.bam"
  samtools sort "${unsorted}" -o "${sorted}"
  samtools index "${sorted}"
done
# Alternative, kept commented out: convert SAM to BAM, sort, index, and then
# delete the SAM files in one pass.
# for i in `seq 59 62`
# do
# samtools view -S SRR35899${i}.sam -b > SRR35899${i}.bam
# samtools sort SRR35899${i}.bam -o SRR35899${i}_sorted.bam
# samtools index SRR35899${i}_sorted.bam
# done
# rm *.sam
# Annotation: count reads per feature with htseq-count, one .count file per run
sorted_bam_dir=/mnt/e/Work/bioinfo/project/202009_RNAseq/result/align/20200910mouse
gencode_gff3=/mnt/e/Work/bioinfo/public/Annotation/mouse/gencode/gencode.vM25.annotation.gff3
count_dir=/mnt/e/Work/bioinfo/project/202009_RNAseq/result/annotation
for i in {59..62}; do
  htseq-count -s no -f bam -r pos \
    "${sorted_bam_dir}/SRR35899${i}_sorted.bam" \
    "${gencode_gff3}" \
    > "${count_dir}/SRR35899${i}.count"
done
# The htseq-count run above stopped with:
#   Please Install PySam to use the BAM_Reader Class (http://code.google.com/p/pysam/)Error occured when reading beginning of BAM file.
#   No module named pysam
#   [Exception type: ImportError, raised in __init__.py:1086]
# Fix: download the pysam source and install it.
# Download page: https://pypi.org/project/pysam/#files
# The direct link (it can also be handed to a download manager):
# https://files.pythonhosted.org/packages/99/5a/fc440eb5fffb5346e61a38b49991aa552e4b8b31e8493a101d2833ed1e19/pysam-0.16.0.1.tar.gz
pysam_release=pysam-0.16.0.1
pysam_url="https://files.pythonhosted.org/packages/99/5a/fc440eb5fffb5346e61a38b49991aa552e4b8b31e8493a101d2833ed1e19/${pysam_release}.tar.gz"
cd ~/biosoft
mkdir pysam && cd pysam
wget "${pysam_url}"
tar zxvf "${pysam_release}.tar.gz"
cd "${pysam_release}"
python setup.py install
# If that fails with:
#   Traceback (most recent call last):
#   File "setup.py", line 24, in <module>
#   from setuptools import setup, find_packages
#   ImportError: No module named setuptools
# install setuptools.
# setuptools for Python 2
sudo apt-get install python-setuptools
# setuptools for Python 3
sudo apt-get install python3-setuptools
# Run the install again
sudo python setup.py install
# Run the annotation step again, this time through a script.
# The script is written with a quoted here-document: it is stored verbatim
# (nothing is expanded at write time) and nothing has to be typed into a
# terminal, so this step can be replayed from a file.
cat > annotation.sh <<'EOF'
#!/bin/bash
# Count reads per feature with htseq-count for runs SRR3589959 .. SRR3589962.

# Directory holding the sorted BAM files
bam_dir="/mnt/e/Work/bioinfo/project/202009_RNAseq/result/align/20200910mouse"
# Annotation file (GENCODE vM25, GFF3)
annotation="/mnt/e/Work/bioinfo/public/Annotation/mouse/gencode/gencode.vM25.annotation.gff3"
# Output directory for the .count files
output="/mnt/e/Work/bioinfo/project/202009_RNAseq/result/annotation"

for i in {59..62}; do
  input="${bam_dir}/SRR35899${i}_sorted.bam"
  htseq-count -s no -f bam -r pos "${input}" "${annotation}" > "${output}/SRR35899${i}.count"
done
EOF
# Run it
bash annotation.sh
# Counting with featureCounts: all four sorted BAMs into one table
featureCounts -p -t exon -g gene_id \
  -a /mnt/e/Work/bioinfo/public/Annotation/mouse/gencode/gencode.vM25.annotation.gff3 \
  -o /mnt/e/Work/bioinfo/project/202009_RNAseq/result/count/all.id.txt \
  /mnt/e/Work/bioinfo/project/202009_RNAseq/result/align/20200910mouse/SRR35899{59..62}_sorted.bam
# The run crashed with:
#   featurecounts segmentation fault (core dumped)
# Fix: replace the self-compiled build with the prebuilt binary release of subread.
rm -rf ~/biosoft/subread
mkdir -p ~/biosoft/subread && cd ~/biosoft/subread
wget https://nchc.dl.sourceforge.net/project/subread/subread-2.0.1/subread-2.0.1-Linux-x86_64.tar.gz
tar zxvf subread-2.0.1-Linux-x86_64.tar.gz
# Smoke-test the binary from its bin directory
cd ~/biosoft/subread/subread-2.0.1-Linux-x86_64/bin
./featureCounts
# $HOME is expanded now so the PATH entry matches the "~" directory used above;
# \$PATH stays literal and is evaluated when ~/.bashrc is sourced.
echo "export PATH=\$PATH:$HOME/biosoft/subread/subread-2.0.1-Linux-x86_64/bin" >> ~/.bashrc
source ~/.bashrc
featureCounts
# Run the counting command again
gencode_gff3=/mnt/e/Work/bioinfo/public/Annotation/mouse/gencode/gencode.vM25.annotation.gff3
count_table=/mnt/e/Work/bioinfo/project/202009_RNAseq/result/count/all.id.txt
sorted_bam_dir=/mnt/e/Work/bioinfo/project/202009_RNAseq/result/align/20200910mouse
sorted_bams=()
for i in {59..62}; do
  sorted_bams+=("${sorted_bam_dir}/SRR35899${i}_sorted.bam")
done
featureCounts -p -t exon -g gene_id -a "${gencode_gff3}" -o "${count_table}" "${sorted_bams[@]}"
# Run multiqc on all.id.txt.summary to review the counting QC
multiqc ./all.id.txt.summary
# Output of that run:
# [INFO ] multiqc : This is MultiQC v1.9
# [INFO ] multiqc : Template : default
# [INFO ] multiqc : Searching : /mnt/e/Work/bioinfo/project/202009_RNAseq/result/count/all.id.txt.summary
# Searching 1 files.. [####################################] 100%
# [INFO ] feature_counts : Found 4 reports
# [INFO ] multiqc : Compressing plot data
# [INFO ] multiqc : Report : multiqc_report.html
# [INFO ] multiqc : Data : multiqc_data
# [INFO ] multiqc : MultiQC complete