軟件安裝與下載
# Installing trinity (https://github.com/trinityrnaseq/trinityrnaseq/releases)
#wget https://github.com/trinityrnaseq/trinityrnaseq/releases/download/v2.15.1/trinityrnaseq-v2.15.1.FULL.tar.gz -P ~/software/
tar zxf ~/software/trinityrnaseq-v2.15.1.FULL.tar.gz
mv trinityrnaseq-v2.15.1/ /opt/biosoft/Trinity-v2.15.1
cd /opt/biosoft/Trinity-v2.15.1
make -j 4
make plugins
echo 'PATH=$PATH:/opt/biosoft/Trinity-v2.15.1/' >> ~/.bashrc
source ~/.bashrc
# Installing Jellyfish (http://www.genome.umd.edu/jellyfish.html)
# Trinity的運(yùn)行需要依賴jellyfish version 2脐区。Jellyfish兩個(gè)版本并存思犁,注意不要下錯(cuò)了润绎。
#wget https://github.com/gmarcais/Jellyfish/releases/download/v2.3.0/jellyfish-linux -P ~/software/
cp /home/train/software/jellyfish-linux /opt/biosoft/Trinity-v2.15.1/jellyfish
chmod 755 /opt/biosoft/Trinity-v2.15.1/jellyfish
# Installing samlomon (https://combine-lab.github.io/salmon/ | https://github.com/COMBINE-lab/salmon)
#wget https://github.com/COMBINE-lab/salmon/releases/download/v1.10.0/salmon-1.10.0_linux_x86_64.tar.gz -P ~/software/
tar zxf ~/software/salmon-1.10.0_linux_x86_64.tar.gz -C /opt/biosoft/
echo 'PATH=$PATH:/opt/biosoft/salmon-latest_linux_x86_64/bin/' >> ~/.bashrc
source ~/.bashrc
#tar zxf /home/train/software/R-4.3.1.Rocky9.2_opt_sysoft.tar.gz -C /opt/sysoft/
#echo 'PATH=$PATH:/opt/sysoft/R-4.3.1/bin/' >> ~/.bashrc
#source ~/.bashrc
#Installing RSEM (http://deweylab.github.io/RSEM/)
#wget https://github.com/deweylab/RSEM/archive/v1.3.3.tar.gz -O ~/software/RSEM-v1.3.3.tar.gz
tar zxf ~/software/RSEM-v1.3.3.tar.gz -C /opt/biosoft/
cd /opt/biosoft/RSEM-1.3.3/
make -j 4
echo 'PATH=$PATH:/opt/biosoft/RSEM-1.3.3/' >> ~/.bashrc
source ~/.bashrc
#Installing kallisto (https://github.com/pachterlab/kallisto/releases)
#wget https://github.com/pachterlab/kallisto/releases/download/v0.50.0/kallisto_linux-v0.50.0.tar.gz -P ~/software/
tar zxf ~/software/kallisto_linux-v0.50.0.tar.gz -C /opt/biosoft/
echo 'PATH=$PATH:/opt/biosoft/kallisto' >> ~/.bashrc
source ~/.bashrc
# Installing TransDecoder (https://github.com/TransDecoder/TransDecoder/releases)
#wget https://github.com/TransDecoder/TransDecoder/archive/TransDecoder-v5.5.0.tar.gz -P ~/software/
tar zxf ~/software/TransDecoder-v5.5.0.tar.gz -C /opt/biosoft/
mv /opt/biosoft/TransDecoder-TransDecoder-v5.5.0 /opt/biosoft/TransDecoder-v5.5.0
cd /opt/biosoft/TransDecoder-v5.5.0/
make -j 4
echo 'PATH=$PATH:/opt/biosoft/TransDecoder-v5.5.0/' >> ~/.bashrc
source ~/.bashrc
運(yùn)行數(shù)據(jù)
# 使用Trinity軟件對(duì)Illumina數(shù)據(jù)進(jìn)行無參考基因組的轉(zhuǎn)錄組分析
cd /home/train/08.RNA-seq_analysis_by_trinity
#ln -s ~/03.sequencing_data_preprocessing/?.?.fastq .
# 培訓(xùn)班上筆記本計(jì)算性能較差罪治,為了快速計(jì)算和減少內(nèi)存消耗鹃答,按如下方法減少數(shù)據(jù)量再進(jìn)行計(jì)算。
for i in `ls /home/train/03.sequencing_data_preprocessing/?.?.fastq`
do
x=${i/*\//}
head -n 1000000 $i > $x
done
# Trinity軟件推薦Paired-End Fastq數(shù)據(jù)的序列名以/1和/2結(jié)尾李滴。
# perl -e 'while (<>) { s/^(\@[^\s\/]+).*/$1\/1/; print; $_ = <>; print; $_ = <>; print; $_ = <>; print; }' read.1.fastq > read_formatted.1.fastq
# perl -e 'while (<>) { s/^(\@[^\s\/]+).*/$1\/2/; print; $_ = <>; print; $_ = <>; print; $_ = <>; print; }' read.2.fastq > read_formatted.2.fastq
# 使用 Trinity 進(jìn)行 De novo 組裝
Trinity --seqType fq --max_memory 2G --left `ls *.1.fastq | perl -pe 's/\n/,/' | perl -pe 's/,$//'` --right `ls *.2.fastq | perl -pe 's/\n/,/' | perl -pe 's/,$//'` --CPU 8 --jaccard_clip --normalize_reads --output trinity_denovo --bflyCalculateCPU &> trinity_denovo.log
# real 54m34.332s
# user 312m57.809s
# sys 67m20.243s
#由于運(yùn)行時(shí)間過長螃宙,可以將其kill掉
[train@MiWiFi-R3P-srv ~]$ kill_procedures_by_name_clf.pl Trinity
# 統(tǒng)計(jì)Trinity組裝結(jié)果
/opt/biosoft/Trinity-v2.15.1/util/TrinityStats.pl trinity_denovo.Trinity.fasta > trinity_denovo.Trinity.fasta.stats
[train@MiWiFi-R3P-srv trinity_denovo]$ cat trinity_denovo.Trinity.fasta.stats
################################
## Counts of transcripts, etc.
################################
Total trinity 'genes': 5566
Total trinity transcripts: 8051
Percent GC: 56.65
########################################
Stats based on ALL transcript contigs:
########################################
Contig N10: 8102
Contig N20: 5363
Contig N30: 4256
Contig N40: 3424
Contig N50: 2821
Median contig length: 1083
Average contig: 1701.38
Total assembled bases: 13697830
#####################################################
## Stats based on ONLY LONGEST ISOFORM per 'GENE':
#####################################################
Contig N10: 5458
Contig N20: 3960
Contig N30: 3135
Contig N40: 2589
Contig N50: 2129
Median contig length: 797.5
Average contig: 1277.36
Total assembled bases: 7109784
# 提取最長的Unigene
extract_longest_isoforms_from_TrinityFasta.pl trinity_denovo.Trinity.fasta > trinity_denovo.unigene.longest.fasta
# 使用 Trinity 進(jìn)行有參考基因組的組裝
samtools merge -@ 8 merged.bam ~/06.reads_aligment/hisat2/*.sam
samtools sort -@ 8 -O BAM -o merged.sort.bam merged.bam
Trinity --max_memory 2G --CPU 8 --jaccard_clip --normalize_reads --genome_guided_bam merged.sort.bam --genome_guided_max_intron 4000 --output trinity_genomeGuided --bflyCalculateCPU &> trinity_genomeGuided.log
# real 300m38.653s
# user 676m36.462s
# sys 39m30.345s
/opt/biosoft/Trinity-v2.15.1/util/TrinityStats.pl trinity_genomeGuided/Trinity-GG.fasta > trinity_genomeGuided/Trinity-GG.fasta.stats
#cp ~/08.RNA-seq_analysis_by_trinity/trinity_genomeGuided/Trinity-GG.fasta ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/
rm merged.*
# 進(jìn)行表達(dá)量計(jì)算
mkdir -p /home/train/08.RNA-seq_analysis_by_trinity/exp_cal
cd /home/train/08.RNA-seq_analysis_by_trinity/exp_cal
ln -s ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/Trinity.fasta ./#轉(zhuǎn)錄本序列
/opt/biosoft/Trinity-v2.15.1/util/support_scripts/get_Trinity_gene_to_trans_map.pl Trinity.fasta > Trinity.fasta.gene_trans_map
# 準(zhǔn)備轉(zhuǎn)錄組序列的索引文件
/opt/biosoft/Trinity-v2.15.1/util/align_and_estimate_abundance.pl --transcripts Trinity.fasta --est_method RSEM --output_dir . --aln_method bowtie2 --prep_reference --gene_trans_map Trinity.fasta.gene_trans_map
# real 0m58.207s
# user 0m57.761s
# sys 0m0.426s
# 進(jìn)行表達(dá)量計(jì)算
for i in `ls ../*.1.fastq`
do
i=${i/*\//}
i=${i/.1.fastq/}
echo "/opt/biosoft/Trinity-v2.15.1/util/align_and_estimate_abundance.pl --transcripts Trinity.fasta --seqType fq --left ../$i.1.fastq --right ../$i.2.fastq --est_method RSEM --aln_method bowtie2 --thread_count 8 --gene_trans_map Trinity.fasta.gene_trans_map --output_dir $i &> $i.log"
done > command.align_and_estimate_abundance.list
ParaFly -c command.align_and_estimate_abundance.list -CPU 1 #推薦
#可選sh command.align_and_estimate_abundance.list
# real 32m0.057s
# user 114m33.540s
# sys 8m22.277s
[train@MiWiFi-R3P-srv A]$ lh
total 141M
-rw-r--r-- 1 train train 70M Jul 21 16:48 bowtie2.bam
-rw-r--r-- 1 train train 70M Jul 21 16:48 bowtie2.bam.for_rsem.bam
-rw-r--r-- 1 train train 0 Jul 21 16:48 bowtie2.bam.ok
-rw-r--r-- 1 train train 466K Jul 21 16:49 RSEM.genes.results #基因的表達(dá)量
-rw-r--r-- 1 train train 613K Jul 21 16:49 RSEM.isoforms.results #轉(zhuǎn)錄本的表達(dá)量,TPM可以消除轉(zhuǎn)錄本長度不同帶來的影響
-rw-r--r-- 1 train train 0 Jul 21 16:49 RSEM.isoforms.results.ok
drwxr-xr-x 2 train train 58 Jul 21 16:49 RSEM.stat
TPM總和100萬
補(bǔ)充
for i in `ls */RSEM.isoforms.results`
do
x=${i/\/RSEM/}
cp $i $x
done
# cp ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/RSEM_out/* ./
# 由各個(gè)樣品的表達(dá)量,得到總的表達(dá)量矩陣所坯,同時(shí)根據(jù)表達(dá)量去除假陽性序列
extract_trinity_unigene_by_isofroms_expression_and_ORF.pl Trinity.fasta *.isoforms.results
#計(jì)算count總和是否高于100萬谆扎,如果低于一百萬,分析數(shù)據(jù)不可靠
[train@MiWiFi-R3P-srv exp_cal]$ head out.gene_rawCounts.matrix
A B C D E F G
TRINITY_DN0_c0_g1 178.99 190 559.01 71 93 61 68
TRINITY_DN1004_c1_g1 25 22 11 5 5 2 2
TRINITY_DN100_c0_g1 84.99 69 189 20 26 19 13
TRINITY_DN1015_c0_g1 20 25 30 3 0 1 5
TRINITY_DN1035_c0_g1 15 7 39 13 16 17 14
TRINITY_DN1037_c0_g1 50 51 35 15 10 9 9
TRINITY_DN103_c0_g1 56 59 52.01 6 10 6 6
TRINITY_DN1040_c0_g2 40 33 110 19 11 18 20
TRINITY_DN1042_c0_g1 42 49 176 13 16 16 11
[train@MiWiFi-R3P-srv exp_cal]$ cut -f 2 out.gene_rawCounts.matrix | perl -e '<>; while (<>) { $total += $_;} print "$total\n";'
232233.84 #低于100萬
# 差異表達(dá)分析_樣品間的標(biāo)準(zhǔn)化分析,消除樣品間的差異
mkdir -p /home/train/08.RNA-seq_analysis_by_trinity/diff_exp
cd /home/train/08.RNA-seq_analysis_by_trinity/diff_exp
ln -s ../exp_cal/out.gene_rawCounts.matrix gene.rawCount.matrix
ln -s ../exp_cal/out.gene_TPM.not_cross.matrix gene.TPM.not_cross.matrix
/opt/biosoft/Trinity-v2.15.1/util/support_scripts/run_TMM_scale_matrix.pl --matrix gene.TPM.not_cross.matrix > gene.TPM.TMM.matrix
echo -e "S4\tA
S4\tB
S4\tC
S2\tD
S2\tE
S2\tF
S2\tG" > samples.txt
# 對(duì)兩兩樣本counts總和 >= 40 的基因進(jìn)行差異表達(dá)分析
#/opt/biosoft/trinityrnaseq-Trinity-v2.1.1/Analysis/DifferentialExpression/run_DE_analysis.pl --matrix genes.counts.matrix --method edgeR --samples_file samples.txt --min_rowSum_counts 40 --output edgeR.genes.dir
/opt/biosoft/Trinity-v2.15.1/Analysis/DifferentialExpression/run_DE_analysis.pl --matrix gene.rawCount.matrix --method edgeR --samples_file samples.txt --output edgeR.genes.dir
cd edgeR.genes.dir
[train@MiWiFi-R3P-srv edgeR.genes.dir]$ ls
gene.rawCount.matrix.S2_vs_S4.edgeR.count_matrix gene.rawCount.matrix.S2_vs_S4.edgeR.DE_results.MA_n_Volcano.pdf
gene.rawCount.matrix.S2_vs_S4.edgeR.DE_results gene.rawCount.matrix.S2_vs_S4.S2.vs.S4.EdgeR.Rscript
[train@MiWiFi-R3P-srv edgeR.genes.dir]$ head gene.rawCount.matrix.S2_vs_S4.edgeR.DE_results
sampleA sampleB logFC logCPM PValue FDR
TRINITY_DN190_c0_g1 S2 S4 8.99472274370147 15.8184438491505 2.64453178222291e-123 1.81943786616937e-120
TRINITY_DN1984_c0_g1 S2 S4 7.77684165600643 13.7102324085575 1.8328163121404e-117 6.30488811376297e-115
TRINITY_DN14_c0_g1 S2 S4 7.01104197690975 18.0524459121504 6.43603234776519e-93 1.47599675175415e-90
TRINITY_DN1460_c0_g1 S2 S4 6.55399916827941 8.70473121995052 1.3399809141212e-65 2.30476717228846e-63
TRINITY_DN198_c0_g1 S2 S4 3.87279572531066 10.0995425444949 1.04447111920147e-59 1.43719226002122e-57
TRINITY_DN400_c0_g1 S2 S4 7.18963166670787 8.10209691711092 1.34154919588681e-57 1.53830974461688e-55
TRINITY_DN4100_c0_g1 S2 S4 7.49481831648387 8.10572476735801 6.0425927424905e-57 5.93900543833352e-55
TRINITY_DN96_c0_g1 S2 S4 5.82899775763059 12.2441502545614 4.70019463324168e-54 4.04216738458785e-52
TRINITY_DN221_c0_g1 S2 S4 3.83239810737771 8.84603355603372 1.18232923097752e-52 9.03825012125039e-51
#一般pvalue和FDR芹助,參考FDR
# 提取差異表達(dá)基因進(jìn)行聚類分析和熱圖制作
/opt/biosoft/Trinity-v2.15.1/Analysis/DifferentialExpression/analyze_diff_expr.pl --matrix ../gene.TPM.TMM.matrix -P 0.001 -C 2 --samples ../samples.txt
#/opt/biosoft/Trinity-v2.15.1/Analysis/DifferentialExpression/analyze_diff_expr.pl --matrix ../gene.TPM.TMM.matrix -P 0.01 -C 1 --samples ../samples.txt --order_columns_by_samples_file
# 根據(jù)距離結(jié)果進(jìn)行分類
# 自動(dòng)分類
# 共表達(dá)趨勢分析
/opt/biosoft/Trinity-v2.15.1/Analysis/DifferentialExpression/define_clusters_by_cutting_tree.pl --Ptree 50 -R diffExpr.P0.001_C2.matrix.RData
# 手動(dòng)分類
# R
# > load("diffExpr.P0.001_C2.matrix.RData")
# > source("/opt/biosoft/Trinity-v2.15.1/Analysis/DifferentialExpression/R/manually_define_clusters.R")
# > manually_define_clusters(hc_genes, data)
# 然后左鍵點(diǎn)擊選擇子類堂湖,右鍵結(jié)束選擇。
# 結(jié)果生成了文件夾 manually_defined_clusters_4 状土。其中 4 表示手動(dòng)分成了 4 類无蜂。
# cd manually_defined_clusters_4
# 對(duì)每類結(jié)果進(jìn)行趨勢線的繪制
# /opt/biosoft/Trinity-v2.15.1/Analysis/DifferentialExpression/plot_expression_patterns.pl cluster_*
image.png
sudo cpan -i Statistics::Basic
DEG_compare_2_samples.pl
Usage:
/home/train/bin/DEG_compare_2_samples.pl [options] --FDR_edgeR 0.001 --FDR_DESeq2 0.001 --log2FC 2 genes.TPM_TMM.matrix Sample1:LabelA,LabelB,LabelC Sample2:LabelD,LabelE,LabelF
程序用于分析兩個(gè)樣本的差異表達(dá)基因。注意事項(xiàng):
(1)程序輸入的第一個(gè)信息為所有樣本的表達(dá)量矩陣文件蒙谓,第二個(gè)信息為樣本1的標(biāo)簽信息斥季,第二個(gè)信息為樣本2的標(biāo)簽信息。樣本和標(biāo)簽之間用冒號(hào)分割彼乌,標(biāo)簽之間用逗號(hào)分割泻肯。
(2)輸入的表達(dá)量矩陣文件中第一行一定要包含后面輸入的所有Label名稱渊迁,Label名稱不推薦含有怪異字符慰照,特別是逗號(hào)和冒號(hào)等。表達(dá)量矩陣文件中可以包含有更多Label的表達(dá)量信息琉朽,但程序會(huì)忽略之毒租。
(3)表達(dá)量矩陣文件中的數(shù)據(jù)需要為使用TMM或其它算法進(jìn)行了樣本間標(biāo)準(zhǔn)化的表達(dá)量。程序進(jìn)行edgeR分析時(shí)不會(huì)額外再進(jìn)行TMM標(biāo)準(zhǔn)化了。推薦使用normalizing_TPM_matrix_by_TMM.pl命令對(duì)TPM表達(dá)量進(jìn)行TMM標(biāo)準(zhǔn)化墅垮。
(4)程序僅讀取后面的兩個(gè)樣本信息惕医,樣本名字可以隨意取,但推薦不要含有怪異字符算色。
(5)--FDR_edgeR抬伺、--FDR_DESeq2和--log2FC參數(shù)必須至少有一個(gè),或推薦全部都有灾梦。
(6)程序輸出三個(gè)結(jié)果文件峡钓,DE_result.tab包含所有基因的FDR、log2FC和表達(dá)量信息若河,Up.tab和Down.tab分別是在第二個(gè)樣本中上調(diào)或下調(diào)的基因信息能岩,當(dāng)使用--annot_file參數(shù)提供注釋文件時(shí),在第二列插入基因的功能注釋信息萧福。三個(gè)輸出文件均按FDR和log2FC進(jìn)行了排序拉鹃。
(7)程序在標(biāo)準(zhǔn)錯(cuò)誤輸出中給出用于差異分析的最小表達(dá)量值、各個(gè)算法得到的差異表達(dá)基因數(shù)量鲫忍。
參數(shù):
--tmp_dir <string> default: tmp_$date_$pid
程序運(yùn)行時(shí)臨時(shí)文件夾名稱膏燕。
--out_prefix <string> default: Sample1_VS_Sample2
設(shè)置輸出文件前綴。
--FDR_edgeR <float> default: None
設(shè)置該參數(shù)后悟民,則使用edgeR進(jìn)行差異表達(dá)基因分析煌寇,并要求差異表達(dá)量基因的FDR <= 該閾值,推薦設(shè)置為 0.001逾雄。若不設(shè)置該參數(shù)值阀溶,則程序不會(huì)調(diào)用edgeR進(jìn)行分析。
--edgeR_dispersion <float> default: 0.1
若兩個(gè)樣本都沒有生物學(xué)重復(fù)鸦泳,使用edgeR的exactTest進(jìn)行p值計(jì)算時(shí)银锻,設(shè)置其dispersion參數(shù)的值。
--FDR_DESeq2 <float> default: None
設(shè)置該參數(shù)后做鹰,則使用DESeq2進(jìn)行差異表達(dá)基因分析击纬,并要求差異表達(dá)量基因的FDR <= 該閾值,推薦設(shè)置為 0.001钾麸。若不設(shè)置該參數(shù)值更振,則程序不會(huì)調(diào)用DEseq2進(jìn)行分析。當(dāng)兩個(gè)樣本都沒有生物學(xué)重復(fù)時(shí)饭尝,不會(huì)進(jìn)行DESeq2分析肯腕。雖然無生物學(xué)重復(fù)DESeq2運(yùn)行不報(bào)錯(cuò),但是一般得不到差異基因钥平,導(dǎo)致分析沒有意義实撒。
--log2FC <float> default: None
設(shè)置該參數(shù)后,則使用表達(dá)量差異倍數(shù)進(jìn)行差異基因分析,并要求差異表達(dá)量基因的 |log2表達(dá)量差異倍數(shù)| >= 該閾值知态,推薦設(shè)置為 2捷兰。若不設(shè)置該參數(shù)值,則程序不會(huì)則使用表達(dá)量差異倍數(shù)進(jìn)行分析负敏。
--annot_file <string> default: None
若輸入基因功能注釋文件贡茅,則會(huì)在輸出文件中包含基因的功能注釋信息。要求文件使用制表符分割其做,第一列是基因ID友扰,后面的一列或多列注釋信息都能被識(shí)別,也支持一個(gè)基因在多行上有注釋庶柿。程序會(huì)將一個(gè)基因所有行所有列的注釋信息用 | 符號(hào)分割村怪,賦值為其基因的注釋信息,放入到差異表達(dá)基因結(jié)果文件的第二列浮庐。
--min_value <float> default: None
設(shè)置最小的表達(dá)量值甚负。當(dāng)基因的表達(dá)量低于該值時(shí),則將其表達(dá)量修改為該值审残,再用于差異基因分析梭域。
--median_ratio <float> default: 0.01
若未設(shè)置--min_value參數(shù)值,則使用該參數(shù)值按如下步驟計(jì)算出最小表達(dá)量值:(1)去除表達(dá)量矩陣中所有不為0的數(shù)據(jù)搅轿,求中位數(shù)病涨;(2)再去除表達(dá)量值 < 中位數(shù) * 該參數(shù)值的數(shù)據(jù),再次求中位數(shù)璧坟;(3)使用第二次的中位數(shù) * 該參數(shù)設(shè)定值即得到最小表達(dá)量值既穆。
--keep_tmp default: None
添加該參數(shù),程序會(huì)保留臨時(shí)文件夾雀鹃。
--help default: None
display this help and exit.
功能注釋
[train@MiWiFi-R3P-srv transdecoder]$ ls ../exp_cal/out.*
../exp_cal/out.effective.fasta #轉(zhuǎn)錄本序列文件幻工,文件較大
../exp_cal/out.gene_rawCounts.matrix
../exp_cal/out.gene_TPM.not_cross.matrix
../exp_cal/out.unigene.fasta #unigene文件,使用該文件
# 使用 transdecoder 進(jìn)行ORF預(yù)測
mkdir -p /home/train/08.RNA-seq_analysis_by_trinity/transdecoder
cd /home/train/08.RNA-seq_analysis_by_trinity/transdecoder
#ln -s ../exp_cal/out.unigene.fasta Unigene.fasta
cp ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/Unigene.fasta ./
# 首先黎茎,根據(jù)6個(gè)讀碼框翻譯得到蛋白序列
TransDecoder.LongOrfs -t Unigene.fasta -m 100 #-m 100 至少有100個(gè)氨基酸
# real 0m11.108s
# user 0m11.068s
# sys 0m0.042s
perl -pe 's/^>(\S+).*/>$1/' Unigene.fasta.transdecoder_dir/longest_orfs.pep > longest_orfs.pep #尋找每個(gè)轉(zhuǎn)錄本最長的開放性閱讀框
# 然后囊颅,將ORF蛋白序列比對(duì)到公共數(shù)據(jù)庫,能比對(duì)上的則表示存在對(duì)應(yīng)轉(zhuǎn)錄本序列
# 比對(duì)到SwissProt數(shù)據(jù)庫
#wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz -P ~/software/
#gzip -dc ~/software/uniprot_sprot.fasta.gz > uniprot_sprot.fasta
#diamond makedb --threads 8 --db uniprot_sprot --in uniprot_sprot.fasta
#diamond blastp --db uniprot_sprot --query longest_orfs.pep --out blast.xml --outfmt 5 --sensitive --max-target-seqs 20 --evalue 1e-5 --id 20 --tmpdir /dev/shm --index-chunks 1
#parsing_blast_result.pl --no-header --max-hit-num 20 --query-coverage 0.2 --subject-coverage 0.1 blast.xml > blastp.outfmt6
cp ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/blastp.outfmt6 ./
# 比對(duì)到PFAM數(shù)據(jù)庫
#para_hmmscan.pl --cpu 40 --chunk 30 --no_cut_ga --hmm_db /opt/biosoft/bioinfomatics_databases/Pfam/Pfam-AB.hmm longest_orfs.pep | grep -P "^#" -v > pfam.domtbl
cp ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/pfam.domtbl ./
# 最后傅瞻,去除假陽性O(shè)RF踢代,保留正確的ORF
TransDecoder.Predict -t Unigene.fasta --retain_pfam_hits pfam.domtbl --retain_blastp_hits blastp.outfmt6
# real 0m51.079s
# user 0m50.418s
# sys 0m0.671s
#統(tǒng)計(jì)ORF名稱
[train@MiWiFi-R3P-srv transdecoder]$ grep ">" Unigene.fasta.transdecoder.pep | perl -e 'while (<>) { print "$1\n" if m/^>(\w+)/;}' | head
TRINITY_DN0_c0_g1
TRINITY_DN0_c0_g1
TRINITY_DN0_c0_g1
TRINITY_DN0_c0_g1
TRINITY_DN0_c1_g1
TRINITY_DN0_c1_g1
TRINITY_DN1000_c0_g2
TRINITY_DN1000_c0_g2
TRINITY_DN1002_c0_g1
TRINITY_DN1004_c0_g1
[train@MiWiFi-R3P-srv transdecoder]$ grep ">" Unigene.fasta.transdecoder.pep | perl -e 'while (<>) { print "$1\n" if m/^>(\w+)/;}' | uniq | wc -l
2266
# 檢查正義鏈負(fù)義鏈的ORF長度和數(shù)量
perl -e 'while (<>) { if (m/\tCDS\t(\d+)\t(\d+)\t\S+\t(\S+)\t/) { $length = $2 - $1 + 1; push @stats1, $length if $3 eq "+"; push @stats2, $length if $3 eq "-"; } } @stats1 = sort {$a <=> $b} @stats1; @stats2 = sort {$a <=> $b} @stats2; $stats1 = @stats1; $stats2 = @stats2; print "strand\tnum\tmedian_length\n+\t$stats1\t$stats1[$stats1/2]\n-\t$stats2\t$stats2[$stats2/2]\n";' Unigene.fasta.transdecoder.gff3
# 測試連特異性測序轉(zhuǎn)錄組De novo組裝序列
#Trinity --seqType fq --max_memory 2G --left ../E.1.fastq --right ../E.2.fastq --CPU 40 --jaccard_clip --normalize_reads --output trinity_denovo_SS --bflyCalculateCPU --SS_lib_type RF &> trinity_denovo_SS.log
#extract_longest_isoforms_from_TrinityFasta.pl trinity_denovo_SS.Trinity.fasta > trinity_denovo_SS.unigene.longest.fasta
cp ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/trinity_denovo_SS.unigene.longest.fasta ./
TransDecoder.LongOrfs -t trinity_denovo_SS.unigene.longest.fasta -m 100
TransDecoder.Predict -t trinity_denovo_SS.unigene.longest.fasta
# 檢測在正負(fù)鏈上同時(shí)進(jìn)行預(yù)測ORF情況
perl -e 'while (<>) { if (m/\tCDS\t(\d+)\t(\d+)\t\S+\t(\S+)\t/) { $length = $2 - $1 + 1; push @stats1, $length if $3 eq "+"; push @stats2, $length if $3 eq "-"; } } @stats1 = sort {$a <=> $b} @stats1; @stats2 = sort {$a <=> $b} @stats2; $stats1 = @stats1; $stats2 = @stats2; print "strand\tnum\tmedian_length\n+\t$stats1\t$stats1[$stats1/2]\n-\t$stats2\t$stats2[$stats2/2]\n";' trinity_denovo_SS.unigene.longest.fasta.transdecoder.gff3
# 檢測僅在正以鏈上進(jìn)行預(yù)測ORF情況
perl -e 'while (<>) { if (m/\tCDS\t(\d+)\t(\d+)\t\S+\t(\S+)\t/) { $length = $2 - $1 + 1; push @stats1, $length if $3 eq "+"; push @stats2, $length if $3 eq "-"; } } @stats1 = sort {$a <=> $b} @stats1; @stats2 = sort {$a <=> $b} @stats2; $stats1 = @stats1; $stats2 = @stats2; print "strand\tnum\tmedian_length\n+\t$stats1\t$stats1[$stats1/2]\n-\t$stats2\t$stats2[$stats2/2]\n";' ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/trinity_denovo_SS.unigene.longest.fasta.transdecoder.gff3
cd ..
組裝后的文件就是轉(zhuǎn)錄本文件
在未消除樣品之間的差異之前,用TPM來比較勉強(qiáng)可以嗅骄,但是FPKM不行胳挎。