Step1:鏈特異性比對并組裝轉(zhuǎn)錄本
用HISAT2對每個樣本進(jìn)行鏈特異性比對衡未,隨后利用StringTie對每個樣本的轉(zhuǎn)錄本進(jìn)行單獨(dú)組裝
mkdir cleandata hisat2 stringtie count target
#利用HISAT2進(jìn)行鏈特異性比對,--rna-strandness RF
#以sample1為例
hisat2 --dta -x ~/genome/hisat2_index/genome.hisat2 -p 4 --rna-strandness RF -1 ./cleandata/lnc.rna.sample1_R1.fq.gz -2 ./cleandata/lnc.rna.sample1_R2.fq.gz -S ./hisat2/lnc.rna.sample1.sam
samtools view -@ 4 -bS -q 30 ./hisat2/lnc.rna.sample1.sam | samtools sort -@ 4 - -o ./hisat2/lnc.rna.sample1.bam
rm ./hisat2/lnc.rna.sample1.sam
samtools flagstat -@ 4 ./hisat2/lnc.rna.sample1.bam > ./hisat2/lnc.rna.sample1.stat.txt
#利用StringTie對每個樣本單獨(dú)組裝轉(zhuǎn)錄本
stringtie -p 4 --rf -G ~/genome/genome.gff3 -o ./stringtie/lnc.rna.sample1.gtf -l lnc.rna.sample1 ./hisat2/lnc.rna.sample1.bam
Step2:過濾lncRNA
將所有樣本組裝的轉(zhuǎn)錄本合并青责,隨后根據(jù)lncRNA的特征進(jìn)行過濾
#利用StringTie將所有樣本的轉(zhuǎn)錄本合并
##其中g(shù)tf.list.txt包含所有要合并的gtf文件,一行一個
ls ./stringtie/*gtf > gtf.list.txt
stringtie --merge -p 4 --rf -G ~/genome/genome.gff3 -o ./stringtie/all.merged.gtf -l lncrna gtf.list.txt
#利用gffcompare將組裝好的轉(zhuǎn)錄本與原始基因注釋文件進(jìn)行比較
gffcompare -r ~/genome/genome.gff3 -o ./stringtie/gffcmp ./stringtie/all.merged.gtf
#根據(jù)gffcompare的結(jié)果過濾轉(zhuǎn)錄本,根據(jù)lncRNA的特性管呵,只保留calss code為i,u,x,o并且長度大于200bp的轉(zhuǎn)錄本
#i: 完全包含在參考的內(nèi)含子內(nèi)
#u: 位于基因間區(qū)
#x: 與參考外顯子存在overlap但與參考轉(zhuǎn)錄本方向相反
#o: 與參考外顯子存在overlap且與參考轉(zhuǎn)錄本方向一致
awk '$3=="i" || $3=="u" || $3=="x" || $3=="o" && $10>200 {print "\"" $5 "\""}' ./stringtie/gffcmp.all.merged.gtf.tmap > ./stringtie/filter.iuxo.l200.transcript.id.txt
LC_ALL=C fgrep -f ./stringtie/filter.iuxo.l200.transcript.id.txt ./stringtie/gffcmp.annotated.gtf > ./stringtie/filter.iuxo.l200.gtf
#根據(jù)gtf文件提取轉(zhuǎn)錄本的序列
gffread ./stringtie/filter.iuxo.l200.gtf -g ~/genome/genome.fa -w ./stringtie/filter.iuxo.l200.transcript.fa
#預(yù)測轉(zhuǎn)錄本的編碼能力
##CPC2袭蝗,依賴python3
python ~/software/CPC2/CPC2_standalone-1.0.1/bin/CPC2.py -i ./stringtie/filter.iuxo.l200.transcript.fa -o ./stringtie/CPC2.out --ORF
#CNCI唤殴,依賴python2.7
#-m pl指定模型為植物
python2.7 ~/software/CNCI/CNCI-master/CNCI.py -f ./stringtie/filter.iuxo.l200.transcript.fa -o ./stringtie/CNCI.out -p 4 -m pl
##LGC,依賴python2.7
python2.7 ~/software/LGC/LGC-master/scr/lgc-1.0.0.py ./stringtie/filter.iuxo.l200.transcript.fa ./stringtie/LGC.out.txt
#過濾掉具有蛋白編碼能力的轉(zhuǎn)錄本
awk '$9=="noncoding"{print "\"" $1 "\""}' ./stringtie/CPC2.out.txt > ./stringtie/tmp.CPC2.txt
awk '$5=="Non-coding"{print "\"" $1 "\""}' ./stringtie/LGC.out.txt > ./stringtie/tmp.LGC.txt
awk '$3=="noncoding"{print "\"" $1 "\""}' ./stringtie/CNCI.out/CNCI.index > ./stringtie/tmp.CNCI.txt
cat ./stringtie/tmp.CPC2.txt ./stringtie/tmp.LGC.txt | sort | uniq -d > ./stringtie/tmp1
cat ./stringtie/tmp.CPC2.txt ./stringtie/tmp.CNCI.txt | sort | uniq -d > ./stringtie/tmp2
cat ./stringtie/tmp1 ./stringtie/tmp2 | sort | uniq -d > ./stringtie/filter.iuxo.l200.ncd.transcript.txt
LC_ALL=C fgrep -f ./stringtie/filter.iuxo.l200.ncd.transcript.txt ./stringtie/filter.iuxo.l200.gtf > ./stringtie/filter.iuxo.l200.ncd.gtf
gffread ./stringtie/filter.iuxo.l200.ncd.gtf -g ~/genome/genome.fa -w ./stringtie/filter.iuxo.l200.ncd.transcript.fa
#利用pfam數(shù)據(jù)庫進(jìn)行過濾
##用Transeq將轉(zhuǎn)錄本翻譯為6種可能的蛋白序列
transeq ./stringtie/filter.iuxo.l200.ncd.transcript.fa ./stringtie/filter.iuxo.l200.ncd.pep6.fa -frame=6
##設(shè)置E-value < 1e-5為閾值
pfam_scan.pl -cpu 4 -fasta ./stringtie/filter.iuxo.l200.ncd.pep6.fa -dir ~/database/pfam-A/ -o ./stringtie/pfam.out
##根據(jù)pfam結(jié)果過濾存在蛋白結(jié)構(gòu)域的transcripts
sed "/#/d" ./stringtie/pfam.out | sed '/^$/d' | awk '$13 < 1e-5 {print $1}' | sed "s/_/\t/g" | awk '{print "\"" $1 "\""}' > ./stringtie/pfam.coding.txt
LC_ALL=C fgrep -v -f ./stringtie/pfam.coding.txt ./stringtie/filter.iuxo.l200.ncd.gtf > ./stringtie/final.lncRNA.gtf
Step3:轉(zhuǎn)錄本定量
將lncRNA與基因組注釋整合到腥,隨后用Stringtie進(jìn)行轉(zhuǎn)錄本水平FPKM定量
#進(jìn)行轉(zhuǎn)錄本水平定量(以sample1為例)
cat ./stringtie/final.lncRNA.gtf ~/genome/genome.gtf > count/final.all.gtf
##轉(zhuǎn)錄本的FPKM值在sample1目錄下的t_data.ctab文件中
stringtie -p 4 -b ./count/sample1 --rf -G count/final.all.gtf -e -o ./count/tmp.sample1.gtf -l sample1 ./hisat2/lnc.rna.sample1.bam
從中提取count值用于后續(xù)差異分析
#提取count值
##其中l(wèi)ncrna.sample.txt包含了所有樣本的信息朵逝,兩列,第一列為樣本名乡范,第二列為樣本的gtf名
prepDE.py -i ./count/lncrna.sample.txt -g ./count/all.gene.count.csv -t ./count/all.transcript.count.csv
Step4:lncRNA靶基因預(yù)測
分別預(yù)測lncRNA的cis和trans的靶基因
#lncRNA靶基因預(yù)測(只針對FPKM>1的lncRNA和mRNA)
##cis-lncRNA靶基因配名,一般根據(jù)lncRNA和mRNA的位置信息進(jìn)行判定,比如上下游100k
awk '{print $1 "\t" $2-100000 "\t" $3+100000 "\t" $4}' ~/genome/all.gene.bed | awk '$2<0 {print $1 "\t" 0 "\t" $3 "\t" $4} $2>=0 {print $0}' > ./target/all.gene.+-100k.bed
awk '$3=="transcript" {print $1 "\t" $4 "\t" $5 "\t" $10 }' ./stringtie/final.lncRNA.gtf | sed "s/\"http://g" | sed "s/\;//g" > ./stringtie/final.lncRNA.bed
bedtools intersect -wo -a ./stringtie/final.lncRNA.bed -b ./target/all.gene.+-100k.bed | cut -f 4,8 > ./target/lncrna.cis.target.txt
##trans-lncRNA靶基因
##利用LucTar預(yù)測lncRNA經(jīng)過trans作用的靶基因
#-d設(shè)置ndG(the normalized deltaG)閾值晋辆,小于該閾值則認(rèn)為存在互作渠脉,大于該閾值則認(rèn)為不存在互作。建議設(shè)為:-0.08(lowest);-0.10(low);-0.13(medium);-0.15(high);-0.20(highest)
#-s F/T瓶佳,設(shè)置是否輸出配對的序列(批量運(yùn)算時不建議輸出)
##保證fa文件的基因ID行只有基因ID芋膘,沒有空格后面的,否則會報錯“the format of file is error, Please check! The name and the sequence do not put in a row!”
gffread final.lncRNA.gtf -g ~/genome/genome.fa -w final.lncRNA.transcript.fa
perl LncTar.pl -p 1 -l final.lncRNA.transcript.fa -m mRNA.transcript.fa -d ndg -s F -o lncRNA.trans.mRNA.LucTar.txt