背景
在處理NGS 數(shù)據(jù)時,尤其是突變的檢測。我們需要對bam 文件進(jìn)行讀寫。 目前有相對方便的庫 pysam 和 htsjdk 桩卵,這兩個庫提供的api 可以方便python 和java 讀寫,操作BAM倍宾。
隨著測序深度的提高雏节,尤其是UMI技術(shù)的普及,上萬層的測序數(shù)據(jù)下高职,我們需要提高分析軟件的效率钩乍,C++是比較好的選擇。
C++ 中可以讀寫B(tài)AM 的庫有 htslib 怔锌、bamtools寥粹、Seqan3 等变过。Seqan3的文檔信息比較全,但是我使用文檔中的測試案例發(fā)現(xiàn)排作,速度很慢(也許是我打開方式有問題?)亚情;bamtools 和 htslib 的效率挺好的妄痪,但是都缺乏詳細(xì)的文檔,用起來真的頭大楞件。
本文衫生,粗略的學(xué)習(xí)了htslib sam.h sam.c 部分的代碼,以及參考了陳師傅的gencore 代碼土浸,總結(jié)了一些htslib的使用方法罪针。(pileup 部分好難,略過~)
C++ 代碼
-
獲得Cigar
/* @discussion In the CIGAR array, each element is a 32-bit integer. The lower 4 bits gives a CIGAR operation and the higher 28 bits keep the length of a CIGAR. */ string getCigar(const bam1_t *b) { uint32_t *data = (uint32_t *)bam_get_cigar(b); int cigarNum = b->core.n_cigar; stringstream ss; for(int i=0; i<cigarNum; i++) { uint32_t val = data[i]; char op = bam_cigar_opchr(val); uint32_t len = bam_cigar_oplen(val); ss << len << op; } return ss.str(); }
-
獲得序列和質(zhì)量值
//Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G, 8 for T and 15 for N. char fourbits2base(uint8_t val) { switch(val) { case 1: return 'A'; case 2: return 'C'; case 4: return 'G'; case 8: return 'T'; case 15: return 'N'; default: cerr << "ERROR: Wrong base with value "<< (int)val << endl ; return 'N'; } } // get seq 序列的信息記錄在8bit 的數(shù)據(jù)結(jié)構(gòu)中黄伊,前4bit 是前面的堿基泪酱,后4bit 是后面的堿基 string getSeq(const bam1_t *b) { uint8_t *data = bam_get_seq(b); int len = b->core.l_qseq; string s(len, '\0'); for(int i=0; i<len; i++) { char base; if(i%2 == 1) base = fourbits2base(data[i/2] & 0xF); else base = fourbits2base((data[i/2]>>4) & 0xF); s[i] = base; } return s; } // get seq quality string getQual(const bam1_t *b) { uint8_t *data = bam_get_qual(b); int len = b->core.l_qseq; string s(len, '\0'); for(int i=0; i<len; i++) { s[i] = (char)(data[i] + 33); // 轉(zhuǎn)換成打印的ascci } return s; }
-
讀取完整BAM
int main(int argc,char *argv[]) { if(argc < 2) { cerr << "need bam path"; return -1; } samFile *bam_in = sam_open(argv[1],"r"); // open bam file bam_hdr_t *bam_header = sam_hdr_read(bam_in); // read header bam1_t *aln = NULL; aln = bam_init1(); //initialize an alignment //return >= 0 on successfully reading a new record, -1 on end of stream, < -1 on error while (sam_read1(bam_in,bam_header,aln) >= 0) { int pos = aln->core.pos ; string chr = "*"; if (aln->core.tid != -1) // 存在無法比對到基因組的reads chr = bam_header->target_name[aln->core.tid]; // config name(chromosome) string queryname = bam_get_qname(aln); string cigar = getCigar(aln); string seq = getSeq(aln); string qual = getQual(aln); cout << "QueryName: " << queryname << '\n' << "Positon: " << chr << '\t' << pos <<'\n' << "Cigar: " << cigar << '\n' << "Seq: " << seq << '\n' << "Qual: " << qual << '\n'; } bam_destroy1(aln); // 回收資源 bam_hdr_destroy(bam_header); sam_close(bam_in); cout << "Finshed!\n"; return 0; }
-
讀取選定區(qū)域的BAM
int main(int argc,char *argv[]) { if(argc < 3) { cerr << "need bam path; chrom: start pos-end pos"; return -1; } samFile *bam_in = sam_open(argv[1],"r"); // open bam file hts_idx_t *bam_index = sam_index_load(bam_in,argv[1]); //load index bam_hdr_t *bam_header = sam_hdr_read(bam_in); // read header bam1_t *aln = NULL; aln = bam_init1(); //initialize an alignment /* Regions are parsed by hts_parse_reg(), and take one of the following forms: region | Outputs --------------- | ------------- REF | All reads with RNAME REF REF: | All reads with RNAME REF REF:START | Reads with RNAME REF overlapping START to end of REF REF:-END | Reads with RNAME REF overlapping start of REF to END REF:START-END | Reads with RNAME REF overlapping START to END . | All reads from the start of the file * | Unmapped reads at the end of the file (RNAME '*' in SAM) */ string regin = argv[2]; hts_itr_t *iter = sam_itr_querys(bam_index,bam_header,regin); if(!iter) cerr << "invalid regin\n"; // while (sam_read1(bam_in,bam_header,aln) >= 0) while (sam_itr_next(bam_in, iter, aln) >= 0) { int pos = aln->core.pos ; string chr = "*"; if (aln->core.tid != -1) chr = bam_header->target_name[aln->core.tid]; // config name(chromosome) string queryname = bam_get_qname(aln); string cigar = getCigar(aln); string seq = getSeq(aln); string qual = getQual(aln); string md = getAux(aln,"MD"); //int64_t newPos = pos - 5; cout << "QueryName: " << queryname << '\n' << "Positon: " <<aln->core.tid << '\t' << chr << '\t' << pos <<'\n' << "Cigar: " << cigar << '\n' << md << '\n' << "Seq: " << seq << '\n' << "Qual: " << qual << '\n'; } sam_itr_destroy(iter) ;// free iter bam_destroy1(aln); bam_hdr_destroy(bam_header); sam_close(bam_in); cout << "Finshed!\n"; return 0; }
-
輔助字段的讀取
string getAux(const bam1_t *b,const char tag[2]) { kstring_t res = KS_INITIALIZE; // 需要初始化 if(bam_aux_get_str(b,tag,&res) == 1) //kstring的string buffer 沒有\(zhòng)0終止 { int len = ks_len(&res); char *ks_s = ks_str(&res); string s(len, '\0'); for (int i = 0;i<len;i++ ){ s[i] = ks_s[i]; } ks_free(&res); // 釋放資源 return s; } else { cerr << "no tag :" << tag << '\n'; ks_free(&res); return ""; } } int main() { 。还最。墓阀。 string md = getAux(aln,"MD"); 。拓轻。斯撮。 }