1. 環(huán)境準備
python
gff3文件:https://www.gencodegenes.org/human/release_19.html
python包:
pip3 install pandas, pysam, numpy
2. 輸入文件
#Chr Begin End Size(Mb) Nb_variants Percentage_homozygosity
chr1 37356469 39797055 2.44 57 89.47
chr1 89731959 91742055 2.01 31 90.32
chr1 92327126 94544234 2.22 59 91.53
chr1 172100831 176102969 4 78 91.03
chr2 11718591 15732002 4.01 26 100
chr2 39262068 41384390 2.12 26 92.31
chr2 59614572 62047433 2.43 33 93.94
chr2 80809027 85143406 4.33 27 92.59
chr2 143787161 159672168 15.89 163 93.87
3. 輸出文件
樣式1
chr start end Size(Mb) Nb_variants Percentage_homozygosity genes
chr1 37356469 39797055 2.44 57 89.47 GRIK3(5.87);ZC3H12A(0.4);MEAF6(0.91);SNIP1(0.81);DNALI1(0.41);GNL2(1.19);RSPO1(0.97);C1orf109(0.44);CDCA8(0.71);EPHA10(2.1);MANEAL(0.3);YRDC(0.21);C1orf122(0.1);MTF1(2.05);INPP5B(3.54);SF3A3(1.39);FHL3(0.36);UTP11L(0.64);POU3F1(0.12);RRAGC(0.89);MYCBP(0.76);GJA9(0.7);RHBDL2(2.29);AKIRIN1(0.61);NDUFS5(0.34);MACF1(10.25)
chr1 89731959 91742055 2.01 31 90.32 GBP5(0.33);GBP6(1.11);LRRC8B(3.63);LRRC8C(6.81);LRRC8D(5.75);ZNF326(2.01);BARHL2(0.28);ZNF644(5.32);HFM1(0.78)
chr1 92327126 94544234 2.22 59 91.53 TGFBR3(2.02);BRDT(2.93);EPHX4(1.51);BTBD8(3.05);KIAA1107(0.8);C1orf146(1.26);GLMN(2.37);RPAP2(4.65);GFI1(0.55);EVI5(12.8);RPL5(0.45);FAM69A(5.38);MTF2(2.7);TMED5(1.4);CCDC18(4.46);DR1(1.06);FNBP1L(4.8);BCAR3(12.87);DNTTIP2(0.55);GCLM(1.09);ABCA4(3.87)
chr1 172100831 176102969 4 78 91.03 DNM3(7.17);PIGC(1.85);C1orf105(1.2);SUCO(1.99);FASLG(0.2);TNFSF18(0.27);TNFSF4(0.59);PRDX6(0.29);SLC9C2(2.56);ANKRD45(1.51);KLHL20(1.79);CENPL(0.63);DARS2(0.85);ZBTB37(0.89);SERPINC1(0.34);RC3H1(2.28);RABGAP1L(20.89);GPR52(0.04);CACYBP(0.31);MRPS14(0.32);TNN(2.0);KIAA0040(0.9);TNR(10.71);RFWD2(4.72)
chr2 11718591 15732002 4.01 26 100 GREB1(1.6);NTSR2(0.3);LPIN1(3.73);TRIB2(0.64);FAM84A(0.45);AC011897.1(0.39);NBAS(9.83);DDX1(0.02)
chr2 39262068 41384390 2.12 26 92.31 SOS1(4.21);CDKL4(2.54);MAP4K3(8.86);TMEM178A(2.5);THUMPD2(2.04);SLC8A1(24.21)
chr2 59614572 62047433 2.43 33 93.94 BCL11A(4.21);PAPOLG(1.88);REL(2.06);PUS10(3.21);PEX13(1.43);KIAA1841(4.07);C2orf74(0.81);AHSA2(0.57);USP34(11.64);XPO1(2.5)
chr2 80809027 85143406 4.33 27 92.59 CTNNA2(1.54);SUCLG1(0.84);DNAH6(6.99);TRABD2A(1.97);TMSB10(0.02)
chr2 143787161 159672168 15.89 163 93.87 KYNU(0.08);ARHGAP15(4.26);GTDC1(2.48);ZEB2(0.88);ACVR2A(0.54);ORC4(0.57);MBD5(3.13);EPC2(0.9);KIF5C(1.58);LYPD6B(1.12);LYPD6(0.91);MMADHC(0.11);RND3(0.45);RBM43(0.09);NMI(0.12);TNFAIP6(0.14);RIF1(0.62);NEB(1.57);ARL5A(0.25);CACNB4(1.68);STAM2(0.37);FMNL2(1.98);PRPF40A(0.42);ARL6IP6(0.27);RPRM(0.01);GALNT13(3.66);KCNJ3(1.01);NR4A2(0.11);GPD2(1.12);GALNT5(0.36);ERMN(0.06);CYTIP(0.47);ACVR1C(0.64);ACVR1(0.88);UPP2(1.63);CCDC148(1.8);PKP4(1.42);DAPL1(0.13)
輸出樣式2
chr start end Size(Mb) Nb_variants Percentage_homozygosity gene_name gene_coverage
chr1 37356469 39797055 2.44 57 89.47 GRIK3 5.87
chr1 37356469 39797055 2.44 57 89.47 ZC3H12A 0.4
chr1 37356469 39797055 2.44 57 89.47 MEAF6 0.91
chr1 37356469 39797055 2.44 57 89.47 SNIP1 0.81
chr1 37356469 39797055 2.44 57 89.47 DNALI1 0.41
chr1 37356469 39797055 2.44 57 89.47 GNL2 1.19
chr1 37356469 39797055 2.44 57 89.47 RSPO1 0.97
chr1 37356469 39797055 2.44 57 89.47 C1orf109 0.44
chr1 37356469 39797055 2.44 57 89.47 CDCA8 0.71
chr1 37356469 39797055 2.44 57 89.47 EPHA10 2.1
chr1 37356469 39797055 2.44 57 89.47 MANEAL 0.3
chr1 37356469 39797055 2.44 57 89.47 YRDC 0.21
3. 提取gff3文件的python代碼
# 導(dǎo)入包
import pandas as pd
import numpy as np
import pysam
# 讀取gff3文件
gencode = pd.read_table("/mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3", comment="#",
sep = "\t", names = ['seqname', 'source', 'feature', 'start' , 'end', 'score', 'strand', 'frame', 'attribute'])
gencode.head()
提取gff3文件中所需要的信息
gencode_genes = gencode[(gencode.feature == "gene")][['seqname', 'start', 'end', 'attribute']].copy().reset_index().drop('index', axis=1) # Extract genes
gencode_genes.head()
提取基因名空厌,基因類型瀑踢,基因狀態(tài)浮创,基因水平(1verified loci新症,2manually annotated loci,3automatically annotated loci)
def gene_info(x):
# Extract gene names
g_name = list(filter(lambda x: 'gene_name' in x, x.split(";")))[0].split("=")[1]
g_type = list(filter(lambda x: 'gene_type' in x, x.split(";")))[0].split("=")[1]
g_status = list(filter(lambda x: 'gene_status' in x, x.split(";")))[0].split("=")[1]
g_leve = int(list(filter(lambda x: 'level' in x, x.split(";")))[0].split("=")[1])
return (g_name, g_type, g_status, g_leve)
gencode_genes["gene_name"], gencode_genes["gene_type"], gencode_genes["gene_status"], gencode_genes["gene_level"] = zip(*gencode_genes.attribute.apply(lambda x: gene_info(x)))
gencode_genes.head()
查看基因類型分布
gencode_genes['gene_type'].drop_duplicates()
提取所有已知的蛋白編碼基因
gencode_genes = gencode_genes[gencode_genes['gene_status'] == 'KNOWN'].reset_index().drop('index', axis=1)
gencode_genes = gencode_genes[gencode_genes['gene_type'] == 'protein_coding'].reset_index().drop('index', axis=1)
gencode_genes.info()
# 移除重復(fù)的記錄并保存
## Sort gene_leve in each chromosome (ascending oder) and remove duplicates
gencode_genes = gencode_genes.sort_values(['gene_level', 'seqname'], ascending=True).drop_duplicates('gene_name', keep='first').reset_index().drop('index', axis=1)
gencode_genes.to_csv('/mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt', index=False, header = False, sep="\t")
gencode_genes.info()
壓縮
%%bash -s /mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt
cut -f 1,2,3,5 $1 | sortBed -i > /mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt.sorted.formatted.bed
bgzip /mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt.sorted.formatted.bed
tabix -p bed /mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt.sorted.formatted.bed.gz
ls -l
4. 注釋染色體位置包含的基因名
定義重疊位置計算函數(shù)
def overlap(q_st, q_end, res_st, res_end):
o = min(q_end, res_end)-max(q_st, res_st)
return o
定義輸入與輸入文件目錄
import os
input_path = "/mnt/f/projects/gene/patient1/"
input_file_name = "patient1.HomRegions.tsv"
input_file = os.path.join(input_path,input_file_name)
# 讀取輸入文件
### Read the sample file in to a pandas dataframe
df = pd.read_table(input_file,
names=['chr', 'start', 'end', 'Size(Mb)', 'Nb_variants', 'Percentage_homozygosity'], skiprows=1)
df.head()
計算重疊基因及重疊部位占染色體位置的百分比
def gencode_all_known_genes(a, tb):
genes = []
try:
for region in tb.fetch(a['chr'], int(a['start']), int(a['end'])):
if region:
r = region.split('\t')
overlap_len = overlap(int(a['start']), int(a['end']), int(r[1]), int(r[2]))
ret_val = '{}({})'.format(r[3], np.round(overlap_len/float(int(a['end'])-int(a['start']))*100, 2)) ### Percentage of the input interval that overlap with the gene
genes.append(ret_val)
if len(genes)>0:
return ";".join(genes)
else:
return "NA(0)"
except ValueError:
return "NA(0)"
import pysam
gencode_v19 = pysam.TabixFile('/mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt.sorted.formatted.bed.gz')
df['genes'] = df.apply(lambda x: gencode_all_known_genes(x[['chr', 'start', 'end']], gencode_v19), axis=1)
df.head()
輸出樣式1
df.to_csv(os.path.join(input_path,'out_df.tsv'), index=False, header = True, sep="\t")
轉(zhuǎn)為一個基因一行的格式
# 移除無基因的染色體位置
## Remove all the intervals that do not overlap with genes
df2 = df[df['genes'] != "NA(0)"].reset_index(drop=True)
# 每個基因一行
new_rows = []
for i,r in df2.iterrows():
g_list = r['genes'].split(";")
for g in g_list:
g = g.replace(" ","")
new_rows.append(np.append(r[['chr', 'start', 'end', 'Size(Mb)', 'Nb_variants', 'Percentage_homozygosity', 'genes']].values, g))
df_perGene = pd.DataFrame()
df_perGene = df_perGene.append(pd.DataFrame(new_rows, columns=['chr', 'start', 'end', 'Size(Mb)', 'Nb_variants', 'Percentage_homozygosity', 'genes', 'gene_ID'])).reset_index().drop('index', axis=1)
df_perGene['gene_name'] = df_perGene['gene_ID'].apply(lambda x: x.split("(")[0])
df_perGene['gene_coverage'] = df_perGene['gene_ID'].apply(lambda x: x.split("(")[1].replace(")", ""))
## 移除之前的重復(fù)列 drop the genes column
df_perGene = df_perGene.drop(["genes", "gene_ID"], axis=1)
df_perGene.head()
## 輸出
df_perGene.to_csv(os.path.join(input_path,'out_df_perGene.tsv'), index=False, header = True, sep="\t")