Orthofinder產(chǎn)生的Orthogroups.GeneCount.tsv不能直接作為CAFE軟件的輸入文件眶诈,需要做如下處理:
filter_large_gene_family.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
作者:徐詩芬
內(nèi)容:1.cafe的輸入格式:去掉total這一列,在第一列添加Description项炼,然后這一列都為空
2.過濾掉一個(gè)或多個(gè)物種有超過200個(gè)基因拷貝的基因家族宪拥,size=200
日期:2021.2.3
"""
import sys
from itertools import islice
def usage():
print('Usage: python filter_large_gene_family.py [index_file] [size]')
def main():
ouf1 = open("cafe_gene_family_format.txt", 'wt')
ouf2 = open("cafe_large_gene_family_format.txt", 'wt')
size = eval(sys.argv[2])
with open(sys.argv[1], 'rt') as f:
for head in islice(f, 0, 1): #提取第一行內(nèi)容
head = head.strip().split("\t")[:-1]
ouf1.write("Description\t")
ouf2.write("Description\t")
for h in head:
ouf1.write(h + "\t")
ouf2.write(h + "\t")
ouf1.write("\n")
ouf2.write("\n")
for line in islice(f, 0, None): # 跳過第一行按行讀取文件內(nèi)容
gene_count = line.strip().split("\t")[:-1]
family_name = gene_count[0]
gene_list = list(map(int, gene_count[1:]))
x = [i for i in gene_list if i > size] # 從列表中找出大于某個(gè)值的元素
if len(x) > 0:
ouf2.write("\t")
for n in gene_count:
ouf2.write(n)
ouf2.write("\t")
ouf2.write("\n")
else:
ouf1.write("\t")
for n in gene_count:
ouf1.write(n)
ouf1.write("\t")
ouf1.write("\n")
ouf1.close()
ouf2.close()
try:
main()
except IndexError:
usage()