采用的策略:
起始位置減去終止位置為標記長度均唉,如果距離大于100kb
,保留這一行肚菠,如果標記是1或2舔箭,下一個標記(0,來自親本1)的長度小于100kb蚊逢,則合并起始位置
(這里主要保留導入片段)
import sys
def process_file(filename):
# 存儲處理后的行
processed_lines = []
# 用于臨時存儲上一行的信息
last_line = None
with open(filename, 'r') as file:
for line_number, line in enumerate(file, 1):
try:
# 分割每行數(shù)據(jù)
parts = line.strip().split('\t')
if len(parts) < 4:
continue
chrom, start, end, tag = parts[0], int(parts[1]), int(parts[2]), int(parts[3])
# 計算長度
length = end - start
# 如果是第一個標記层扶,直接保存
if last_line is None:
last_line = (chrom, start, end, tag, length)
continue
# 檢查是否需要合并
# if last_line[3] in [1, 2] and length < 100 * 1024: # 100kb
if last_line[0] == chrom and last_line[3] in [1, 2] and length < 100 * 1024: # 100kb
# 合并
last_line = (last_line[0], last_line[1], end, last_line[3], end - last_line[1])
else:
# 添加上一行到結果列表
processed_lines.append(f"{last_line[0]}\t{last_line[1]}\t{last_line[2]}\t{last_line[3]}")
last_line = (chrom, start, end, tag, length)
except ValueError as e:
print(f"Error processing line {line_number}: {e}")
except Exception as e:
print(f"Unexpected error on line {line_number}: {e}")
# 添加最后一行
if last_line is not None:
processed_lines.append(f"{last_line[0]}\t{last_line[1]}\t{last_line[2]}\t{last_line[3]}")
return processed_lines
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: python script.py filename")
sys.exit(1)
filename = sys.argv[1]
result = process_file(filename)
# 輸出結果
for line in result:
print(line)
合并相同標記
import sys
prev_chr = None
prev_mark = None
min_start = None
max_end = None
with open(sys.argv[1], 'r') as file:
# next(file) # 跳過表頭行
for line in file:
columns = line.strip().split()
if len(columns) < 4:
continue
current_chr = columns[0]
current_start = int(columns[1])
current_end = int(columns[2])
current_mark = columns[3]
if current_chr == prev_chr and current_mark == prev_mark:
max_end = current_end
else:
if prev_chr is not None:
print(f"{prev_chr}\t{min_start}\t{max_end}\t{prev_mark}")
prev_chr = current_chr
prev_mark = current_mark
min_start = current_start
max_end = current_end
if prev_chr is not None:
print(f"{prev_chr}\t{min_start}\t{max_end}\t{prev_mark}")