??這兩天在學(xué)習(xí)面向?qū)ο螅鴷緦懥艘粋€小例子。這種編程思想以前沒用過韧骗,在慢慢熟悉其中的套路。事實上盡管學(xué)習(xí)生信有一年半了姿鸿,仍然覺得自己編程能力較弱,原因是之前處理數(shù)據(jù)除了需要用到哈希我會用Perl寫腳本以外倒源,其它的場景我基本都用命令行可以解決苛预,所以編程能力也還在哈希那個層次停留。這也是為什么最近在練習(xí)編程的原因笋熬。
??下面這個小腳本的作用是:輸入一段序列热某,可以查詢它的長度、類型胳螟、各堿基的個數(shù)昔馋,查詢是否包含某一段特定的序列,DNA序列還能查詢GC含量以及求反向互補(bǔ)序列糖耸。
包含兩個類的模塊(用到了繼承)
sequence.py
class Sequence():
def __init__(self, seq):
#屬性1:序列本身
self.seq = seq
#屬性2:序列類型
set_seq = set(seq)
set_dna = set("ATGCN")
set_rna = set("AUGCN")
set_pro = set("ACDEFGHIKLMNPQRSTVWY")
if set_seq <= set_dna:
self.seqtype = "DNA"
elif set_seq <= set_rna:
self.seqtype = "RNA"
elif set_seq <= set_pro:
self.seqtype = "protein"
else:
self.seqtype = "Unknown"
#屬性3:序列長度
self.seqlen = len(seq)
#DNA, RNA, protein各種元素的含量
def seq_element(self):
element_counts = {}
for i in self.seq:
if i in element_counts.keys():
element_counts[i] += 1
else:
element_counts[i] = 1
num = 1
for key, value in element_counts.items():
this_key_percent = value / self.seqlen
print(str(num)+"\t" + key + "\t" + str(value) + "\t" + str(this_key_percent))
num +=1
#查詢是否含有特定的子序列
#如果有秘遏,則返回具體的坐標(biāo),從1開始
def sub_seq(self, query_seq):
query_seq_len = len(query_seq)
times = self.seqlen+1-query_seq_len
hit_num = 0
for i in range(times):
part_of_self = self.seq[i:i+query_seq_len]
if part_of_self == query_seq:
hit_num += 1
if hit_num == 1:
print("the query seq exists in the ref seq, and the coordinates: \n")
print("Num.\tLeft\tRight")
print(str(hit_num) + "\t" + str(i+1) + "\t" + str(i+query_seq_len))
class DnaSequence(Sequence):
def __init__(self, seq):
super().__init__(seq)
#求DNA序列的反向互補(bǔ)序列
def rev_com(self):
if self.seqtype == "DNA":
seq_length = len(self.seq)
rc_self = ""
for i in range(1,seq_length+1):
tmp_base = self.seq[-i]
if tmp_base == "A":
rc_self += "T"
elif tmp_base == "T":
rc_self += "A"
elif tmp_base == "G":
rc_self += "C"
elif tmp_base == "C":
rc_self += "G"
else:
rc_self += "N"
return rc_self
else:
print("Please make sure your sequence is a DNA sequence!")
#求DNA序列的GC含量
def seq_gc(self):
if self.seqtype == "DNA":
seq_length = len(self.seq)
gc_counts = 0
for i in range(seq_length):
tmp_base = self.seq[i]
if tmp_base == "G" or tmp_base == "C":
gc_counts += 1
gc_percent = gc_counts / seq_length
return gc_percent
else:
print("Please make sure your sequence is a DNA sequence!")
測試腳本:test_sequence.py
from sequence import DnaSequence
#seq = input("Please input your sequence: ")
seq = "AGCTCGTACGTCGTACGTACGTACG"
my_seq = DnaSequence(seq)
print(my_seq.seq)
print(my_seq.seqlen)
print(my_seq.seqtype)
print(my_seq.seq_gc())
print(my_seq.rev_com())
my_seq.seq_element()
my_seq.sub_seq("CGT")
結(jié)果
print(my_seq.seq)
AGCTCGTACGTCGTACGTACGTACG
print(my_seq.seqlen)
25
print(my_seq.seqtype)
DNA
print(my_seq.seq_gc())
0.56
print(my_seq.rev_com())
CGTACGTACGTACGACGTACGAGCT
my_seq.seq_element()
1 A 5 0.2
2 G 7 0.28
3 C 7 0.28
4 T 6 0.24
my_seq.sub_seq("CGT")
the query seq exists in the ref seq, and the coordinates:
Num. Left Right
1 5 7
2 9 11
3 12 14
4 16 18
5 20 22