Python學習的第八天间护,主要學習文本挖掘和模式匹配歉胶。
生物信息大多數(shù)問題都是核酸或氨基酸序列的比對和尋找motif献联,利用正則表達式顯得十分重要。比如一條核酸或氨基酸序列颠焦,你可能想尋找其中的磷酸化位點斩熊,甘露糖基化位點,轉錄結果位點等特定的序列結構伐庭,正則表達式可以幫助尋找特定模式的序列粉渠。
1. 在一條氨基酸序列中尋找特定序列
import re
seq = 'VSVLTMFRYAGWLDRLYMLVGTQLAAIIHGVALPLMMLI'
pattern = re.compile("[ST]Q") #"[ST]Q"表示"SQ"或"TQ"。
match = pattern.search(seq) #搜索seq中符合pattern的序列
print(match.start())
21
print(match.end())
23
if match:
print('%10s' %(seq[match.start() - 4:match.end() + 4])) #'%10s' 序列右對齊,
#顯示從匹配初始位置前4個字符到匹配結束位置后4個字符
print('%6s' % match.group())
else:
print("no match")
MLVGTQLAAI
TQ
######
import re
seq = 'RQSAMGSNKSKPKDASQRRRSLEPAENVHGAGGGAFPASQRPSKP'
pattern = re.compile('R.[ST][^P]') #查找符合'R.[ST][^P]'的序列似忧,
#第一個是氨基酸R,第二"."表示任意氨基酸渣叛,
#第三個是S或T丈秩,第四個是除了P以外其他的任意氨基酸盯捌。
matches = pattern.findall(seq)
print(matches)
['RQSA', 'RRSL', 'RPSK']
match_iter = pattern.finditer(seq)
for match in match_iter:
print(match.group(), match.span()) #顯示匹配的字符位置
print(match.start(), match.end())
RQSA (0, 4)
0 4
RRSL (18, 22)
18 22
RPSK (40, 44)
40 44
####
import re
seq = 'QSAMGSNKSKPKDASQRRRSLEPAENVHGAGGGAFPASQRPSKP'
pattern1 = re.compile('R(.)[ST][^P]')
match1 = pattern1.search(seq)
print(match1.group())
RRSL
print(match1.group(1))
R
2. 匹配中的切片和替換
import re
separator = re.compile("\|") #以"|"為分隔符
annotation = 'ATOM:CA|RES:ALA|CHAIN:B|NUMRES:166'
columns = separator.split(annotation) #以"|"為分隔符將annotation切片
print(columns)
['ATOM:CA', 'RES:ALA', 'CHAIN:B', 'NUMRES:166']
###
new_annotation = separator.sub("@", annotation)
print(new_annotation)
ATOM:CA@RES:ALA@CHAIN:B@NUMRES:166
new_annotation2 = separator.sub('@', annotation, 2)
print(new_annotation2)
ATOM:CA@RES:ALA@CHAIN:B|NUMRES:166
new_annotation3 = separator.subn('@', annotation)
print(new_annotation3)
('ATOM:CA@RES:ALA@CHAIN:B@NUMRES:166', 3)
3. 將prosite文件模式中的符號轉換為正則表達式
pattern = '[DEQN]-x-[DEQN](2)-C-x(3,14)-C-x(3,7)-C-x-[DN]-x(4)-[FY]-x-C'
pattern = pattern.replace('{', '[^')
pattern = pattern.replace('}', ']')
pattern = pattern.replace('(', '{')
pattern = pattern.replace(')', '}')
pattern = pattern.replace('-', '')
pattern = pattern.replace('x', '.')
pattern = pattern.replace('>', '$')
pattern = pattern.replace('<', '^')
print(pattern)
[DEQN].[DEQN]{2}C.{3,14}C.{3,7}C.[DN].{4}[FY].C
4. 尋找基因組序列上的轉錄因子結合位點
已知的轉錄因子結合位點
TFBS文件
import re
genome_seq = open("genome.txt").read() #讀取基因組序列信息
sites = []
for line in open("TFBS.txt"): #讀取已知的轉錄因子結合位點信息文件
fields = line.split()
tf = fields[0]
site = fields[1]
sites.append((tf,site)) #從文件中提取必要信息并格式化
sites
Out[44]:
[('UAS(G)-pMH100', 'CGGAGTACTGTCCTCCG'), #查看新的轉錄因子結合位點的表格
('TFIIIC-Xls-50', 'TGGATGGGAG'),
('HSE_CS_inver0', 'CTNGAANNTTCNAG'),
('ZDNA_CS', '0'),
('GCN4-his3-180', 'ATGACTCAT')]
for tf, site in sites:
tfbs_regexp = re.compile(site) #正則表達式的模式
all_matches = tfbs_regexp.findall(genome_seq) #顯示出所有匹配的序列
matches = tfbs_regexp.finditer(genome_seq)
if all_matches:
print(tf, ":")
for tfbs in matches:
print("\t", tfbs.group(),tfbs.start(), tfbs.end()) #顯示序列和匹配的位置
UAS(G)-pMH100 :
CGGAGTACTGTCCTCCG 0 17
CGGAGTACTGTCCTCCG 60 77
CGGAGTACTGTCCTCCG 120 137
CGGAGTACTGTCCTCCG 180 197
CGGAGTACTGTCCTCCG 240 257
CGGAGTACTGTCCTCCG 300 317
CGGAGTACTGTCCTCCG 360 377
CGGAGTACTGTCCTCCG 420 437
TFIIIC-Xls-50 :
TGGATGGGAG 17 27
TGGATGGGAG 77 87
TGGATGGGAG 137 147
TGGATGGGAG 197 207
TGGATGGGAG 257 267
TGGATGGGAG 317 327
TGGATGGGAG 377 387
TGGATGGGAG 437 447
HSE_CS_inver0 :
CTNGAANNTTCNAG 27 41
CTNGAANNTTCNAG 87 101
CTNGAANNTTCNAG 147 161
CTNGAANNTTCNAG 207 221
CTNGAANNTTCNAG 267 281
CTNGAANNTTCNAG 327 341
CTNGAANNTTCNAG 387 401
CTNGAANNTTCNAG 447 461
GCN4-his3-180 :
ATGACTCAT 50 59
ATGACTCAT 110 119
ATGACTCAT 170 179
ATGACTCAT 230 239
ATGACTCAT 290 299
ATGACTCAT 350 359
ATGACTCAT 410 419
ATGACTCAT 470 479
5. 從PubMed HTML 頁面提取標題和摘要信息
import urllib.request as urllib2
import re
pmid = '18235848'
url = 'http://www.ncbi.nlm.nih.gov/pubmed?term=%s' % pmid #pmid = '18235848'的網(wǎng)頁地址
url
Out[56]: 'http://www.ncbi.nlm.nih.gov/pubmed?term=18235848'
handler = urllib2.urlopen(url) #打開這個網(wǎng)頁
html = handler.read()
html = html.decode("utf-8") #使用utf-8解碼html
title_regexp = re.compile('<h1>.{5,400}</h1>') #html信息中標題是被<h1> 和 </h1>括起來的
title_text = title_regexp.search(html)
abstract_regexp = re.compile('<h3>Abstract</h3><div class=""><p><AbstractText>.{20,3000}</AbstractText></p></div>')
abstract_text = abstract_regexp.search(html)
print(url)
http://www.ncbi.nlm.nih.gov/pubmed?term=18235848
print('TITLE:', title_text.group())
TITLE: <h1>Quantitative high-throughput screen identifies inhibitors of the Schistosoma mansoni redox cascade.</h1>
print('ABSTRACT:', abstract_text.group()) #error!不知道原因蘑秽,
#是不是ncbi頁面改版導致代碼現(xiàn)在不能用了饺著?
6. 檢測PubMed 文章摘要中的特定單詞
import urllib.request as urllib2
import re
# word to be searched
word_regexp = re.compile('schistosoma')
# list of PMIDs where we want to search the word
pmids = ['18235848', '22607149', '22405002', '21630672']
for pmid in pmids:
url = 'http://www.ncbi.nlm.nih.gov/pubmed?term=' + pmid
handler = urllib2.urlopen(url)
html = handler.read()
title_regexp = re.compile('<h1>.{5,400}</h1>')
title = title_regexp.search(html)
title = title.group()
abstract_regexp = re.compile('<h3>Abstract</h3><p>.{20,3000}</p></div>')
abstract = abstract_regexp.search(html)
abstract = abstract.group()
word = word_regexp.search(abstract, re.IGNORECASE)
if word:
# display title and where the keyword was found
print(title)
print(word.group(), word.start(), word.end())
Traceback (most recent call last):
File "<ipython-input-71-3ad765bdb689>", line 23, in <module>
handler = urllib2.urlopen(url)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 531, in open
response = meth(req, response)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 641, in http_response
'http', request, response, code, msg, hdrs)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 569, in error
return self._call_chain(*args)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 649, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
HTTPError: Forbidden
#失敗箫攀,訪問獲取不了Abstract信息