匹配標簽
import re
# A named group, (?P<tag_name>...), can be referenced later in the same
# pattern with (?P=tag_name): the closing tag must repeat whatever the opening
# tag captured.  The pattern is a raw string so that \w reaches the regex
# engine untouched instead of being parsed as a (invalid) string escape.
ret = re.search(r'<(?P<tag_name>\w+)>\w+</(?P=tag_name)>', '<h1>hello</h1>')
# group() returns the whole match; group('tag_name') returns the text captured
# by the named group.
print(ret.group())
print(ret.group('tag_name'))
# -------------------------
# Unnamed groups are referenced by position instead: \1 must match the same
# text that group 1 captured earlier in the pattern.
ret = re.search(r'<(\w+)>\w+</\1>', '<h1>hello</h1>')
# group(1) returns the text captured by the first positional group.
print(ret.group())
print(ret.group(1))
匹配整數
import re
# First attempt: \d+ grabs every run of digits, so signs are dropped and
# 40.35 comes back as two separate items, '40' and '35'.
ret = re.compile(r'\d+').findall("1-2*(60+(-40.35/5)-(-4*3))")
print(ret)
import re
# Integers only: the first alternative consumes floats such as -40.35 without
# capturing anything, so findall() reports '' for them; the second alternative
# captures the (optionally signed) integers.
ret = re.findall(r'-?\d+\.\d*|(-?\d+)', "1-2*(60+(-40.35/5)-(-4*3))")
print(ret)
# Drop every empty placeholder.  list.remove('') would discard only the first
# one and raise ValueError when the text contains no float at all.
ret = [number for number in ret if number]
print(ret)
數字匹配
數字匹配
# 3. Number matching
# 1. Match the time string on each line of a text, e.g. '1990-07-12'.
# 1. Pick out the twelve months of a year: the pattern accepts 1-9 with an
#    optional leading zero, or 10-12.
# NOTE(review): the prompt and result strings below appear to contain stray
# pinyin annotations such as '(qǐng)'; they are runtime text and are left
# untouched here -- confirm the intended wording.
while True:
    try:
        cmd = input('請(qǐng)輸入月份:>>')
    except EOFError:
        # input() raises EOFError once stdin is exhausted (piped input or
        # Ctrl-D).  Leave the loop instead of crashing so the rest of the
        # script can still run.
        break
    if re.match(r'^(0?[1-9]|1[0-2])$', cmd):
        print('格式正確')
    else:
        print('格式錯(cuò)誤')
# 2. The same month pattern, checked against a fixed sample value.
ret = re.compile(r'^(0?[1-9]|1[0-2])$').match('11')
print(ret.group(0))
# The 31 days of a month: 1-9 with an optional leading zero, 10-29, 30 or 31.
# match() starts at the beginning of the string and the trailing $ pins the
# end, so the whole input has to be a valid day.
ret = re.compile(r'^((0?[1-9])|((1|2)[0-9])|30|31)$').match('31')
print(ret.group(0))
# Match a QQ number: a non-zero leading digit followed by 4 to 10 more digits
# (5 to 11 digits in total).
# NOTE(review): the prompt string below appears to contain stray pinyin
# annotations such as '(qǐng)'; it is runtime text and is left untouched here
# -- confirm the intended wording.
while True:
    try:
        cmd = input('請(qǐng)輸入你扣扣號(hào):>>')
    except EOFError:
        # input() raises EOFError once stdin is exhausted (piped input or
        # Ctrl-D).  Leave the loop instead of crashing so the rest of the
        # script can still run.
        break
    if re.match(r'^[1-9][0-9]{4,10}$', cmd):
        print('輸入正確')
    else:
        print('輸入錯(cuò)誤')
# Floating-point number: optional minus sign, integer digits, then an
# optional decimal point and optional fraction digits.
ret = re.compile(r'-?\d+\.?\d*').match('21.5')
print(ret.group(0))
數字匹配
爬蟲練習
import requests
import re
import json
def getPage(url, timeout=10):
    """Download *url* with an HTTP GET request and return the body as text.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up.  Without a
            timeout ``requests`` may wait indefinitely on an unresponsive
            server, so a finite default is used.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.exceptions.RequestException: If the request fails, including
            ``requests.exceptions.Timeout`` when no response arrives within
            *timeout* seconds.
    """
    response = requests.get(url, timeout=timeout)
    return response.text
# Matches one movie entry of a listing page.  re.S lets '.' span the line
# breaks inside an entry.  Raw strings keep \d intact for the regex engine.
# NOTE(review): the trailing label was '評(píng)價(jià)', where '(píng)' and
# '(jià)' are parsed as capture groups that page text cannot satisfy.  It is
# restored to the simplified-Chinese label 评价 that the listing page is
# expected to use -- confirm against the live HTML.
_MOVIE_ITEM_RE = re.compile(
    r'<div class="item">.*?<div class="pic">.*?<em .*?>(?P<id>\d+).*?<span class="title">(?P<title>.*?)</span>'
    r'.*?<span class="rating_num" .*?>(?P<rating_num>.*?)</span>.*?<span>(?P<comment_num>.*?)评价</span>',
    re.S,
)


def parsePage(s):
    """Yield one dict per movie entry found in the HTML text *s*.

    Args:
        s: HTML source of a listing page.

    Yields:
        A dict with the string values captured by the pattern's named groups:
        ``"id"`` (digits inside the ``<em>`` tag), ``"title"``,
        ``"rating_num"`` and ``"comment_num"`` (the text that precedes the
        评价 label, kept verbatim rather than converted to a number).
        Nothing is yielded when *s* contains no entry.
    """
    for match in _MOVIE_ITEM_RE.finditer(s):
        # The pattern defines exactly the four named groups listed above, so
        # groupdict() produces the complete record.
        yield match.groupdict()
def main(num):
    """Fetch one listing page and append its entries to ``move_info7``.

    Each entry produced by ``parsePage`` is printed and written to the file
    as one JSON object per line.

    Args:
        num: Value substituted for the ``start`` query parameter of the URL.
    """
    url = 'https://movie.douban.com/top250?start=%s&filter=' % num
    response_html = getPage(url)
    ret = parsePage(response_html)
    # Prints the object returned by parsePage itself, not the entries.
    print(ret)
    # The context manager closes the file even if parsing or writing raises;
    # previously the handle was never closed.
    with open("move_info7", "a", encoding="utf8") as f:
        for obj in ret:
            print(obj)
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            data = json.dumps(obj, ensure_ascii=False)
            f.write(data + "\n")
if __name__ == '__main__':
    # Ten pages, requested with start offsets 0, 25, ..., 225.
    for offset in range(0, 250, 25):
        main(offset)