"""Print the headline links found on the China Tourism Network home page.

Fetches http://www.cntour.cn/, selects the anchors inside the page's
``ul.newsList`` list, and prints one dict per anchor containing its text,
its ``href`` and every run of digits found in that ``href``.
"""
import re

import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent header sent with the request.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
# China Tourism Network
url = 'http://www.cntour.cn/'
# A timeout keeps the script from hanging forever on an unresponsive server.
strhtml = requests.get(url, headers=headers, timeout=10)
# Fail loudly on an HTTP error status instead of silently parsing an error page.
strhtml.raise_for_status()
# Parse the page with the lxml parser.
soup = BeautifulSoup(strhtml.text, 'lxml')
# Select the anchors of the news list.
data = soup.select('#main > div > div.mtop.firstMod.clearfix > div.centerBox > ul.newsList > li > a')
for item in data:
    link = item.get('href')
    result = {
        'title': item.get_text(),
        'link': link,
        # All digit runs in the link (used as the ID); an anchor without an
        # href yields an empty list instead of raising TypeError.
        'ID': re.findall(r'\d+', link or ''),
    }
    print(result)