大眾點評網有許多美食,可以直接通過requests和Beautiful Soup進行爬取,其他就不說了,上代碼。
import requests
from bs4 import BeautifulSoup
import csv
import os
# 鏈接url
def gethtml(num, timeout=10):
    """Download one page of the Dianping restaurant search listing.

    Args:
        num: Zero-based page counter. It is appended to the listing URL
            unchanged; ``num + 1`` is used only for the progress message.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded as UTF-8, or ``''`` when the request
        fails (the error is printed instead of being raised).
    """
    number = num + 1
    # Progress message: which page is being fetched.
    print('{:<2d}{:<}{:<}'.format(number, '頁', ':'))
    # NOTE(review): the message shows num + 1 but the URL asks for page
    # `num`, so the first request is ".../p0" -- confirm which numbering
    # the site expects before changing either side.
    url = 'https://www.dianping.com/search/category/5/10/p' + str(num)
    try:
        # Without a timeout, requests waits indefinitely on a stalled
        # connection and the whole scrape hangs.
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException as e:
        # Best effort: report the failure and let the caller continue
        # with the remaining pages.
        print(e)
        return ''
    r.encoding = 'utf-8'  # force UTF-8 decoding of the body
    return r.text
# 爬取資源
def _node_text(node, default=' '):
    """Return the single string held by *node*, or *default* if there is none."""
    if node is None:
        return default
    text = node.string
    return default if text is None else str(text)


def _bold_text(anchor):
    """Return the text of the ``<b>`` inside *anchor*.

    A missing anchor yields ``' '``; an anchor without usable ``<b>`` text
    yields ``''``.
    """
    if anchor is None:
        return ' '
    return _node_text(anchor.b, default='')


def _scores(item):
    """Return the three ``<b>`` values of the item's ``comment-list`` span.

    The result always has exactly three entries. If the span is missing, has
    fewer than three child tags, or any of the first three lacks ``<b>`` text,
    all three entries are ``''``.
    """
    blank = ['', '', '']
    box = item.find('span', class_='comment-list')
    if box is None:
        return blank
    # Direct child tags only, so the lookup does not depend on whitespace
    # text nodes sitting between the score spans.
    children = box.find_all(True, recursive=False)
    if len(children) < 3:
        return blank
    values = [_node_text(child.b, default=None) for child in children[:3]]
    if any(value is None for value in values):
        return blank
    return values


def findhtml(text, ul):
    """Parse one listing page and append one row per shop to *ul*.

    Each appended row is a list of eight strings in this order: shop name,
    review count, mean price, cuisine, location, and the three values read
    from the ``comment-list`` span. The shop name is also printed.

    Args:
        text: HTML of a listing page; an empty string is accepted and
            produces no rows.
        ul: List that receives the rows (mutated in place).

    Returns:
        None.
    """
    if not text:
        return
    soup = BeautifulSoup(text, 'lxml')
    for item in soup.find_all('li', class_=''):
        heading = item.h4
        if heading is None:
            # Not a shop entry; contributes no row.
            continue

        # .string is None when the heading contains nested markup; fall
        # back to the concatenated text so formatting cannot fail on None.
        name = _node_text(heading, default=None)
        if name is None:
            name = heading.get_text(strip=True)
        print('{:^50s}'.format(name))

        review_count = _bold_text(item.find('a', class_='review-num'))
        mean_price = _bold_text(item.find('a', class_='mean-price'))
        cuisine = _node_text(
            item.find('a', {'data-midas-extends': 'module=5_ad_kwcat'}))

        # Location is "<region> <address>", using whichever parts exist.
        region = item.find('a', {'data-midas-extends': 'module=5_ad_kwregion'})
        address = item.find('span', {'class': 'addr'})
        parts = [_node_text(node, default=None) for node in (region, address)]
        parts = [part for part in parts if part is not None]
        location = ' '.join(parts) if parts else ' '

        row = [name, review_count, mean_price, cuisine, location]
        row.extend(_scores(item))
        ul.append(row)
# 保存資源
def savehtml(uls, path='D:/數(shù)據(jù)/', filename='大眾點評南京美食.csv'):
    """Write the collected rows to a CSV file.

    A header row is written first. Every row in *uls* with at least eight
    fields is then written using its first eight fields. Empty, ``None`` and
    shorter rows are skipped.

    Args:
        uls: Iterable of rows (lists of field values).
        path: Output directory; created if it does not exist.
        filename: Name of the CSV file inside *path*.

    Returns:
        None.
    """
    # NOTE(review): the default path and some header labels contain what
    # look like stray "(...)" annotations; they are runtime output and are
    # kept as-is here -- confirm the intended text.
    header = ['店名', '點評數(shù)', '花費', '菜系', '地點', '口味', '環(huán)境', '服務(wù)']
    if path:
        # exist_ok avoids the check-then-create race.
        os.makedirs(path, exist_ok=True)
    # newline='' stops the csv module's "\r\n" terminators from being
    # translated again (blank line after every row on Windows).
    # utf-8-sig can encode every character and its BOM lets spreadsheet
    # tools detect the encoding, so no row is lost to an encoding error.
    with open(os.path.join(path, filename), 'w',
              newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        width = len(header)
        writer.writerows(
            row[:width] for row in uls if row and len(row) >= width)
# main()
def main(i):
    """Scrape the requested number of listing pages and save the result.

    Args:
        i: Number of pages to fetch; anything ``int()`` accepts
            (the script passes the raw string from ``input``).
    """
    page_total = int(i)
    collected = []
    for page_index in range(page_total):
        # Fetch the page, then let findhtml append its rows to `collected`.
        findhtml(gethtml(page_index), collected)
    savehtml(collected)
# Script entry: ask how many listing pages to fetch, then run the scraper.
# input() returns a string; main() converts it with int().
yeshu = input('輸入要查詢的總頁數(shù)(1~50):')
main(yeshu)
感興趣的童鞋可以試試爬取你們地方的美食!!!