最近閑著在家無聊，就看看爬蟲的書籍，突然發現很有趣，就寫了許多代碼，爬取了許多的網站，今天就分享爬取京東的源代碼。
#京東商品信息爬蟲
#爬取京東商品信息并保存到csv格式文件中
#2017-7-23
import os
import requests
import csv
from bs4 import BeautifulSoup
#獲取url請求
def gethtml(kind, page):
    """Fetch one page of JD search results.

    Parameters:
        kind: search keyword, inserted into the URL query string.
        page: 1-based logical page number; JD's 'page' query parameter
              counts half-pages, hence the *2 conversion below.

    Returns:
        The decoded HTML text, or '' if the request fails.
    """
    pagenum = str(2 * page)  # JD paginates in half-pages
    url = ('https://search.jd.com/Search?keyword=' + kind
           + '&enc=utf-8&page=' + pagenum)
    try:
        # timeout so a stalled connection cannot hang the crawl forever
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding  # decode with the detected charset
    except requests.RequestException:  # only network/HTTP errors, not bugs
        print('鏈接異常!!!')
        return ''
    print('爬取第{}頁:'.format(page))
    return r.text
#獲取定位資源
def findhtml(html, httplist):
    """Parse one JD search-result page and collect product rows.

    Parameters:
        html: the page HTML (may be '' when the fetch failed; an empty
              soup simply yields no products).
        httplist: list mutated in place; one
              [title, price, link, comment_count] row is appended
              per product found.
    """
    soup = BeautifulSoup(html, 'lxml')
    # each product card sits in a <div class="gl-i-wrap">
    for item in soup.find_all('div', class_='gl-i-wrap'):
        row = []
        namediv = item.find('div', class_='p-name p-name-type-2')
        title = namediv.a['title']
        href = namediv.a['href']
        row.append(title)
        pricediv = item.find('div', class_='p-price')
        try:
            price = pricediv.strong['data-price']
        except (AttributeError, KeyError, TypeError):
            # price element missing or malformed for this product.
            # NOTE: the original left `price` unbound here on the first
            # iteration, which raised (and silently swallowed) a NameError
            # at print time; defaulting to '' fixes that.
            price = ''
        row.append(price)
        # JD emits protocol-relative links ('//item.jd.com/...'); add a scheme
        if 'https:' not in href:
            row.append('https:' + href)
        else:
            row.append(href)
        commentdiv = item.find('div', class_='p-commit')
        row.append(commentdiv.strong.contents[1].string)
        httplist.append(row)
        if price:
            print('{:^10s}:{:<}元'.format(title, price))
        else:
            print('{:^10s}'.format(title))
#保存資源
def savehtml(ul):
    """Save the scraped product rows to a CSV file.

    Parameters:
        ul: list of [title, price, link, comment_count] rows; falsy
            (empty) entries are skipped.

    Writes 'D:/數據/京東商品信息爬蟲.csv', creating the directory first.
    """
    path = 'D:/數據/'
    # makedirs + exist_ok: creates missing parents and is race-free
    os.makedirs(path, exist_ok=True)
    # newline='' prevents the csv module from emitting blank rows on
    # Windows; utf-8-sig keeps the Chinese headers readable in Excel
    with open(path + '京東商品信息爬蟲.csv', 'w',
              newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['商品', '價格', '鏈接', '評價數'])
        for row in ul:
            if row:
                writer.writerow(row[:4])
#程序主體
if __name__ == '__main__':
    goods = input('請輸入要搜索的物品:')
    yeshu = int(input('請輸入要查詢到的頁數:'))
    ulist = []
    # pages are 1-based; range(1, yeshu+1) replaces the original
    # range(yeshu+1) plus the "skip i == 0" special case
    for i in range(1, yeshu + 1):
        try:
            findhtml(gethtml(goods, i), ulist)
        except Exception:
            # stop crawling on the first hard failure but still keep
            # (and save) whatever was collected so far
            break
    # write the CSV once at the end instead of rewriting it every page
    savehtml(ulist)