# Import the required modules
import json
import re

import requests
# Function definitions
# Fetch the content of a URL with the requests module
def getHTMLText(url):
    """Fetch ``url`` and return the response body as text.

    The request uses a 30-second timeout. The body is decoded with the
    encoding that requests detects from the content
    (``apparent_encoding``) rather than the one declared in the headers.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or an empty string if the request fails or
        the server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: a failed download yields an empty page instead
        # of an exception. Only request errors are swallowed, so
        # KeyboardInterrupt and programming errors still surface.
        return ""
    r.encoding = r.apparent_encoding
    return r.text
# The page is not parsed into a structured tree with BeautifulSoup here;
# the fields are extracted directly with regular expressions instead.
def parserPage(ilt, html):
    """Extract price/title pairs from ``html`` and append them to ``ilt``.

    The page text is searched for ``"view_price":"..."`` and
    ``"raw_title":"..."`` fragments. Matches are paired by position, and
    each pair is appended to ``ilt`` as ``[price, title]``, both strings.
    If the two fields occur a different number of times, the surplus
    matches are ignored.

    Args:
        ilt: List that receives the ``[price, title]`` entries (mutated in
            place).
        html: Page text to scan. An empty value adds nothing.

    Returns:
        None.
    """
    if not html:
        return

    # Capture groups return just the value, so there is no need to split on
    # ':' (which broke on titles containing a colon) or to eval() text that
    # came from the network.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    raw_titles = re.findall(r'"raw_title":"((?:[^"\\]|\\.)*)"', html)

    for price, raw_title in zip(prices, raw_titles):
        try:
            # Decode escape sequences such as \u0026 without executing code.
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body; keep the text as matched.
            title = raw_title
        ilt.append([price, title])
# Print the results
def printGoodsList(ilt):
    """Print the collected goods as a tab-separated table on stdout.

    A header row is printed first, followed by one row per entry with a
    running index that starts at 1.

    Args:
        ilt: Sequence of entries where index 0 is the price and index 1 is
            the title.

    Returns:
        None.
    """
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("序號", "價格", "商品名稱"))
    for count, g in enumerate(ilt, start=1):
        print(tplt.format(count, g[0], g[1]))
def main():
    """Crawl the search result pages for one keyword and print the goods.

    Builds the search URL for the hard-coded keyword, downloads ``depth``
    result pages, collects the price/title pairs from each one, and prints
    the combined table once at the end.
    """
    goods = '太平鳥'
    depth = 10
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for i in range(depth):
        try:
            # The 's' parameter advances by 44 per page, presumably the
            # number of results per page (confirm against the site).
            url = start_url + '&s=' + str(44 * i)
            html = getHTMLText(url)
            parserPage(infoList, html)
        except Exception:
            # A failure on one page must not abort the remaining pages.
            continue
    # Print once after all pages are collected; printing inside the loop
    # repeated the growing table after every page.
    printGoodsList(infoList)
# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()