github地址:https://github.com/A-mermaid-Line-Engineer/python-begin.git
由于畢業(yè)論文要求,用Python做了一個(gè)爬蟲抓取天貓店鋪基本信息,在此共享。
1.安裝Python2.7版本
在Downloads列表選擇Windows版本直接下載安裝。和平時(shí)安裝程序一樣,可以參考百度經(jīng)驗(yàn)http://jingyan.baidu.com/article/19192ad8399a62e53e5707e0.html
2.安裝第三方庫beautifulsoup
http://cuiqingcai.com/1319.html 這個(gè)博客中對beautifulsoup的安裝講得十分明白。
建議直接用Python自帶的pip包安裝
在命令行中輸入
pip install beautifulsoup4
可能還需要安裝lxml,同樣地
pip install lxml
3.使用命令行運(yùn)行程序
Win+R 調(diào)出運(yùn)行框,輸入cmd調(diào)出親切的黑底白字
輸入 cd+空格+程序路徑獲取程序目錄
輸入 python+空格+anay.py(主程序名稱)開始運(yùn)行程序
在彈出的 Please input product:后輸入你想抓取的商品品類,例如雪地靴
等待程序自動(dòng)運(yùn)行并生成表格。
注:抓取前50頁大約3000條信息需要一個(gè)小時(shí)左右。也可以在主程序的page參數(shù)中修改抓取頁數(shù)。
附:主程序源代碼
抓取天貓店鋪相關(guān)信息主程序代碼
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# version python27
'''分析內(nèi)容'''
from get_html import download_html as get
from bs4 import BeautifulSoup as bs
import re,sys,urllib
# Target encoding for re-encoding scraped UTF-8 text to the local
# filesystem/console encoding (used by the .encode(type) calls below).
# NOTE(review): this shadows the builtin `type` for the rest of the module.
type = sys.getfilesystemencoding()
def get_url(key, page = 50): #采集50頁的內(nèi)容撩轰,根據(jù)需求自己改
print 'get urls...'
keyword = urllib.quote(key.strip())
urls = []
i=1
while(i<=page):
url = "https://list.tmall.com/search_product.htm?type=pc&q=%s&totalPage=100&sort=s&style=g&from=mallfp..pc_1_suggest&suggest=0_1&jumpto=%d#J_Filter"%(keyword,i)
urls.append(url)
i = i + 1
return urls
def get_content(url):
    """Fetch a shop-detail page and extract rating/location/age fields.

    url -- product page URL (as returned by get_link)
    Returns a 5-element list: the three <em class="count"> scores
    (description / service / logistics), the shop location, and the
    shop age text.  Any field that cannot be parsed is the string 'None'.
    """
    html = get(url)
    soup = bs(html, 'lxml')
    res = soup.select(".ks-datalazyload")
    ms = re.compile(r"<em\sclass=\"count\"[\s\S]*?>([\s\S]*?)<\/em>", re.I | re.M)
    ar = re.compile(r"<li\sclass=\"locus\"[\s\S]*?>([\s\S]*?)<\/div>", re.I | re.M)
    age = re.compile(r"<span\sclass=\"tm-shop-age-content\"[\s\S]*?>([\s\S]*?)<\/span>", re.I | re.M)
    for i in res:
        block = str(i)
        # findall cannot raise here; an empty result just means the scores
        # are missing (the original bare try/except hid that distinction).
        s = ms.findall(block)
        if not s:
            s = ['None', 'None', 'None']
        try:
            area = ar.findall(block)
            areas = re.sub(r'<[^>]+>', '', area[0].decode('utf-8').encode(type).strip())
            areas = areas.replace('\r', '')
            areas = areas.replace('\n', '')
            areas = areas.replace('\t', '')
            areas = areas.replace(' ', '')
        except (IndexError, UnicodeError):
            areas = 'None'
        try:
            ages = age.findall(block)
            agess = ages[0].decode('utf-8').encode(type).strip()
        except (IndexError, UnicodeError):
            agess = 'None'
        s.append(areas)
        s.append(agess)
        # Only the first .ks-datalazyload container is relevant.
        return s
    # Page had no .ks-datalazyload container at all; previously the function
    # fell through and returned None, crashing the caller's indexing.
    return ['None', 'None', 'None', 'None', 'None']
def get_link(html):
    """Return the https product URL from a search-result snippet.

    html -- HTML fragment containing a '.productTitle a' anchor
            with a protocol-relative href.
    """
    anchors = bs(html, 'lxml').select('.productTitle a')
    return 'https:' + anchors[0].get('href')
def xls(key, url):
    """Scrape one search-result page and append one tab-separated row per
    product to '<key>.xls'.

    key -- product keyword (also the output file name stem)
    url -- search-result page URL produced by get_url()
    Parsing is best-effort: a product card that fails to parse is skipped.
    """
    html = get(url)
    soup = bs(html, 'lxml')
    res = soup.select(".product-iWrap")
    p = re.compile(r"<p\sclass=\"productPrice\">([\s\S]*?)<\/p>", re.I | re.M)
    t = re.compile(r"<p\sclass=\"productTitle\">([\s\S]*?)<\/p>", re.I | re.M)
    c = re.compile(r"<p\sclass=\"productStatus\">([\s\S]*?)<\/span>", re.I | re.M)

    def _clean(fragment):
        # Strip remaining tags, then re-encode for the local encoding.
        return re.sub(r'<[^>]+>', '', fragment).decode('utf-8').encode(type).strip()

    # Open the output once per page instead of once per product row,
    # and let `with` close it even when a row raises.
    with open(key + '.xls', 'a') as f:
        for i in res:
            try:
                block = str(i)
                price = _clean(p.search(block).group(1))
                title = _clean(t.search(block).group(1))
                count = _clean(c.search(block).group(1))
                con = get_content(get_link(block))
                f.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'
                        % (title, price, count, con[0], con[1], con[2], con[3], con[4]))
            except Exception:
                # Best-effort: one malformed card must not abort the page
                # (was a bare `except: pass`, which also swallowed KeyboardInterrupt).
                continue
key = raw_input("Please input product:")
if key.strip() == '':
key = input("Please input product:")
urls = get_url(key)
f = open(key+'.xls','w')
title = '商品名稱\t價(jià)格\t銷量\t描述\t服務(wù)\t物流\t所在地\t開店時(shí)長\n'
f.write(title.decode('utf-8').encode(type))
f.close()
for u in urls:
xls(key,u)
print 'End!'
通用抓取網(wǎng)頁代碼
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# version python27
'''下載網(wǎng)頁'''
import urllib2,gzip,StringIO
def download_html(url, num_retries=2):
print 'Download url:', url
header = {'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'accept-encoding':'gzip, deflate, sdch, br',
'accept-language':'en-US,en;q=0.8',
'cache-control':'max-age=0',
'user_agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
}
try:
req = urllib2.Request(url,headers = header)
page = urllib2.urlopen(req,timeout=10)
rpheader = page.info()
body = page.read()
except urllib2.URLError as e:
print 'Download Error:', e.reason
body = None
if num_retries > 0:
if hasattr(e, 'code') and 500 <= e.code < 600:
return download_html(url, num_retries-1)
encoding = rpheader.get("Content-Encoding")
if encoding == 'gzip':
content=gz_decoding(body).strip()
else:
content=body.strip()
return content
def gz_decoding(data):
compressedstream = StringIO.StringIO(data)
gziper = gzip.GzipFile(fileobj=compressedstream)
data2 = gziper.read()
return data2