https://www.hardtofind.com.au/
1. Collect the category links, store them as an array, and save them to a file.
2. Fetch each category's listing pages and record them in a CSV file (the URL pattern is sketched below).
3. Read the product links from each category page and scrape the product behind each link.
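A minimal sketch of the paginated category URL pattern that steps 1 and 2 rely on (the slugs, the ?page= parameter, and the 200-page cap are taken from the implementation further down; category_page_urls is a hypothetical helper for illustration):

# Python 2, matching the script in this post
categories = ['gifts', 'home-garden', 'prints-art', 'fashion',
              'jewellery', 'men', 'kids', 'weddings']

def category_page_urls(slug, max_pages=200):
    # yield each paginated listing URL for one category slug
    for page in range(max_pages):
        yield "https://www.hardtofind.com.au/categories/%s?page=%d" % (slug, page)

for url in category_page_urls('gifts', max_pages=3):
    print url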
Import the required modules:
from selenium import webdriver
# base64 encoding/decoding
import base64
# HTML parsing with BeautifulSoup
from bs4 import BeautifulSoup
import time
# throttle requests, e.g. time.sleep(0.1)
from time import sleep
import random
# regular expressions, e.g. match = re.search(r'[\w.-]+@[\w.-]+', Fi_Email)
import re
import csv
import unicodecsv
from cStringIO import StringIO
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import json
import urllib2
from lxml import etree
Image URL analysis: the large and small versions of a product image differ only in the h_/w_ size parameters.
Large image: https://res.cloudinary.com/hardtofind/image/upload/c_pad,h_580,w_580/b_rgb:ffffff,h_580,w_580/cs_srgb,f_auto,fl_lossy/v1/product_image/santa_friends
Small image: https://res.cloudinary.com/hardtofind/image/upload/c_pad,h_235,w_235/b_rgb:ffffff,h_235,w_235/cs_srgb,f_auto,fl_lossy/v1/product_image/santa_friends
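Because the two URLs differ only in those size parameters, the large image URL can be derived from the small one with a plain string replacement. A minimal sketch (to_large_image is a hypothetical helper; the 235/580 values come from the URLs above):

def to_large_image(url):
    # rewrite the 235x235 thumbnail parameters into the 580x580 ones
    return url.replace('h_235,w_235', 'h_580,w_580')

small = ("https://res.cloudinary.com/hardtofind/image/upload/"
         "c_pad,h_235,w_235/b_rgb:ffffff,h_235,w_235/"
         "cs_srgb,f_auto,fl_lossy/v1/product_image/santa_friends")
print to_large_image(small)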
Implementation
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
# Filename: hard.py
# intent: scrape image, title, price and category for each product
from selenium import webdriver
from bs4 import BeautifulSoup
import urllib2
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import unicodecsv
import codecs
import time
import os
# give PhantomJS a desktop browser user agent so the site serves the full page
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 "
    "(KHTML, like Gecko) Chrome/15.0.87"
)
driver = webdriver.PhantomJS(desired_capabilities=dcap)
# step 1: the category slugs to crawl (the pages themselves are fetched with urllib2 below)
hrefarray = ['gifts', 'home-garden', 'prints-art', 'fashion', 'jewellery', 'men', 'kids', 'weddings']
for href in hrefarray:
    page = -1
    while page < 200:
        page = page + 1
        strPageIndex = str(page)
        # step 2: build the paginated category URL and log it to a CSV
        href_a = "https://www.hardtofind.com.au/categories/" + href + "?page=" + strPageIndex
        print "======" + strPageIndex + "===" + href
        csvPageindexfile = open('hrefhardtofind.csv', 'a+')
        writer = unicodecsv.writer(csvPageindexfile, encoding='utf-8')
        writer.writerows([(strPageIndex, href_a)])
        csvPageindexfile.close()
        content = urllib2.urlopen(href_a).read()
        s = BeautifulSoup(content, "html5lib")
        nav_Data = s.find("div", {"id": "products"})
        roothref_data = nav_Data.findAll("div", {"class": "sale-item-text-wrap"})
        index = 0
        maxindex = len(roothref_data) - 1
        for webhref in roothref_data:
            # skip the last tile on each listing page
            if index >= maxindex:
                break
            index = index + 1
            lablehref = webhref.find("a", href=True)
            page_href = lablehref["href"]
            threecontent = urllib2.urlopen(page_href).read()
            s = BeautifulSoup(threecontent, "html5lib")
            page_form = s.find("form", {"id": "form_add_to_cart"})
            page_title = page_form.find("div", {"class": "title"})
            title = page_title.h1.string
            print title
            price = page_title.find("span", {"itemprop": "price"}).string
            print price
            page_image = s.find("div", {"class": "galleryContainer"})
            page_image_a = page_image.findAll("ul", {"id": "imageGallery"})
            # collect the gallery link targets (assumed to be the product image URLs)
            image_links = [a["href"] for a in page_image_a[0].findAll("a")]
            print image_links
            # the breadcrumb trail doubles as the category path, e.g. Home>gifts>...
            page_class = s.find("div", {"class": "pagination"})
            classname = page_class.get_text().strip().encode('utf-8')
            classname = classname.replace(' ', '').replace('\n', '>')
            print classname
            # step 3: append one row per product
            csvfile = open('hardtofind_com_au.csv', 'a+')
            if os.path.getsize('hardtofind_com_au.csv') == 0:
                csvfile.write(codecs.BOM_UTF8)  # BOM once, so Excel detects UTF-8
            writer = unicodecsv.writer(csvfile, encoding='utf-8')
            # step 4: save the row to csv
            writer.writerows([(title, price, classname, "|".join(image_links))])
            csvfile.close()
        # throttle before requesting the next listing page
        time.sleep(3)
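Step 3 of the plan describes reading the saved links back before scraping; the script above scrapes each listing page directly instead. For completeness, a minimal sketch of reading hrefhardtofind.csv back, assuming the two-column rows written in step 2:

import unicodecsv

# read back the (page index, page URL) rows written above
with open('hrefhardtofind.csv', 'rb') as f:
    reader = unicodecsv.reader(f, encoding='utf-8')
    for page_index, page_url in reader:
        print page_index, page_url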