元素定位
requests返回的response是html格式,我們需要把需要的數(shù)據(jù)提取出來,那么就需要元素定位。常用的元素定位方式有xpath和css,如果你熟悉javascript,也可以使用pyquery。
相關(guān)的庫有lxml,BeautifulSoup(官方已經(jīng)將BeautifulSoup改名為bs4了)。相關(guān)的教程太多了,這里為了完整性,舉一個xpath例子,做個小總結(jié)。
例子是抓取美容下所有分類和具體項(xiàng)目的相關(guān)信息。
# -*- coding:utf-8 -*-
"""
File Name : 'Spider_soyoung'.py
Description:
Author: 'chengwei'
Date: '2016/4/22' '9:43'
"""
import sys
import requests
import json
import random
import redis
import logging
import pymssql
import copy
import datetime
import time
import json
from lxml import etree
import re
reload(sys)
sys.setdefaultencoding('utf8')
class Spider_plastics(object):
    """Crawler for plastics.517mr.com.

    Collects every project link from the nine category sections on the front
    page, visits each project page, and stores one row per project in the
    MSSQL table ``kanghua``.
    """

    # (front-page element id, human-readable category name), in page order.
    CATEGORY_SECTIONS = [
        ('zxmr', u'整形美容'),
        ('pfmr', u'皮膚美容'),
        ('zsmr', u'注射美容'),
        ('jgmr', u'激光美容'),
        ('sssx', u'瘦身美容'),
        ('mfzz', u'毛發(fā)種植'),
        ('myjc', u'美牙健齒'),
        ('zymr', u'中醫(yī)美容'),
        ('sbxf', u'失敗修復(fù)'),
    ]

    def __init__(self):
        # Fixed pool of user agents; one is picked at random per session so
        # the requests look less like a single automated client.
        self.user_agents = ['Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
                            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
                            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ (KHTML, like Gecko) Element Browser 5.0',
                            'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
                            'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
                            'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
                            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
                            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)',
                            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
                            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36'
                            ]
        self.root_url = 'http://plastics.517mr.com/'
        # Log to a file named after the class, e.g. "Spider_plastics.log".
        self.logfilename = self.__class__.__name__ + '.log'
        logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s:%(message)s', datefmt='%m/%d/%Y %I:%M:%S %p',
                            filename=self.logfilename, filemode='a')
        # MSSQL connection. NOTE(review): credentials are hard-coded in
        # source; move them to configuration before sharing this code.
        self.conn = pymssql.connect(host='99.48.58.23', user='sa', password='123456', database='meirong', charset="utf8")
        self.cur = self.conn.cursor(as_dict=True)

    def get_detail_url(self):
        """Scrape the front page and return a list of project descriptors.

        Returns:
            list of dicts with keys ``categories``, ``location``,
            ``project_classification`` and ``url`` — one per project link.
        """
        headers = {"User-Agent": random.choice(self.user_agents)}
        session = requests.Session()
        url_list = []
        response = session.get(self.root_url, headers=headers)
        time.sleep(3)  # be polite to the server before parsing/continuing
        selector = etree.HTML(response.text)
        for section_id, category in self.CATEGORY_SECTIONS:
            blocks = selector.xpath('//*[@id="%s"]//div[starts-with(@class,"xm_list")]' % section_id)
            for element in blocks:
                # Hoisted out of the inner loop: the original recomputed these
                # identical values for every link.
                links = element.xpath('.//a/@href')
                # string(.) flattens the block's text; after stripping spaces
                # and tabs, line 0 is the body location and the project names
                # start at line 3, aligned with links[1:].
                names = element.xpath('string(.)').replace(' ', '').replace('\t', '').strip().split('\n')
                for m in range(1, len(links)):  # links[0] is the section header link
                    url_list.append({
                        'categories': category,
                        'location': names[0],
                        'project_classification': names[m + 2],
                        'url': links[m],
                    })
                time.sleep(0.1)
        session.close()
        return url_list

    def _insert_row(self, row, error_message):
        """Insert one scraped row into ``kanghua``.

        Uses a parameterized statement — the original interpolated scraped
        strings straight into the SQL, which breaks on quotes and is an
        injection risk. DB errors are logged (with traceback) and swallowed
        so one bad row does not stop the crawl.
        """
        sql = ("INSERT INTO kanghua (categories, location, project_classification, feature, "
               "apply_to, price, refresh_cycle, attention) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
        try:
            self.cur.execute(sql, (row['categories'], row['location'],
                                   row['project_classification'], row['feature'],
                                   row['apply_to'], row['price'],
                                   row['refresh_cycle'], row['attention']))
            self.conn.commit()
        except Exception:
            logging.exception(error_message)

    def get_detail_info(self):
        """Visit every project page found by get_detail_url() and store it.

        Handles two page layouts: a table of ``diy_tr`` rows, and a simpler
        layout that only exposes a price element.
        """
        headers = {"User-Agent": random.choice(self.user_agents)}
        url_list = self.get_detail_url()
        session = requests.Session()
        for item in url_list:
            res = session.get(item['url'], headers=headers)
            if res.status_code != 200:
                logging.error("%s:%d" % (item['url'], res.status_code))
                continue
            selector = etree.HTML(res.text)
            content = selector.xpath('//*[@id="catelist"]//div[@class = "diy_tr"]')
            content_2 = selector.xpath('//div[@class = "price"]/em')
            # First layout: one table row per project variant.
            for element in content:
                row = {
                    'categories': item['categories'],
                    'location': item['location'],
                    'project_classification': item['project_classification'],
                    'feature': element.xpath('./span[@class = "w1 outer"]')[0].xpath('string(.)').strip().split('\n')[0],
                    'apply_to': element.xpath('./span[@class = "w3 outer"]')[0].xpath('string(.)').strip(),
                    'price': element.xpath('./span[@class = "w4 outer"]')[0].xpath('string(.)').strip(),
                    'refresh_cycle': element.xpath('./span[@class = "w5 outer"]')[0].xpath('string(.)').strip(),
                    # "attention" = number of highlighted <em class="x"> marks.
                    'attention': len(element.xpath('./span[@class = "w6 outer"]')[0].xpath('.//div[@class = "c6"]/em[@class = "x"]')),
                }
                time.sleep(0.6)
                self._insert_row(row, " 第一種布局 INSERT ERROR")
            # Second layout: only a price element is present.
            if len(content_2) != 0:
                logging.info("another css:%s %s %s" % (item['categories'], item['location'], item['project_classification']))
                row = {
                    'categories': item['categories'],
                    'location': item['location'],
                    'project_classification': item['project_classification'],
                    'feature': '',
                    'apply_to': '',
                    'price': content_2[0].xpath('string(.)'),
                    'refresh_cycle': '',
                    'attention': '',
                }
                time.sleep(0.6)
                self._insert_row(row, "第二種布局 INSERT ERROR")
            else:
                # NOTE(review): mirrors the original control flow — an "error"
                # is logged whenever the price layout is absent, even when the
                # table layout above matched. Confirm that is intended.
                logging.error("error:%s %s %s" % (item['categories'], item['location'], item['project_classification']))
if __name__ == '__main__':
    # Run the crawl only when executed as a script, not when imported.
    test = Spider_plastics()
    test.get_detail_info()
xpath說明:
基本語法可參考W3CSchool
獲取某個節(jié)點(diǎn)下的所有文本可以使用string(.)
element.xpath('string(.)')
- 常用的功能函數(shù)
starts-with
//div[starts-with(@id,'res')]
contains和and(.代表當(dāng)前節(jié)點(diǎn),..表示父節(jié)點(diǎn))
//span[contains(.,'_Test') and contains(.,'KPI')]
- Chrome插件XPather,測試xpath的好工具
- BeautifulSoup文檔