Note: this article is intended only for beginners to study and exchange ideas; please do not use it for any other purpose.
1. Analysis
- A quick analysis shows that, with the exception of Beijing, the new-house and second-hand-house URLs of all cities share the same pattern. Taking Shanghai as an example, the new-house list is https://sh.newhouse.fang.com/house/s/ and the second-hand list is https://sh.esf.fang.com/; only the city abbreviation differs. So once we have the list of all cities, we can crawl the new-house and second-hand listings of every city (a small sketch of the URL derivation follows this list).
- Open the Fang.com (房天下) home page and look for "更多城市" (more cities). Clicking it brings up the city list, which is exactly the page we want to start crawling from: https://www.fang.com/SoufunFamily.htm
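To make the pattern concrete, here is a minimal sketch of the URL derivation; the helper name build_city_urls is purely illustrative, and it mirrors the logic used later in the spider's parse method, including the Beijing special case. Note that the spider builds the subdomains as newhouse.<city>.fang.com / esf.<city>.fang.com rather than <city>.newhouse.fang.com; that is the form the original code requests.

def build_city_urls(city_url):
    """Derive the new-house and second-hand (esf) list URLs from a city
    home page such as 'https://sh.fang.com/'."""
    scheme, domain = city_url.split("//")   # e.g. 'https:' and 'sh.fang.com/'
    if 'bj.' in domain:
        # Beijing uses its own fixed domains
        return 'https://newhouse.fang.com/house/s/', 'http://esf.fang.com/'
    newhouse_url = scheme + '//' + 'newhouse.' + domain + 'house/s/'
    esf_url = scheme + '//' + 'esf.' + domain
    return newhouse_url, esf_url

print(build_city_urls('https://sh.fang.com/'))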
2. Writing the code
The rest is mostly code. The bulk of the work is figuring out the XPath for each piece of information; once you are used to it, you will find it is largely manual labor... (a quick way to test XPaths interactively is sketched below).
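Before writing the spider, it saves time to verify each XPath interactively. A minimal sketch of such a session, started with scrapy shell https://www.fang.com/SoufunFamily.htm (inside the shell, response is already bound to the downloaded page; the expressions are the ones the spider below ends up using):

trs = response.xpath("//div[@class='outCont']//tr")
print(len(trs))                              # how many rows the city table has

for tr in trs[:3]:                           # peek at the first few rows
    tds = tr.xpath(".//td[not(@class)]")
    if len(tds) >= 2:
        print(tds[0].xpath(".//text()").get())           # province cell
        print(tds[1].xpath(".//a/text()").getall()[:5])  # first few city names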
# -*- coding: utf-8 -*-
"""
items.py
"""
import scrapy
class NewHouseItem(scrapy.Item):
    province = scrapy.Field()      # province
    city = scrapy.Field()          # city
    name = scrapy.Field()          # project name
    price = scrapy.Field()         # price
    rooms = scrapy.Field()         # number of rooms
    ares = scrapy.Field()          # floor area
    address = scrapy.Field()       # address
    district = scrapy.Field()      # district
    sale = scrapy.Field()          # on sale or not
    origin_url = scrapy.Field()    # original url

class ESFHouseItem(scrapy.Item):
    province = scrapy.Field()      # province
    city = scrapy.Field()          # city
    name = scrapy.Field()          # estate name
    price = scrapy.Field()         # total price
    rooms = scrapy.Field()         # number of rooms
    floor = scrapy.Field()         # floor
    toward = scrapy.Field()        # orientation
    year = scrapy.Field()          # year built
    ares = scrapy.Field()          # floor area
    address = scrapy.Field()       # address
    unit = scrapy.Field()          # unit price
    origin_url = scrapy.Field()    # original url
The spider code is as follows:
# -*- coding: utf-8 -*-
"""
soufang.py
"""
import re
import scrapy
from scrapy_redis.spiders import RedisSpider
from fang.items import NewHouseItem, ESFHouseItem
class SoufangSpider(RedisSpider):
    name = 'soufang'
    allowed_domains = ['fang.com']
    # start_urls = ['https://www.fang.com/SoufunFamily.htm']
    redis_key = "soufang:start_urls"

    def parse(self, response):
        trs = response.xpath("//div[@class='outCont']//tr")
        province = ''
        for tr in trs:
            tds = tr.xpath(".//td[not(@class)]")
            province_td = tds[0]
            province_text = province_td.xpath(".//text()").get()
            province_text = re.sub(r"\s", "", province_text)
            # the province cell is only filled on the first row of each province
            if province_text:
                province = province_text
            if province == '其它':  # skip the "other" (overseas) section
                continue
            city_td = tds[1]
            city_links = city_td.xpath(".//a")
            for city_link in city_links:
                city = city_link.xpath(".//text()").get()
                city_url = city_link.xpath(".//@href").get()
                # split e.g. 'https://sh.fang.com/' into scheme and domain
                url_module = city_url.split("//")
                scheme = url_module[0]
                domain = url_module[1]
                if 'bj.' in domain:
                    # Beijing does not follow the common pattern
                    newhouse_url = 'https://newhouse.fang.com/house/s/'
                    esf_url = 'http://esf.fang.com/'
                else:
                    newhouse_url = scheme + '//' + 'newhouse.' + domain + 'house/s/'
                    esf_url = scheme + '//' + 'esf.' + domain
                yield scrapy.Request(url=newhouse_url, callback=self.parse_newhouse, meta={"info": (province, city)})
                yield scrapy.Request(url=esf_url, callback=self.parse_esf, meta={"info": (province, city)})
                break  # only the first city of the first row is crawled while testing;
            break      # remove both breaks to crawl every city

    def parse_newhouse(self, response):
        province, city = response.meta.get('info')
        lis = response.xpath("//div[contains(@class, 'nl_con')]/ul/li")
        for li in lis:
            li_sect = li.xpath(".//div[@class='nlcd_name']/a/text()")
            if not li_sect:
                # skip advertisement <li> elements that carry no project name
                continue
            name = li_sect.get().strip()
            house_type = li.xpath(".//div[contains(@class, 'house_type')]/a/text()").getall()
            rooms = '/'.join([item.strip() for item in house_type if item.endswith('居')]) or '未知'
            ares = li.xpath("string(.//div[contains(@class, 'house_type')])").get()
            ares = ares.split('-')[1].strip() if '-' in ares else '未知'
            address = li.xpath(".//div[@class='address']/a/@title").get()
            address_info = li.xpath("string(.//div[@class='address'])").get()
            district = re.search(r'.*\[(.*)\].*', address_info).group(1)
            sale = li.xpath(".//div[contains(@class, 'fangyuan')]/span/text()").get()
            price = li.xpath("string(.//div[@class='nhouse_price'])").get().strip()
            origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
            item = NewHouseItem(name=name, rooms=rooms, ares=ares, address=address, district=district, sale=sale,
                                price=price, origin_url=origin_url, province=province, city=city)
            yield item
        next_url = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
        if next_url:
            print('next new-house page >>>', response.urljoin(next_url))
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse_newhouse,
                                 meta={"info": (province, city)})
        else:
            print("no further new-house pages found")

    def parse_esf(self, response):
        province, city = response.meta.get('info')
        print(province, city)
        dls = response.xpath("//div[contains(@class, 'shop_list')]/dl")
        for dl in dls:
            name = dl.xpath(".//span[@class='tit_shop']/text()").get()
            infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
            # defaults in case a field is missing from the listing
            rooms, floor, toward, ares, year = '未知', '未知', '未知', '未知', '未知'
            for info in infos:
                if '廳' in info:
                    rooms = info.strip()
                elif '層' in info:
                    floor = info
                elif '向' in info:
                    toward = info
                elif '㎡' in info:
                    ares = info
                elif '建' in info:
                    year = info
            address = dl.xpath(".//p[@class='add_shop']/span/text()").get()
            price = dl.xpath("string(.//dd[@class='price_right']/span[1])").get()
            unit = dl.xpath("string(.//dd[@class='price_right']/span[2])").get()
            detail_url = dl.xpath(".//p[@class='title']/a/@href").get()
            origin_url = response.urljoin(detail_url)
            item = ESFHouseItem(name=name, rooms=rooms, ares=ares, address=address, toward=toward, floor=floor,
                                price=price, origin_url=origin_url, province=province, city=city, year=year, unit=unit)
            yield item
        # pagination on the esf list is a "下一頁" ("next page") link
        next_url = None
        next_page_info = response.xpath("//div[@class='page_al']//p")
        for info in next_page_info:
            if info.xpath("./a/text()").get() == "下一頁":
                next_url = info.xpath("./a/@href").get()
                print(next_url)
        if next_url:
            print('next second-hand page >>>', response.urljoin(next_url))
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse_esf,
                                 meta={"info": (province, city)})
        else:
            print("no further second-hand pages found")
A download middleware is added to set a random User-Agent request header; it supports two ways of obtaining one (a hard-coded list, or the faker library).
# -*- coding: utf-8 -*-
"""
middlewares.py
"""
import random
from faker import Factory
from scrapy import signals
f = Factory.create()
class UserAgentDownloadMiddleWare(object):
    # middleware that attaches a random User-Agent header to every request
    USER_AGENTS = [
        # Opera
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
        "Opera/8.0 (Windows NT 5.1; U; en)",
        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
        # Firefox
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        # Safari
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
        # Chrome
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
        # 360
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        # Taobao Browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        # Liebao (Cheetah) Browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        # QQ Browser
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        # Sogou Browser
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
        # Maxthon Browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
        # UC Browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    ]

    def process_request(self, request, spider):
        user_agent = random.choice(self.USER_AGENTS)
        # user_agent = f.user_agent()  # alternative: use the faker library (f is created above)
        print(user_agent)
        request.headers['User-Agent'] = user_agent
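As a side note, newer versions of faker expose the same feature through the Faker class, so the commented-out alternative above could also look like the following sketch (whether to use this or the hard-coded list is a matter of taste):

from faker import Faker

fake = Faker()

# each call returns a freshly generated, plausible User-Agent string,
# which could replace random.choice(self.USER_AGENTS) in process_request
print(fake.user_agent())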
The settings part:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
DOWNLOADER_MIDDLEWARES = {
    'fang.middlewares.UserAgentDownloadMiddleWare': 543,
}
########## scrapy-redis settings ##############
# use the scrapy-redis scheduler so requests are stored in Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# make all spiders share the same de-duplication fingerprints
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
ITEM_PIPELINES = {
    "scrapy_redis.pipelines.RedisPipeline": 300
}
# allow pausing and resuming the crawl
SCHEDULER_PERSIST = True
REDIS_HOST = '127.0.0.1'  # Redis server host
REDIS_PORT = 6379         # default Redis port
###############################################
3. Running the spider
Earlier, in the spider code, we defined a Redis key, redis_key = "soufang:start_urls", which tells the spider where to read its start URL from.
- Change into the spiders directory and run the command scrapy runspider soufang.py. The spider starts up but then blocks, listening for a start URL to be pushed to that key.
- So far I have only tested the crawl on Windows and the results look correct. Strictly speaking, a distributed crawl should run on several machines at the same time to show its real effect (so much for "distributed"...), but the goal here is to show the idea. Install Redis locally on Windows, start the server redis-server.exe and then the client redis-cli.exe, and push a start URL from the client with the command: lpush soufang:start_urls https://www.fang.com/SoufunFamily.htm. Here soufang:start_urls is the key defined earlier in soufang.py. Press Enter, and the spider that was blocking above starts crawling; a small sketch for inspecting the scraped items in Redis follows.
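With the RedisPipeline enabled as in the settings above, scraped items end up in a Redis list whose key, by scrapy-redis default, is "<spider name>:items", i.e. soufang:items here (the key name is an assumption based on that default). A minimal sketch using the redis-py client to check what has been collected:

import json

import redis

# connect to the Redis instance configured in settings.py
r = redis.Redis(host='127.0.0.1', port=6379)

# scrapy-redis's RedisPipeline pushes JSON-encoded items to "<spider name>:items" by default
total = r.llen('soufang:items')
print('items collected so far:', total)

if total:
    # look at the most recently stored item
    print(json.loads(r.lindex('soufang:items', -1)))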