用Python的requests、bs4等庫爬取了小豬短租網(wǎng)的杭州租房信息。
注意
- 有些房子居然沒有寫地址,這房還怎么租。
- Python的 UnboundLocalError: local variable 'xxx' referenced before assignment 錯(cuò)誤的成因與處理。
參考了相關(guān)的技術(shù)博客。
其他一些應(yīng)注意的點(diǎn)寫在注釋里
這里總共獲取了300多條信息,下一步應(yīng)該考慮把這些信息存儲(chǔ)在MongoDB里面,并且采用多進(jìn)程再加速爬取速度。
import requests,time
from bs4 import BeautifulSoup
start_urls = ['http://hz.xiaozhu.com/search-duanzufang-p{}-0/'.format(str(i)) for i in range(1,14)] # search-result pages 1-13 for Hangzhou short-term rentals
count = 0  # running number of listings scraped so far (printed by get_info)
urls = []  # detail-page URLs accumulated by get_urls(), consumed by main()
def get_info(url):
    """Fetch one listing detail page, parse it, and print the result.

    Downloads `url`, extracts title/address/price/host fields with
    BeautifulSoup CSS selects, increments the global `count`, and
    prints the info dict. Returns None.

    Fix: the original used a bare `except` that printed 'error' and then
    FELL THROUGH to use `r` — raising the exact
    "UnboundLocalError: local variable 'r' referenced before assignment"
    the post mentions. We now catch only requests errors and return early.
    Missing fields (empty select results) fall back to defaults instead
    of raising IndexError on `[0]`.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn HTTP 4xx/5xx into an exception
        r.encoding = 'utf-8'
    except requests.RequestException:
        print('error')
        return  # skip this listing; `r` was never successfully assigned
    soup = BeautifulSoup(r.text, 'lxml')
    title = soup.select('div.pho_info > h4')
    address = soup.select('span.pr5')
    price = soup.select('div#pricePart > div.day_l > span')
    name = soup.select('a.lorder_name')
    # crude gender check: male hosts get a boy-icon span on the page
    sex = 'boy' if soup.findAll('span', class_='member_boy_ico') else 'girl'
    houseImage = soup.select('img#curBigImage')
    menberImage = soup.select('div.member_pic > a > img')
    info = {
        'title': title[0].get_text().strip() if title else '',
        # some listings have no address at all; fall back to the city name
        'address': address[0].get_text().strip() if address else '杭州',
        'price': price[0].get_text() if price else '',
        'name': name[0].get_text().strip() if name else '',
        'sex': sex,
        'houseImage': houseImage[0].get('src') if houseImage else '',
        'menberImage': menberImage[0].get('src') if menberImage else ''
    }
    global count  # rebinding a module-level name requires `global`
    print(count)
    count = count + 1
    print(info)
def get_urls(url, urls):
    """Append every listing-detail link on search page `url` to `urls`.

    Fix: the original had no status check and no error handling, so a
    single timeout or HTTP error aborted the whole crawl; now a bad
    page is reported and skipped, leaving `urls` unchanged.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # surface HTTP 4xx/5xx as an exception
    except requests.RequestException:
        print('error')
        return  # skip this search page, keep crawling the rest
    soup = BeautifulSoup(r.text, 'lxml')
    links = soup.select('a.resule_img_a')  # anchor wrapping each result image
    for link in links:
        urls.append(link.get('href'))
def main():
    """Collect every detail-page URL from the search pages, then scrape each."""
    # Phase 1: fill the shared `urls` list from all 13 search-result pages.
    for page_url in start_urls:
        get_urls(page_url, urls)
    # Phase 2: fetch and print the details of every collected listing.
    for detail_url in urls:
        get_info(detail_url)

if __name__ == '__main__':
    main()
更新:
把數(shù)據(jù)存入數(shù)據(jù)庫MongoDB。
import requests,time,pymongo
from bs4 import BeautifulSoup
start_urls = ['http://hz.xiaozhu.com/search-duanzufang-p{}-0/'.format(str(i)) for i in range(1,14)] # search-result pages 1-13 for Hangzhou short-term rentals
count = 0  # listings scraped so far; drives the progress percentage
urls = []  # detail-page URLs accumulated by get_urls(), consumed by main()
client = pymongo.MongoClient('localhost',27017) # connect to the local MongoDB server
xiaozhu = client.xiaozhu  # database: xiaozhu
hz = xiaozhu.hz  # collection: hz — one document per listing
def get_info(url):
    """Fetch one listing detail page, parse it, and store it in MongoDB.

    Downloads `url`, extracts title/address/price/host fields with
    BeautifulSoup, inserts the record into the `xiaozhu.hz` collection,
    bumps the global `count`, and redraws an in-place progress line.
    Returns None.

    Fixes: the original bare `except` printed 'error' and then fell
    through to use an unbound `r` (the UnboundLocalError mentioned in
    the post) — we now catch only requests errors and return early.
    The progress denominator was hard-coded to 299; it now uses the
    actual number of collected URLs. Missing fields fall back to
    defaults instead of raising IndexError on `[0]`.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn HTTP 4xx/5xx into an exception
        r.encoding = 'utf-8'
    except requests.RequestException:
        print('error')
        return  # skip this listing; `r` was never successfully assigned
    soup = BeautifulSoup(r.text, 'lxml')
    title = soup.select('div.pho_info > h4')
    address = soup.select('span.pr5')
    price = soup.select('div#pricePart > div.day_l > span')
    name = soup.select('a.lorder_name')
    # crude gender check: male hosts get a boy-icon span on the page
    sex = 'boy' if soup.findAll('span', class_='member_boy_ico') else 'girl'
    houseImage = soup.select('img#curBigImage')
    menberImage = soup.select('div.member_pic > a > img')
    info = {
        'title': title[0].get_text().strip() if title else '',
        # some listings have no address at all; fall back to the city name
        'address': address[0].get_text().strip() if address else '杭州',
        'price': price[0].get_text() if price else '',
        'name': name[0].get_text().strip() if name else '',
        'sex': sex,
        'houseImage': houseImage[0].get('src') if houseImage else '',
        'menberImage': menberImage[0].get('src') if menberImage else ''
    }
    global count  # rebinding a module-level name requires `global`
    count = count + 1
    hz.insert_one(info)  # persist one document into xiaozhu.hz
    total = len(urls) or 1  # real URL count (was hard-coded 299); avoid /0
    # '\r' + end="" redraws the same console line for a live progress display
    print('\r當(dāng)前進(jìn)度為:{:.2f}%'.format((count / total) * 100), end="")
def get_urls(url, urls):
    """Append every listing-detail link on search page `url` to `urls`.

    Fix: the original had no status check and no error handling, so a
    single timeout or HTTP error aborted the whole crawl; now a bad
    page is reported and skipped, leaving `urls` unchanged.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # surface HTTP 4xx/5xx as an exception
    except requests.RequestException:
        print('error')
        return  # skip this search page, keep crawling the rest
    soup = BeautifulSoup(r.text, 'lxml')
    links = soup.select('a.resule_img_a')  # anchor wrapping each result image
    for link in links:
        urls.append(link.get('href'))
def main():
    """Collect every detail-page URL, then scrape and store each listing."""
    # Phase 1: fill the shared `urls` list from all 13 search-result pages.
    for page_url in start_urls:
        get_urls(page_url, urls)
    # Phase 2: fetch, parse, and insert every collected listing into MongoDB.
    for detail_url in urls:
        get_info(detail_url)

if __name__ == '__main__':
    main()