我的代碼
import time

import requests
from bs4 import BeautifulSoup
# The search result pages share one URL pattern in which only the page
# number changes; build the addresses for pages 1 through 9.
urls = list(map('http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format, range(1, 10)))
def get_lorder_sex(class_name):
    """Map the CSS class of a host's gender icon to a gender label.

    Args:
        class_name: Value of the icon element's ``class`` attribute. This is
            normally the list of class names that BeautifulSoup returns for
            ``tag.get('class')``; a whitespace-separated string and ``None``
            are accepted as well.

    Returns:
        '女' (female) when ``member_girl_ico`` is present, '男' (male) when
        ``member_boy_ico`` is present, otherwise ``None``.
    """
    if not class_name:
        return None
    # Membership instead of exact list equality, so an icon that carries
    # additional classes is still recognised.
    if isinstance(class_name, str):
        classes = class_name.split()
    else:
        classes = class_name
    if 'member_girl_ico' in classes:
        return '女'
    if 'member_boy_ico' in classes:
        return '男'
    return None
def get_links(url, *, delay=1.0, timeout=10):
    """Fetch one search result page and scrape every listing it links to.

    Args:
        url: Address of a search result page.
        delay: Seconds to pause after each listing has been scraped, so the
            site is not hit with back-to-back requests. Use 0 to disable.
        timeout: Seconds to wait for the result page before requests gives up.

    Each listing link found on the page is passed to ``get_attraction``.
    Returns None.
    """
    wq_data = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(wq_data.text, 'lxml')
    for link in soup.select('#page_list > ul > li > a'):
        href = link.get('href')
        if not href:
            # An anchor without an href has no page to fetch.
            continue
        get_attraction(href)
        if delay > 0:
            time.sleep(delay)
def get_attraction(url, data=None):
    """Fetch one listing detail page and print the fields scraped from it.

    Args:
        url: Address of the listing detail page.
        data: Unused; kept only so the signature stays compatible.

    Prints one dict per complete set of matched elements, with the keys
    'title', 'add', 'rend', 'img', 'img_householder', 'name' and 'gender'.
    Nothing is printed when any of the selectors matches no element, because
    ``zip`` stops at the shortest result list. Returns None.
    """
    wb_data = requests.get(url, timeout=10)
    # Parse the downloaded page with the lxml parser.
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # Selectors were obtained in Chrome (Inspect element -> Copy CSS path),
    # with the :nth-child() parts removed so they match generally.
    titles = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
    adds = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span')
    rends = soup.select('div.day_l > span')
    imgs = soup.select('div.pho_show_l > div.pho_show_big > div > img')
    img_householders = soup.select('div.js_box.clearfix > div.member_pic > a > img')
    names = soup.select('div.js_box.clearfix > div.w_240 > h6 > a')
    genders = soup.select('div.js_box.clearfix > div.w_240 > h6 > span')
    for title, add, rend, img, img_householder, name, gender in zip(
            titles, adds, rends, imgs, img_householders, names, genders):
        # get_text() returns the tag's text and get() returns an attribute
        # value. For the multi-valued 'class' attribute get() returns a list
        # of class names rather than a string.
        listing = {
            'title': title.get_text(),
            'add': add.get_text(),
            'rend': rend.get_text(),
            'img': img.get('src'),
            'img_householder': img_householder.get('src'),
            'name': name.get_text(),
            'gender': get_lorder_sex(gender.get('class')),
        }
        print(listing)
# Crawl each search result page in turn.
for single_url in urls:
    get_links(single_url)
總結
- 使用time.sleep()方法避開網站反爬取
- BeautifulSoup的get('class')得到的是列表,不是字符串(class是多值屬性;src等單值屬性仍返回字符串)
最后編輯于 :
©著作權歸作者所有,轉載或內容合作請聯系作者