from bs4 import BeautifulSoup
import requests
import time
def url_get(url_number):
    """Crawl bj.xiaozhu.com search pages and print one dict per listing.

    Search pages are numbered from 1 and ``url_number`` is an exclusive
    upper bound (``range(1, url_number)``): ``url_get(2)`` crawls page 1
    only, and any value <= 1 performs no requests at all.

    For every listing link found on a search page, the detail page is
    fetched and its title, address, price, first photo, host avatar, host
    name and host sex are printed as a dict.

    Args:
        url_number: Exclusive upper bound of the search-page numbers to crawl.

    Returns:
        None. Results are written to stdout with ``print``.

    Raises:
        requests.RequestException: If a request fails or exceeds the timeout.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    request_timeout = 10  # seconds
    search_pages = [
        'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(page_no)
        for page_no in range(1, url_number)
    ]
    for search_page in search_pages:
        time.sleep(4)  # throttle: pause before every request
        search_resp = requests.get(search_page, timeout=request_timeout)
        # BeautifulSoup needs the markup string (.text), not the Response
        # object itself (passing the Response fails with "has no len()").
        search_soup = BeautifulSoup(search_resp.text, 'lxml')
        for link in search_soup.select('div[id="page_list"] > ul > li > a'):
            detail_url = link.get('href')
            if not detail_url:
                # An anchor without href cannot be followed; skip it instead
                # of handing None to requests.get.
                continue
            time.sleep(4)
            detail_resp = requests.get(detail_url, timeout=request_timeout)
            detail_soup = BeautifulSoup(detail_resp.text, 'lxml')
            for data in _extract_listings(detail_soup):
                print(data)


def _classify_sex(tag):
    """Map the first CSS class of the host badge tag to a sex label."""
    # tag.get('class') is a list of class names (or None when the attribute
    # is absent); a missing or empty list is treated like an empty name.
    class_names = tag.get('class') or ['']
    first_class = class_names[0]
    # NOTE(review): the class-to-label mapping is kept exactly as originally
    # written; confirm against the site's markup that 'member_ico1' really
    # marks a male host.
    if first_class == 'member_ico1':
        return "man"
    if first_class == '':
        return "unknown"
    return "women"


def _extract_listings(detail_soup):
    """Yield one result dict per complete set of fields on a detail page."""
    titles = detail_soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
    addresses = detail_soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span')
    prices = detail_soup.select('div.day_l > span')
    imgs = detail_soup.select('div.pho_show_l > div > div > img')
    human_imgs = detail_soup.select('div.js_box.clearfix > div.member_pic > a > img')
    names = detail_soup.select('div.js_box.clearfix > div.w_240 > h6 > a')
    sexs = detail_soup.select(' div.js_box.clearfix > div.member_pic > div')
    # zip stops at the shortest list, so a page where any selector matches
    # nothing yields no result at all.
    for title, address, price, img, human_img, name, sex in zip(
            titles, addresses, prices, imgs, human_imgs, names, sexs):
        yield {
            'title': title.get_text(),
            'address': address.get_text().strip(),
            'price': price.get_text(),
            'img': img.get('src'),
            'human_img': human_img.get('src'),
            'name': name.get_text(),
            'sex': _classify_sex(sex),
        }
# Script entry: the argument is an exclusive upper bound on the search-page
# number, so this crawls search page 1 only.
url_get(url_number=2)
重點
- 性別判斷
sex.get('class')[0]
sex.get('class') 返回的是一個列表,所以用 [0] 取出第一個 class 名稱
- CSS 選擇器中 class 用點(例如 div.con_l)
屬性用 [id="xxxx"]
- strip() 用來去除字串首尾的空白符
最后編輯于 :
©著作權歸作者所有,轉載或內容合作請聯系作者