```python
# coder: samko  date: 5.20 10:05
# Scraping a single detail page
from bs4 import BeautifulSoup
import requests

c = ['female', 'male']
url = 'http://bj.xiaozhu.com/fangzi/1779571235.html'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')

title = soup.select('h4 > em')
address = soup.select('span.pr5')
img = soup.select('img[id="curBigImage"]')
dailyrent = soup.select('div.day_l > span')
landlordimg = soup.select('div.member_pic > a > img')
landlordname = soup.select('h6 > a[class="lorder_name"]')
landlordgender = soup.select('div.w_240 > h6 > span')
print(img)  # debug: confirm the big image tag was found

for i, j, k, l, m, n, o in zip(title, address, img, dailyrent,
                               landlordgender, landlordimg, landlordname):
    def gender():
        # female landlords carry the member_girl_ico class on the gender span;
        # check the tag's class list (testing `in m` against the tag itself never matches)
        if 'member_girl_ico' in m.get('class', []):
            return c[0]
        else:
            return c[1]

    data = {
        'title': i.get_text(),
        'address': j.get_text(),
        'img': k.get('src'),
        'rent': l.get_text() + '元',
        'lordimg': n.get('src'),
        'lordname': o.get_text(),
        'gender': gender(),
    }
    print(data)
```
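Since each of these selectors effectively matches one node on a detail page, the seven-way `zip` can be avoided. A variant sketch of my own (not the original approach) using BeautifulSoup's `select_one`, which returns the first match directly:

```python
from bs4 import BeautifulSoup
import requests

# Variant sketch: select_one returns a single tag, so the dict can be
# built directly instead of zipping seven parallel one-element lists.
soup = BeautifulSoup(requests.get('http://bj.xiaozhu.com/fangzi/1779571235.html').text, 'lxml')
data = {
    'title': soup.select_one('h4 > em').get_text(),
    'address': soup.select_one('span.pr5').get_text(),
    'rent': soup.select_one('div.day_l > span').get_text() + '元',
}
print(data)
```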
```python
# Scraping multiple detail pages: how to collect the links in batch
from bs4 import BeautifulSoup
import requests, re, urllib.request

links = []
# url = 'http://bj.xiaozhu.com'

def get_page(PageNumbers):
    # each listing page carries 24 links; the argument is a page number
    for page in range(2, PageNumbers):
        full_url = 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(str(page))
        # no need to open and analyze every small page under the big one
        wb_data = requests.get(full_url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        for link in soup.select('a.resule_img_a'):
            # every <a> tag with class resule_img_a is a detail-page link
            links.append(link['href'])
            # for the detail-page analysis, just start from these links!

if __name__ == '__main__':
    get_page(3)
    print(links)

# There is also another way: match all the detail-page URLs directly.
'''
def get_pages():
    r = r'^http://bj.xiaozhu.com/fangzi/\d{9,10}\.html$'
    lalala = re.compile(r)
    lalala.findall(page)
# `page` would be fetched with urllib.request, not analyzed in detail here;
# the remaining steps are the same as for a single detail page!
'''
```
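The commented-out regex version never shows how `page` gets fetched. A hedged sketch of that step, assuming urllib.request is used as the comment says; note the original pattern's `^`/`$` anchors only match a string that is exactly one URL, so they are dropped here so `findall` can scan a whole HTML document:

```python
import re
import urllib.request

# Fetch one listing page the urllib way (assumption: same listing URL scheme
# as in get_page above), then pull every detail-page URL out of the raw HTML.
html = urllib.request.urlopen('http://bj.xiaozhu.com/search-duanzufang-p2-0/').read().decode('utf-8')
detail_urls = re.findall(r'http://bj\.xiaozhu\.com/fangzi/\d{9,10}\.html', html)
print(detail_urls)
```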
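As the last comment says, the remaining steps are the same as for a single detail page. A minimal sketch of gluing the two scripts together (`scrape_detail` is my own helper name, trimmed to two fields for brevity):

```python
from bs4 import BeautifulSoup
import requests

def scrape_detail(url):
    # same selectors as in the single-page script above
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    return {
        'title': soup.select_one('h4 > em').get_text(),
        'rent': soup.select_one('div.day_l > span').get_text() + '元',
    }

get_page(3)          # fills the global `links` list defined above
for link in links:
    print(scrape_detail(link))
```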
Jianshu's Markdown editor is really painful to use; Jupyter is much nicer.