剛剛完成了本周的作業，開始很奇怪除了推廣和轉轉沒有正常的商品了...詢問之后就開始抓轉轉吧，整體感覺難度不大，較好的實踐了本周的知識。
我的成果
Paste_Image.png
我的代碼
from bs4 import BeautifulSoup
import requests
import time
# Request headers passed to requests.get so the request carries a desktop
# Chrome User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
}
def get_info(url, timeout=10):
    """Fetch one item detail page and print the fields scraped from it.

    Args:
        url: Address of the detail page to request.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the whole
            crawl indefinitely.

    Returns:
        None. One dict with the keys ``title``, ``cate``, ``price``, ``area``
        and ``pageview`` is printed per tuple produced by zipping the five
        selector results.
    """
    # Pause before every request so detail pages are not fetched back-to-back.
    time.sleep(2)
    wb_data = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(wb_data.text, 'lxml')

    titles = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > h1')
    cates = soup.select('#nav > div > span > a')
    prices = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span.price_now > i')
    areas = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.palce_li > span > i')
    pageviews = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > p > span.look_time')

    # zip() stops at the shortest list: if any selector matches nothing,
    # no record is printed for this page.
    for title, cate, price, area, pageview in zip(titles, cates, prices, areas, pageviews):
        data = {
            'title': title.get_text(),
            'cate': cate.get_text(),
            'price': price.get_text(),
            'area': area.get_text(),
            'pageview': pageview.get_text(),
        }
        print(data)
def get_links(url='http://bj.58.com/pbdn/', timeout=10):
    """Collect detail-page URLs from a listing page.

    Args:
        url: Listing page to scan. Defaults to the address the script was
            originally hard-coded to.
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so a stalled connection cannot hang the script.

    Returns:
        A list of href values, each cut at the first ``?``, taken from the
        anchors whose ``onclick`` attribute equals
        ``clickLog('from=zzpc_infoclick');``.
    """
    # Send the same headers as get_info so both requests look alike to the site.
    page_data = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(page_data.text, 'lxml')
    links = soup.select('#infolist > div.infocon > table > tbody > tr > td.t > a')

    urls = []
    for link in links:
        # Only anchors carrying this exact onclick value are kept; every
        # other entry in the listing is skipped.
        if link.get('onclick') != "clickLog('from=zzpc_infoclick');":
            continue
        href = link.get('href')
        # An anchor without an href would make .split() fail on None.
        if not href:
            continue
        # Drop the query string so only the bare page address remains.
        urls.append(href.split('?')[0])
    return urls
# Script driver: gather the detail links from the listing page, then scrape
# each one in listing order.
urls = get_links()
for detail_url in urls:
    get_info(detail_url)
總結
- 發現網頁上已經沒有發帖時間和成色了
- 排除推廣頁面的時候，用 if 語句判斷了一個字段篩選
- 兩個函數，一個抓鏈接，一個抓詳情