HTTP/1.1中Request方法8種:
GET POST HEAD PUT OPTIONS CONNECT TRACE DELETE
真實網(wǎng)頁解析
監(jiān)視網(wǎng)頁:Network
刷新網(wǎng)頁:第一個文件,request和response的信息全部顯示在里面
import requests
from bs4 import BeautifulSoup
import time#插入時間
# Landing page of the New York City attractions list (first page, offset 0).
url = 'http://www.tripadvisor.cn/Attractions-g60763-Activities-New_York_City_New_York.html'
# Paginated list pages: the `oa{}` offset advances 30 listings per page,
# covering offsets 30..1000 (the first page is `url` above).
urls = ['http://www.tripadvisor.cn/Attractions-g60763-Activities-oa{}-New_York_City_New_York.html#ATTRACTION_LIST'.format(str(i)) for i in range(30,1030,30)]
# The logged-in user's "Saves" page; requires the authentication Cookie in `headers`.
user_saves = 'http://www.tripadvisor.cn/Saves#1'
# Request headers identifying the scraper as a normal browser session.
# The Cookie carries the logged-in session state so the server serves the
# authenticated "Saves" page.
# NOTE(review): in the pasted source the Cookie literal was split across two
# physical lines (a syntax error); it is rejoined into one string here.
headers = {
    'user-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    'Cookie': 'TAUnique=%1%enc%3AGt%2BTZhWhYRLlya%2Bb84AAmksGWRwjidrr8w%2F6Ze%2BL2cUnuvWISCXjiA%3D%3D; __gads=ID=14ede999d17f3c90:T=1461160891:S=ALNI_MZNR9_0t0Q1iGGOaY9f7Nxo_uwI4Q; bdshare_firstime=1461163798929; TAAuth2=%1%3%3A0196062bdc62174625411e900aaf8dc0%3AAAbn4kxcinEu%2FY1ZBVHGXA1vuNmknYlm2BX6q79fzLVxpkyNxzjcz03cx%2BjTj%2BnIDud%2FtrnQW1Kj08wg%2BXccFPaCh9673sKMNdESJOiei28DW8p%2F3GkBIRN8MDPdq486%2F3DicH7JxYeiHlJp03fLgXgKM6X%2FMereL6%2F7%2B%2BtKwRdsPT%2F31vFSIDei%2B%2FSSkT60CJ%2FwlSMY3sigkA%2BMWAsoex8%3D; _jzqy=1.1461160723.1461204501.2.jzqsr=baidu|jzqct=tripadvisor.jzqsr=baidu|jzqct=%E7%8C%AB%E9%80%94%E9%B9%B0%E7%BD%91; taMobileRV=%1%%7B%2210021%22%3A%5B1951181%5D%2C%2210028%22%3A%5B60763%5D%7D; ServerPool=A; TASSK=enc%3Ahwdy10o2uWvTDzq0MQZXeA5tD6r7MOpWpPLWsEVezsyeBefYE30WLhybhKPN4yl9; TAPD=tripadvisor.cn; _smt_uid=57178b12.4d58ed6c; _jzqckmp=1; TATravelInfo=V2*A.2*MG.-1*HP.2*FL.3*RVL.60763_153l1687489_153*RS.1; CM=%1%HanaPersist%2C%2C-1%7Ct4b-pc%2C%2C-1%7CHanaSession%2C%2C-1%7CFtrSess%2C%2C-1%7CRCPers%2C%2C-1%7CHomeAPers%2C%2C-1%7CWShadeSeen%2C%2C-1%7CRCSess%2C%2C-1%7CFtrPers%2C%2C-1%7CHomeASess%2C4%2C-1%7Csh%2C%2C-1%7CLastPopunderId%2C137-1859-null%2C-1%7Cpssamex%2C%2C-1%7C2016sticksess%2C%2C-1%7CCCPers%2C%2C-1%7CCpmPopunder_1%2C1%2C1464913708%7CCCSess%2C%2C-1%7CCpmPopunder_2%2C5%2C-1%7CWAR_RESTAURANT_FOOTER_SESSION%2C%2C-1%7Cb2bmcsess%2C%2C-1%7Csesssticker%2C%2C-1%7C%24%2C%2C-1%7C2016stickpers%2C%2C-1%7Ct4b-sc%2C%2C-1%7CMC_IB_UPSELL_IB_LOGOS2%2C%2C-1%7Cb2bmcpers%2C%2C-1%7CMC_IB_UPSELL_IB_LOGOS%2C%2C-1%7Csess_rev%2C11%2C-1%7Csessamex%2C%2C-1%7CSaveFtrPers%2C%2C-1%7CSaveFtrSess%2C%2C-1%7Cpers_rev%2C%2C-1%7CRBASess%2C%2C-1%7Cperssticker%2C%2C-1%7CMetaFtrSess%2C%2C-1%7Cmds%2C%2C-1%7CRBAPers%2C%2C-1%7CWAR_RESTAURANT_FOOTER_PERSISTANT%2C%2C-1%7CMetaFtrPers%2C%2C-1%7C; TAReturnTo=%1%%2FAttraction_Review-g60763-d1687489-Reviews-The_National_9_11_Memorial_Museum-New_York_City_New_York.html; _jzqx=1.1461163798.1464829056.3.jzqsr=tripadvisor%2Ecn|jzqct=/attractions-g60763-activities-new_york_city_new_york%2Ehtml.jzqsr=tripadvisor%2Ecn|jzqct=/attractions-g60763-activities-new_york_city_new_york%2Ehtml; roybatty=AMO%2BuRqD4X6mrI%2FdkihO6SQRm8U1MzgRaLqYtAv1%2BnH%2BbBqTWloasiGsBbHvzicfw5Hz1hzJidthRhOOdKhEyEmdAnN7dLInMp06y2BBQ23lWR4m%2FyebLmBmvWLYuIiDeaGI5CbGAr%2BA%2F3TYUxxLA947TrYhXrXWzQ0uG8paNGZd%2C1; TASession=%1%V2ID.BD0BBE2EED6EB075774995BCEB9C8B43*SQ.20*LS.SavesAjax*GR.56*TCPAR.67*TBR.92*EXEX.53*ABTR.32*PPRP.76*PHTB.6*FS.28*CPU.56*HS.popularity*ES.popularity*AS.popularity*DS.5*SAS.popularity*FPS.oldFirst*TS.5D6F093B439A5AD40CB39E156980DB8B*LF.zhCN*FA.1*DF.0*LP.%2FLangRedirect%3Fauto%3D3%26origin%3Den_US%26pool%3DA%26returnTo%3D%252F*IR.3*OD.en_US*MS.-1*RMS.-1*FLO.60763*TRA.true*LD.1687489; TAUD=LA-1464827274965-1*LG-1988476-2.1.F*LD-1988478-.....; Hm_lvt_2947ca2c006be346c7a024ce1ad9c24a=1464827094; Hm_lpvt_2947ca2c006be346c7a024ce1ad9c24a=1464829074; ki_t=1461160724394%3B1464827095962%3B1464829073873%3B3%3B24; ki_r=; _qzja=1.398601154.1461160723640.1464827095558.1464829055540.1464829055540.1464829073970..0.0.24.7; _qzjb=1.1464829055539.2.0.0.0; _qzjc=1; _qzjto=7.2.0; _jzqa=1.1187422896885783000.1461160723.1464827094.1464829056.7; _jzqc=1; _jzqb=1.2.10.1464829056.1; NPID=',
}
def get_attractions(url, data=None):
    """Scrape (title, image src, categories) triples from a TripAdvisor
    attractions list page and print each as a dict.

    url  -- a list-page URL such as the entries in `urls`.
    data -- unused placeholder kept for interface compatibility; it is
            rebound to each result dict inside the loop.
    """
    wb_data = requests.get(url)
    # .text is the decoded HTML body; 'lxml' selects the parser backend.
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # Only non-aggregated listings carry target="_blank" on the title link,
    # so this selector filters out the aggregate ("see all") entries.
    # NOTE(review): the original selector was 'a[target="_blank]' -- the
    # missing closing quote makes soupsieve raise a selector parse error;
    # fixed here.
    titles = soup.select('div.property_title > a[target="_blank"]')
    # tag[attr="value"] pins the 160px-wide thumbnail images.
    images = soup.select('img[width="160"]')
    # One container per listing; selecting the parent keeps titles/images/
    # categories aligned one-to-one for the zip below.
    cates = soup.select('div.p13n_reasoning_v2')
    for title, img, cate in zip(titles, images, cates):
        data = {
            'title': title.get_text(),
            'img': img.get('src'),
            # stripped_strings yields the text of every descendant tag; a
            # listing can carry several category labels, hence the list().
            'cate': list(cate.stripped_strings),
        }
        print(data)
    # NOTE(review): the printed img src values are all the same placeholder --
    # the site lazy-loads real image URLs as an anti-scraping measure; the
    # real URLs sit elsewhere in the page source and would need separate
    # extraction.


def get_fav(url, data=None):
    """Scrape the logged-in user's saved places ("Saves" page) and print
    each as a dict of title, image src and address.

    Requires the session Cookie in the module-level `headers`.

    NOTE(review): `url` is accepted but ignored -- the request always goes
    to the module-level `user_saves` URL; confirm before relying on the
    parameter.
    """
    # headers carry the login cookie so the server returns the user's page.
    wb_data = requests.get(user_saves, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    titles = soup.select('a.location-name')
    imgs = soup.select('img.photo_image')
    addresses = soup.select('span.format_address')
    for title, img, address in zip(titles, imgs, addresses):
        data = {
            'title': title.get_text(),
            'img': img.get('src'),
            'address': list(address.stripped_strings),
        }
        print(data)
# print(urls)  # debug: inspect the generated page URLs before scraping
# Crawl every paginated attractions list page.
# NOTE(review): in the pasted source the commented print was fused onto the
# same physical line as the `for` statement; reconstructed here.
for single_url in urls:
    get_attractions(single_url)