今天終於靜下心來學習 Python 爬蟲了，寫了一個爬取貓眼 top100 榜單電影的小爬蟲，效率不高，下次一定加油。
一張效果圖(1~100)
都是上學期學過的，現在複習複習
上代碼
import requests
from bs4 import BeautifulSoup
import urllib
def download(url, num_retries=2, timeout=10):
    """Fetch *url* with a browser-like User-Agent and return the response.

    Args:
        url: Address of the page to fetch.
        num_retries: How many more attempts are allowed when the server
            answers with a 5xx status code.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``requests`` response object, or ``None`` when the request itself
        failed (connection error, timeout, invalid URL, ...). If the server
        still answers 5xx once the retries are used up, that last response
        is returned so the caller can inspect it.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3278.0 Safari/537.36'}
    try:
        html = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # requests exceptions carry no ``reason`` attribute, so report the
        # exception itself instead of raising AttributeError here.
        print('Download error:', e)
        return None
    # requests does not raise for HTTP error statuses, so the 5xx retry has
    # to look at the response rather than live in the except branch.
    if num_retries > 0 and 500 <= html.status_code < 600:
        return download(url, num_retries - 1, timeout)
    return html
def tiqu(html):
    """Extract the board caption and the film entries from one board page.

    Args:
        html: HTML text of a board page.

    Returns:
        A ``(caption, contents)`` tuple. ``caption`` is the string of the
        first ``<p class="board-content">``. ``contents`` holds one list per
        ``<dd>`` of the first ``<dl class="board-wrapper">``: the string of
        the entry's first ``<i>``, the cleaned text of every ``<p>`` that has
        a single string, and finally the text of the fourth ``<p>`` as a
        float.

    Raises:
        IndexError: If the page has no ``board-content`` paragraph or no
            ``board-wrapper`` list, or an entry has fewer than four ``<p>``.
        ValueError: If the text of an entry's fourth ``<p>`` is not a number.
    """
    contents = []
    bsobject = BeautifulSoup(html, "lxml")
    caption = bsobject.find_all('p', {"class": 'board-content'})[0].string
    board = bsobject.find_all('dl', {'class': 'board-wrapper'})[0]
    # Walk only the direct <dd> children. Iterating the <dl> itself also
    # yields the whitespace text nodes between entries, which are plain
    # strings and cannot be searched like tags.
    for entry in board.find_all('dd', recursive=False):
        content = [entry.find('i').string]
        paragraphs = entry.find_all('p')
        for paragraph in paragraphs:
            text = paragraph.string
            # ``.string`` is None when the tag does not hold exactly one
            # string; compare against None itself so that a paragraph whose
            # real text is "None" is not dropped.
            if text is None:
                continue
            content.append(str(text).replace('\n', '').strip())
        # The fourth <p> is read as the score; get_text() joins the text of
        # all of its children (presumably integer and fraction parts —
        # verify against the page markup).
        score = paragraphs[3].get_text()
        content.append(float(score.strip()))
        contents.append(content)
    return caption, contents
def getUrl(url, html):
    """Build the absolute URLs of the board pages for offsets 0, 10, ..., 90.

    Args:
        url: Base URL that the relative page links are resolved against.
        html: HTML text of a board page containing an ``<a class="page_2">``
            pagination link.

    Returns:
        A list of ten absolute URLs, one per offset.

    Raises:
        IndexError: If the page has no ``<a class="page_2">`` link.
        KeyError: If that link has no ``href`` attribute.
    """
    # Imported here because a bare ``import urllib`` does not load the
    # ``urllib.parse`` submodule; relying on some other module having
    # imported it is fragile.
    from urllib.parse import urljoin

    listobject = BeautifulSoup(html, 'lxml')
    href = listobject.find_all('a', {"class": 'page_2'})[0].attrs['href']
    # Drop the last two characters of the link so a new offset can be
    # appended — assumes the href ends in a two-digit offset; TODO confirm.
    urlone = href[:-2]
    return [urljoin(url, urlone + str(num)) for num in range(0, 100, 10)]
def main():
    """Download the board pages, parse each one and print every entry.

    The first page is fetched only to discover the pagination links; every
    URL returned by ``getUrl`` is then downloaded and parsed with ``tiqu``.
    Pages that cannot be downloaded are skipped.
    """
    contents = []
    url = "http://maoyan.com/board/4"
    html = download(url)
    # download() is written to hand back None for a failed request; without
    # this guard ``.text`` would raise AttributeError.
    if html is None:
        return
    for page_url in getUrl(url, html.text):
        page = download(page_url)
        if page is None:
            print('Skipping page:', page_url)
            continue
        # The caption is the same on every page and is not used here.
        _caption, content = tiqu(page.text)
        contents.extend(content)
    for con in contents:
        print(con)


if __name__ == '__main__':
    main()
看起來效果不好，還是上傳文件吧
鏈接:https://pan.baidu.com/s/1VBFH4RUagRMgIINT30xP7g 密碼:tydw