# Scrape article titles and dates from a website (example 005)
import urllib
from bs4 import BeautifulSoup
# Autohome listing page URL; presumably the argument intended for
# get_content_from_autohome -- no call is visible here, confirm with the caller.
url = "http://www.autohome.com.cn/list/c70-1.html"
def get_content_from_autohome(url_address):
    """Download a listing page and map each article title to its date.

    The page at ``url_address`` is fetched and parsed with BeautifulSoup.
    Title entries are the ``div`` elements with id ``ArticlesTitlesLeft``;
    date entries are the ``div`` elements with id ``ArticlesTitlesRigth``.
    As a side effect, every article link and title is printed, followed by
    every date.

    Args:
        url_address: URL of the page to download.

    Returns:
        dict mapping each title (the ``.string`` of the first ``<a>`` inside
        a title div) to the text of the date div at the same position.
        Pairing is positional via ``zip``: surplus entries on the longer
        side are dropped, and a repeated title keeps only its last date.

    Raises:
        Errors raised by ``urlopen`` (unreachable host, bad URL, ...) are
        propagated unchanged.
    """
    from contextlib import closing
    try:
        # Python 3 location of urlopen.
        from urllib.request import urlopen
    except ImportError:
        # Python 2 fallback, matching the original urllib.urlopen call.
        from urllib import urlopen

    # closing() guarantees the response is released even if read() fails.
    with closing(urlopen(url_address)) as response:
        content = response.read()

    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(content, "html.parser")

    # Query each group once and reuse the results for printing and for
    # building the return value.
    title_divs = soup.find_all('div', id="ArticlesTitlesLeft")
    # NOTE(review): "Rigth" is kept exactly as in the original; it presumably
    # mirrors the id used in the page markup -- confirm before "correcting".
    date_divs = soup.find_all('div', id="ArticlesTitlesRigth")

    # NOTE: a title div without an <a> child makes `.a` None, so the lookups
    # below fail for such a div (same as the original behavior).
    for title_div in title_divs:
        print(title_div.a['href'])   # article link
        print(title_div.a.string)    # article title

    for date_div in date_divs:
        print(date_div.get_text())   # article date

    title_strings = [title_div.a.string for title_div in title_divs]
    date_strings = [date_div.get_text() for date_div in date_divs]

    # Pair titles with dates by position and return them as a dictionary.
    return dict(zip(title_strings, date_strings))