程序實現的原理很簡單，就是先把網頁提取出來，再提取a標簽，過濾出href。
方法1:
# Method 1: download the page at `url`, pull the href out of every <a> tag
# with a regex, and write one absolute link per line to `localfile`.
# `url`, `localfile` and `BS` must already be defined by the surrounding
# script (BS is presumably the BeautifulSoup class -- confirm the import).
from urlparse import urljoin  # Python 2 stdlib, same generation as urllib2

html = urllib2.urlopen(url).read()
# Optional transcoding step for GB2312-encoded pages:
# html = unicode(html, 'gb2312', 'ignore').encode('utf-8', 'ignore')
content = BS(html).findAll('a')
# Only double-quoted href attributes are recognised by this pattern.
pat = re.compile(r'href="([^"]*)"')
# `with` closes the output file even if an anchor fails to process.
with open(localfile, 'w') as myfile:
    for item in content:
        h = pat.search(str(item))
        if h is None:
            # Anchor without a matching href (e.g. <a name="top">): skip it
            # instead of crashing on h.group(1).
            continue
        # urljoin leaves absolute hrefs untouched and resolves relative ones
        # against `url`, which plain string concatenation gets wrong.
        ans = urljoin(url, h.group(1))
        myfile.write(ans + '\n')
方法2:
def extractlinks(html):
    """Return the href value of every <a> tag in *html* that has one.

    The markup is parsed with ``BS`` (presumably BeautifulSoup -- confirm
    against the file's imports). Anchors that carry no ``href`` attribute,
    such as ``<a name="top">``, are skipped rather than raising ``KeyError``.

    Args:
        html: The page markup to scan.

    Returns:
        A list of href attribute values in document order.
    """
    soup = BS(html)
    return [a['href'] for a in soup.findAll('a') if a.get('href') is not None]
方法3:
# Method 3: fetch a fixed start page and print the href of each <a> tag.
# `BS` must already be defined by the surrounding script (presumably the
# BeautifulSoup class -- confirm the import).
base_url = "http://www.hao123.com"
html = urllib2.urlopen(base_url).read()
soup = BS(html)
# Anchors without an href attribute are skipped; indexing them with
# ["href"] would raise KeyError and abort the whole run.
links = [a["href"] for a in soup.findAll('a') if a.get("href") is not None]
for link in links:
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(link)