Web Scraping with Python, Part 2: Wikipedia
Randomly Following Links
Grab the entry hyperlinks from a Wikipedia page and follow one of them at random. The sidebar and footer contain other links that we don't want, so we restrict the search to the article body, which lives in the div tag with id bodyContent.
import random
import re

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                  ' Chrome/52.0.2743.116 Safari/537.36 Edge/15.16193'}

start_url = '/wiki/Wiki'

def get_links(url):
    r = requests.get('https://en.wikipedia.org' + url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    # Match article links of the form /wiki/some_words
    link_list = soup.find('div', id='bodyContent').find_all('a', href=re.compile(r'^/wiki/[^/]*$'))
    return link_list

links = get_links(start_url)
while len(links) > 0:
    # Pick one link at random
    link = random.choice(links).get('href')
    print(link)
    # The new entry's links replace the old list; keep walking
    links = get_links(link)
/wiki/Personal_wiki
/wiki/Database_management_system
/wiki/Netezza
/wiki/C%2B%2B
/wiki/C%2B%2B#Standardization
/wiki/ISO_9984
/wiki/Georgian_script
...
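Note that the pattern ^/wiki/[^/]*$ still admits fragment links such as /wiki/C%2B%2B#Standardization above, as well as namespace pages like /wiki/Help:Contents. A stricter pattern (my own variant, not from the book) excludes both; a minimal check:

import re

# Exclude '/', ':' (namespace pages) and '#' (in-page fragments)
article_re = re.compile(r'^/wiki/[^:/#]*$')

for href in ['/wiki/Wiki', '/wiki/C%2B%2B#Standardization', '/wiki/Help:Contents']:
    print(href, bool(article_re.match(href)))
# /wiki/Wiki True
# /wiki/C%2B%2B#Standardization False
# /wiki/Help:Contents False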
Starting from the main page, put every entry link on it into a set (which deduplicates), then walk the set and recursively search from each link in it.
import re

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                  ' Chrome/52.0.2743.116 Safari/537.36 Edge/15.16193'}

pages = set()

def get_links(url):
    global pages
    r = requests.get('https://en.wikipedia.org' + url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    # Match article links of the form /wiki/some_words, skipping namespace pages with ':'
    link_list = soup.find('div', id='bodyContent').find_all('a', href=re.compile(r'^/wiki/[^:/]*$'))
    for link in link_list:
        if link['href'] not in pages:
            new_page = link['href']
            pages.add(new_page)
            print(new_page)
            get_links(new_page)

if __name__ == '__main__':
    # The empty string means the URL is the Wikipedia main page, https://en.wikipedia.org
    get_links('')
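One caveat: get_links calls itself once for every new page it finds, so a long crawl will eventually exceed Python's default recursion limit (roughly 1000 frames) and raise RecursionError. A breadth-first rewrite with an explicit queue avoids the limit entirely; this is my own sketch, not the book's code:

import re
from collections import deque

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}
pages = set()
queue = deque([''])  # '' stands for the Wikipedia main page

while queue:
    url = queue.popleft()
    r = requests.get('https://en.wikipedia.org' + url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    links = soup.find('div', id='bodyContent').find_all(
        'a', href=re.compile(r'^/wiki/[^:/]*$'))
    for link in links:
        href = link['href']
        if href not in pages:
            pages.add(href)
            print(href)
            queue.append(href)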
Getting an Entry's Title and Body
The title is in the h1 tag, and the body is in the div tag with id mw-content-text.
import re

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                  ' Chrome/52.0.2743.116 Safari/537.36 Edge/15.16193'}

pages = set()

def get_links(url):
    global pages
    r = requests.get('https://en.wikipedia.org' + url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    try:
        # Title lives in the <h1> tag
        print(soup.h1.string)
        # Print only the first paragraph of the body
        print(soup.find(id='mw-content-text').find('p').text)
    except AttributeError:
        print('This page is missing some attributes.')
    # Match article links of the form /wiki/some_words
    link_list = soup.find('div', id='bodyContent').find_all('a', href=re.compile(r'^/wiki/[^:/]*$'))
    for link in link_list:
        if link['href'] not in pages:
            new_page = link['href']
            pages.add(new_page)
            print('----------\n' + new_page)
            get_links(new_page)

if __name__ == '__main__':
    # The empty string means the URL is the Wikipedia main page, https://en.wikipedia.org
    get_links('')
Main Page
Noye's Fludde is a one-act opera written largely for young amateur performers, created by the British composer Benjamin Britten. First performed in 1958 at the annual Aldeburgh Festival, it is based on the 15th-century Chester "mystery" play which recounts the biblical story of Noah, the flood and the ark. Britten had written numerous works for mixed
...
--------
/wiki/Wikipedia
Wikipedia
Wikipedia (/ˌwɪkɪˈpiːdiə/ or /ˌwɪkiˈpiːdiə/ WIK-i-PEE-dee-ə) is a free online encyclopedia with the aim to allow anyone to edit articles.[3] Wikipedia is the largest and most popular general reference work on the Internet[4][5][6] and is ranked among the ten most popular websites.[7] Wikipedia is owned by the nonprofit Wikimedia Foundation.[8][9][10]
--------
/wiki/Main_Page
...
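As written, this crawl would try to walk essentially all of English Wikipedia. For test runs it helps to cap the recursion depth; depth and max_depth below are hypothetical parameters I added, not part of the book's code (headers and pages are defined as above):

def get_links(url, depth=0, max_depth=2):
    # Stop descending once the depth cap is reached
    if depth > max_depth:
        return
    r = requests.get('https://en.wikipedia.org' + url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    link_list = soup.find('div', id='bodyContent').find_all('a', href=re.compile(r'^/wiki/[^:/]*$'))
    for link in link_list:
        if link['href'] not in pages:
            new_page = link['href']
            pages.add(new_page)
            get_links(new_page, depth + 1, max_depth)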
Finding External Links
Starting from https://www.oreilly.com, keep hunting for external links. If a page has no external links, step into one of its internal links and look for external links again from there. This example doesn't feel like a great one to me, because other external links may well lead back to the starting page.
import random
import re
from urllib.parse import urljoin  # resolves relative links against the current page

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                  ' Chrome/52.0.2743.116 Safari/537.36 Edge/15.16193'}

def get_random_external_link(start_page):
    r = requests.get(start_page, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    # The first element of the split address is usually the site's own domain
    ex_links = get_external_links(soup, split_address(start_page)[0])
    # If the page has no external links, collect its internal links, pick one
    # at random, and recurse until an external link turns up.
    if len(ex_links) == 0:
        internal_links = get_internal_links(soup, split_address(start_page)[0])
        return get_random_external_link(urljoin(start_page, random.choice(internal_links)))
    else:
        return random.choice(ex_links)

def get_internal_links(bs, include_url):
    internal_links = []
    # Links that start with '/' or contain the site's own domain are internal
    in_links = bs.find_all('a', href=re.compile(r'^/|' + re.escape(include_url)))
    for link in in_links:
        if link['href'] not in internal_links:
            internal_links.append(link['href'])
    return internal_links

def get_external_links(bs, exclude_url):
    external_links = []
    # Links that start with http/https and do not contain the site's own domain
    # are external; (?!...) is a negative lookahead, and re.escape keeps the
    # dots in the domain from acting as regex wildcards.
    ex_links = bs.find_all('a', href=re.compile(r'^(https|http)((?!' + re.escape(exclude_url) + ').)*$'))
    for link in ex_links:
        if link['href'] not in external_links:
            external_links.append(link['href'])
    return external_links

def split_address(address):
    address_parts = []
    if address.split(':')[0] == 'http':
        address_parts = address.replace('http://', '').split('/')
    elif address.split(':')[0] == 'https':
        address_parts = address.replace('https://', '').split('/')
    return address_parts

# Follow external links only
def follow_external_only(url):
    external_link = get_random_external_link(url)
    print(external_link)
    follow_external_only(external_link)

all_ex_links = set()
all_in_links = set()

# Collect all external and internal links, printing the external ones
def get_all_external_links(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    internal_links = get_internal_links(soup, split_address(url)[0])
    external_links = get_external_links(soup, split_address(url)[0])
    for link in external_links:
        if link not in all_ex_links:
            all_ex_links.add(link)
            print(link)
    for link in internal_links:
        if link not in all_in_links:
            all_in_links.add(link)
            # Internal links are often relative; resolve them before requesting
            get_all_external_links(urljoin(url, link))

if __name__ == '__main__':
    # follow_external_only('https://www.oreilly.com')
    get_all_external_links('https://www.oreilly.com')
https://www.safaribooksonline.com/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170601+nav
http://shop.oreilly.com/
http://members.oreilly.com
https://www.safaribooksonline.com/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170505+homepage+get+started+now
https://www.safaribooksonline.com/accounts/login/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170203+homepage+sign+in
https://www.safaribooksonline.com/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170710+homepage+get+started+now
https://www.safaribooksonline.com/public/free-trial/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170710+homepage+start+free+trial
https://www.safaribooksonline.com/accounts/login/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170710+homepage+sign+in
...
The code above errors out fairly often; the cause may be the regex matching, or simply network problems.
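Part of that fragility comes from splitting addresses by hand in split_address; urllib.parse.urlparse handles schemes, ports, and missing paths uniformly, and re.escape protects the domain when it goes into a pattern. A small sketch of that idea (my own suggestion, not the book's code):

import re
from urllib.parse import urlparse

def get_domain(address):
    # netloc is the host part, e.g. 'www.oreilly.com'
    return urlparse(address).netloc

def external_pattern(domain):
    # re.escape keeps the dots in the domain from matching arbitrary characters
    return re.compile(r'^https?://((?!' + re.escape(domain) + r').)*$')

domain = get_domain('https://www.oreilly.com/ideas')
print(domain)                                                            # www.oreilly.com
print(bool(external_pattern(domain).match('http://shop.oreilly.com/')))  # True: external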
by @sunhaiyu
2017.7.14