目標:通過高德地圖的搜索接口，抓取每個城市的所有銀行的分行信息
思路:1. 在本地mysql中存儲有全國各城市名稱、編碼
2. 將城市編碼讀取到數組中
3. 按照數據讀取每個編碼，組拼URL，通過GET請求訪問接口
4. 獲取xml后解析出我們需要的數據，插入到mysql中
第一步，定義訪問接口的基本參數
# --- Local files -----------------------------------------------------------
file_name = "result.txt"  # name of the result text file
xml_file = "tmp.xml"  # scratch file holding the most recent API response

# --- Paging ----------------------------------------------------------------
each_page_rec = 20  # records per result page (same value as offset=20 below)

# --- Request URL pieces ------------------------------------------------------
url_amap = "city="  # name of the city query parameter
# Everything that precedes the city parameter: endpoint, empty keyword, type code.
url_header = "http://restapi.amap.com/v3/place/text?&keyword=&types=160100&"
# Everything that follows the city parameter; note that it pins page=1.
url_end = (
    "&citylimit=true&&output=xml&offset=20&page=1"
    "&key=c787ae8e49424a657127c3ed64cfe053&extensions=base"
)
第二步，建立本地數據庫訪問請求，獲取數據庫中的所有城市編碼
首先，在本地mysql中建立一張region表，可以從網上down一份全國各地省市區編碼表，結構如下圖:
為了方便大家，我將表的結構及數據導出為sql語句，直接復制到mysql中執行即可，鏈接如下 http://www.reibang.com/p/0b9b0e3cda5f
def getallcity():
    """Return the first column of every second-level row of the ``region`` table.

    A row is selected when its ``parent_id`` equals the ``id`` of a row whose
    own ``parent_id`` is 0. The connection and cursor are always closed,
    whether or not the query succeeds.

    Returns:
        list: ``row[0]`` of each selected row, in the order MySQL returned them.
    """
    sql = "select * from region where parent_id in (select id from region where parent_id=0)"
    connection = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', port=3306,
                                 db='icoachu', charset="utf8")
    try:
        cursor = connection.cursor()
        try:
            cursor.execute(sql)
            # NOTE(review): relies on the city code being the first column of
            # ``region`` -- confirm against the table definition.
            return [row[0] for row in cursor.fetchall()]
        finally:
            cursor.close()
    finally:
        connection.close()
關于如何訪問本地mysql的，比較簡單，此處不做說明，需要強調的是在try 語句中，一定要在finally中關閉cursor及connection。
第三步，通過接口訪問獲取html數據，并將數據寫入到文件中
def gethtml(url):
    """Download ``url`` and save the raw response body to ``xml_file``.

    The body is written as bytes without decoding and replaces whatever the
    file held before.

    Args:
        url: Address passed to ``urllib.request.urlopen``.

    Returns:
        int: 0 when the body was fetched and saved, -1 when the request or
        the file write failed with an ``OSError``.
    """
    try:
        # The with-block closes the HTTP response even if read() fails.
        with urllib.request.urlopen(url) as page:
            html = page.read()
        with open(xml_file, 'wb') as xml_file_handle:
            xml_file_handle.write(html)
    except OSError as err:
        # urllib's URLError/HTTPError derive from OSError, so network
        # failures are reported here together with local file errors.
        print("IO error: " + str(err))
        return -1
    return 0
第四步，獲取xml格式的數據之后，解析相關字段，并插入到mysql中
def parsexml():
    """Parse the response stored in ``xml_file`` and insert its POIs into MySQL.

    Every ``<poi>`` found under the first ``<pois>`` element of each
    ``<response>`` becomes one row of the ``bankinfo`` table. All rows of a
    call share one connection; each insert is committed on its own.

    Returns:
        The text of the ``<count>`` element (a string) when present,
        otherwise the integer 1. 1 is also returned when ``xml_file`` cannot
        be read.

    Raises:
        Exception: if an insert does not report exactly one affected row.
    """
    total_rec = 1  # fallback when the response carries no <count>

    def text_of(parent, tag):
        # Missing or empty elements have no text node; report them as ''.
        found = parent.getElementsByTagName(tag)
        if not found or found[0].firstChild is None:
            return ''
        return found[0].firstChild.nodeValue or ''

    try:
        dom = minidom.parse(xml_file)
    except OSError as err:
        print("IO error: " + str(err))
        return total_rec

    rows = []
    for node in dom.getElementsByTagName("response"):
        count_text = text_of(node, 'count')
        if count_text:
            total_rec = count_text
        pois = node.getElementsByTagName("pois")
        poi_nodes = pois[0].getElementsByTagName('poi') if pois else []
        for poi in poi_nodes:
            branch_type = text_of(poi, "type")
            rows.append((
                text_of(poi, "id"),
                text_of(poi, "name").replace('(', '').replace(')', ''),
                branch_type,
                branch_type.split(';')[-1],  # bank name: last ';'-separated part
                text_of(poi, "typecode"),
                text_of(poi, "pname"),
                text_of(poi, "cityname"),
                text_of(poi, "adname"),
            ))
    if not rows:
        return total_rec

    # Placeholders let the driver escape the values; the data comes from a
    # remote API and may contain quotes.
    insert_sql = (
        "insert into bankinfo(branch_id, branch_name, branch_type, bank_name, "
        "bank_type, pname, cityname, aname) "
        "values(%s, %s, %s, %s, %s, %s, %s, %s)"
    )
    connection = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', port=3306,
                                 db='icoachu', charset="utf8")
    try:
        cursor = connection.cursor()
        try:
            for row in rows:
                print(row)
                cursor.execute(insert_sql, row)
                connection.commit()
                if cursor.rowcount != 1:
                    raise Exception("數據插入失敗: %s" % (row,))
        finally:
            cursor.close()
    finally:
        connection.close()
    return total_rec
第五步，在主函數中實現處理相關調用
if __name__ == '__main__':
    # For each city code: fetch and store page 1, read the total record
    # count from it, then fetch and store the remaining pages.
    cityarr = getallcity()
    for cityId in cityarr:
        first_url = r'%scity=%s%s' % (url_header, cityId, url_end)
        if gethtml(first_url) != 0:
            print('error: fail to get xml from amap')
            continue
        total_record = int(str(parsexml()))
        # Ceiling division in integers: pages needed to hold total_record
        # records. True division here would yield a float and one page too many.
        last_page = -(-total_record // each_page_rec)
        for each_page in range(2, last_page + 1):
            print('parsing page ' + str(each_page) + ' ... ...')
            # url_end pins page=1; swap in the page being requested.
            url = first_url.replace('page=1', 'page=' + str(each_page))
            print(url)
            if gethtml(url) != 0:
                # Skip parsing: tmp.xml still holds the previous page.
                print('error: fail to get xml from amap')
                continue
            parsexml()
完整的代碼如下
# coding:utf-8
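# Goal: use the AMap (Gaode) place-search API to collect the branch
#       information of every bank in every city.
# Approach: 1. The local MySQL database stores the name and code of every city.
# 2. Read the city codes into a list.
# 3. For each code, build the URL and call the API with a GET request.
# 4. Parse the returned XML and insert the fields we need into MySQL.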
import urllib
import xml.dom.minidom as minidom
import string
import urllib.request
import pymysql
# --- Local files -----------------------------------------------------------
file_name = "result.txt"  # name of the result text file
xml_file = "tmp.xml"  # scratch file holding the most recent API response

# --- Paging ----------------------------------------------------------------
each_page_rec = 20  # records per result page (same value as offset=20 below)

# --- Request URL pieces ------------------------------------------------------
url_amap = "city="  # name of the city query parameter
# Everything that precedes the city parameter: endpoint, empty keyword, type code.
url_header = "http://restapi.amap.com/v3/place/text?&keyword=&types=160100&"
# Everything that follows the city parameter; note that it pins page=1.
url_end = (
    "&citylimit=true&&output=xml&offset=20&page=1"
    "&key=c787ae8e49424a657127c3ed64cfe053&extensions=base"
)
def gethtml(url):
    """Download ``url`` and save the raw response body to ``xml_file``.

    The body is written as bytes without decoding and replaces whatever the
    file held before.

    Args:
        url: Address passed to ``urllib.request.urlopen``.

    Returns:
        int: 0 when the body was fetched and saved, -1 when the request or
        the file write failed with an ``OSError``.
    """
    try:
        # The with-block closes the HTTP response even if read() fails.
        with urllib.request.urlopen(url) as page:
            html = page.read()
        with open(xml_file, 'wb') as xml_file_handle:
            xml_file_handle.write(html)
    except OSError as err:
        # urllib's URLError/HTTPError derive from OSError, so network
        # failures are reported here together with local file errors.
        print("IO error: " + str(err))
        return -1
    return 0
def parsexml():
    """Parse the response stored in ``xml_file`` and insert its POIs into MySQL.

    Every ``<poi>`` found under the first ``<pois>`` element of each
    ``<response>`` becomes one row of the ``bankinfo`` table. All rows of a
    call share one connection; each insert is committed on its own.

    Returns:
        The text of the ``<count>`` element (a string) when present,
        otherwise the integer 1. 1 is also returned when ``xml_file`` cannot
        be read.

    Raises:
        Exception: if an insert does not report exactly one affected row.
    """
    total_rec = 1  # fallback when the response carries no <count>

    def text_of(parent, tag):
        # Missing or empty elements have no text node; report them as ''.
        found = parent.getElementsByTagName(tag)
        if not found or found[0].firstChild is None:
            return ''
        return found[0].firstChild.nodeValue or ''

    try:
        dom = minidom.parse(xml_file)
    except OSError as err:
        print("IO error: " + str(err))
        return total_rec

    rows = []
    for node in dom.getElementsByTagName("response"):
        count_text = text_of(node, 'count')
        if count_text:
            total_rec = count_text
        pois = node.getElementsByTagName("pois")
        poi_nodes = pois[0].getElementsByTagName('poi') if pois else []
        for poi in poi_nodes:
            branch_type = text_of(poi, "type")
            rows.append((
                text_of(poi, "id"),
                text_of(poi, "name").replace('(', '').replace(')', ''),
                branch_type,
                branch_type.split(';')[-1],  # bank name: last ';'-separated part
                text_of(poi, "typecode"),
                text_of(poi, "pname"),
                text_of(poi, "cityname"),
                text_of(poi, "adname"),
            ))
    if not rows:
        return total_rec

    # Placeholders let the driver escape the values; the data comes from a
    # remote API and may contain quotes.
    insert_sql = (
        "insert into bankinfo(branch_id, branch_name, branch_type, bank_name, "
        "bank_type, pname, cityname, aname) "
        "values(%s, %s, %s, %s, %s, %s, %s, %s)"
    )
    connection = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', port=3306,
                                 db='icoachu', charset="utf8")
    try:
        cursor = connection.cursor()
        try:
            for row in rows:
                print(row)
                cursor.execute(insert_sql, row)
                connection.commit()
                if cursor.rowcount != 1:
                    raise Exception("數據插入失敗: %s" % (row,))
        finally:
            cursor.close()
    finally:
        connection.close()
    return total_rec
def frange(start, stop, step=1):
    """Yield ``start``, ``start + step``, ... while the value is below ``stop``.

    Like ``range`` with an ascending step, but the bounds and the step may be
    non-integers. Values are produced by repeated addition, so a float
    ``step`` accumulates rounding error.

    Args:
        start: First value to yield.
        stop: Exclusive upper bound.
        step: Amount added after each yielded value (default 1).

    Yields:
        Successive values ``v`` with ``v < stop``.

    Raises:
        ValueError: on the first iteration, when ``start < stop`` but
            ``step`` is zero or negative, because the sequence could never
            reach ``stop``.
    """
    if start < stop and step <= 0:
        raise ValueError("frange() step must be positive when start < stop")
    value = start
    while value < stop:
        yield value
        value += step
def getallcity():
    """Return the first column of every second-level row of the ``region`` table.

    A row is selected when its ``parent_id`` equals the ``id`` of a row whose
    own ``parent_id`` is 0. The connection and cursor are always closed,
    whether or not the query succeeds.

    Returns:
        list: ``row[0]`` of each selected row, in the order MySQL returned them.
    """
    sql = "select * from region where parent_id in (select id from region where parent_id=0)"
    connection = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', port=3306,
                                 db='icoachu', charset="utf8")
    try:
        cursor = connection.cursor()
        try:
            cursor.execute(sql)
            # NOTE(review): relies on the city code being the first column of
            # ``region`` -- confirm against the table definition.
            return [row[0] for row in cursor.fetchall()]
        finally:
            cursor.close()
    finally:
        connection.close()
if __name__ == '__main__':
    # For each city code: fetch and store page 1, read the total record
    # count from it, then fetch and store the remaining pages.
    cityarr = getallcity()
    for cityId in cityarr:
        first_url = r'%scity=%s%s' % (url_header, cityId, url_end)
        if gethtml(first_url) != 0:
            print('error: fail to get xml from amap')
            continue
        total_record = int(str(parsexml()))
        # Ceiling division in integers: pages needed to hold total_record
        # records. True division here would yield a float and one page too many.
        last_page = -(-total_record // each_page_rec)
        for each_page in range(2, last_page + 1):
            print('parsing page ' + str(each_page) + ' ... ...')
            # url_end pins page=1; swap in the page being requested.
            url = first_url.replace('page=1', 'page=' + str(each_page))
            print(url)
            if gethtml(url) != 0:
                # Skip parsing: tmp.xml still holds the previous page.
                print('error: fail to get xml from amap')
                continue
            parsexml()
數據庫中數據如下: