1、抓取省級地址
區劃和城鄉劃分的最新數據為2019年的,點擊上方鏈接即可查看2019年相關數據。分析網頁可以看出,各省的鏈接和文本信息都存放在如下的標簽中。
<a href="11.html">北京市<br></a>
由于國家統計局的網址結構比較簡單,因此可以直接使用正則表達式提取
pattern = re.compile("<a href='(.*?)'>(.*?)<")
具體地,抓取31省數據代碼如下所示。由于后面抓取五級數據時需要頻繁訪問服務器,因此多準備幾個請求頭。另外,url為上方的鏈接,為了避免亂碼設置一下response的編碼。
import requests
import re
import random
import time
import os
import pandas as pd
# Request-header setup.
# Pool of User-Agent strings rotated across requests. It is built once at
# import time instead of on every call, and the entry that was listed twice
# has been removed so every agent is equally likely to be picked.
_USER_AGENTS = (
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    "Openwave/ UCWEB7.0.2.37/28/999",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
    "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
)


def get_headers():
    """Build the HTTP request headers with a randomly chosen User-Agent.

    Returns:
        dict: A fresh dict on every call, so a caller may overwrite an entry
        (for example ``Referer``) without affecting later requests.
    """
    return {
        'Cookie': '_trs_uv=kfp3v12j_6_8t0e; SF_cookie_1=37059734; _trs_ua_s_1=kfxdjigi_6_4w48',
        'Host': 'www.stats.gov.cn',
        'Referer': 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/',
        'User-Agent': random.choice(_USER_AGENTS),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    }
# Fetch the province-level entries (31 per the original note).
def get_province(timeout=30):
    """Scrape the 2019 index page and return the province links.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        list[tuple[str, str]]: Unique ``(href, text)`` pairs captured by the
        anchor regex, in the order they appear on the page.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success status code (via ``raise_for_status``).
    """
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html'
    # The context manager releases the connection even when an error is raised.
    with requests.get(url, headers=get_headers(), timeout=timeout) as response:
        response.raise_for_status()
        # Let requests guess the encoding from the body to avoid mojibake.
        response.encoding = response.apparent_encoding
        html = response.text
    pattern = re.compile("<a href='(.*?)'>(.*?)<")
    # dict.fromkeys de-duplicates like set() did, but keeps page order so the
    # result is the same from one run to the next.
    return list(dict.fromkeys(pattern.findall(html)))
# Persist the scraped provinces to a CSV file.
def write_province():
    """Scrape the province list and save it to '省.csv'.

    Each row holds the two fields of one scraped entry (href, then text).
    The DataFrame index is not written.
    """
    rows = [[entry[0], entry[1]] for entry in get_province()]
    frame = pd.DataFrame(rows)
    frame.to_csv('省.csv', index=False)
2、抓取市級地址
分析河北省的網址可以發現,url由 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html 變為 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/13.html ,即后綴改為“13.html”,其中“13”為之前抓取的省級標簽中的數據,如:
<a href="13.html">河北省<br></a>
。市級數據的存放與省級數據的存放存在一定的差異,市級數據依然存放在如下的標簽中,不同之處在于,如果使用抓取省級數據的正則表達式來抓取市級數據,最終的結果會多了個地址編碼:“130100000000”。實際的處理也很簡單,將抓取的結果中的地址編碼刪除即可。因為地址編碼為純數字,容易刪除。
<a href="13/1301.html">130100000000</a>
<a href="13/1301.html">石家莊市</a>
為了保證爬取的質量,筆者實際爬取一級數據之后立即進行保存,保存的文件中包含鏈接和文本,如河北的數據保存為:['13.html', '河北省']。爬取市級數據時只需適當修改一下url以及請求頭中的referer參數,具體代碼如下所示。
# Scrape the provinces (31 per the original note) and save them, then reload
# the saved CSV as an array of rows for the city-level crawl.
write_province()
province = pd.read_csv('省.csv').values
# Fetch the city-level entries (342 in total per the original note).
def get_city(province_code, timeout=30):
    """Scrape one province page and return its city links.

    Args:
        province_code: Page name of the province as scraped from the index
            page (for example ``'13.html'``); appended to the 2019 base URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list[tuple[str, str]]: Unique ``(href, name)`` pairs in page order.
        Anchors whose text is purely numeric (the address code that links to
        the same page) are dropped.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success status code.
    """
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/' + province_code
    headers = get_headers()
    headers['Referer'] = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html'
    # The context manager releases the connection even when an error is raised.
    with requests.get(url, headers=headers, timeout=timeout) as response:
        response.raise_for_status()
        response.encoding = 'gbk'
        html = response.text
    pattern = re.compile("<a href='(.*?)'>(.*?)<")
    # dict.fromkeys de-duplicates while keeping page order (set() did not).
    links = dict.fromkeys(pattern.findall(html))
    # Drop only the all-digit address codes. The previous test ("contains the
    # character 0") would also discard any real name that includes a zero.
    return [link for link in links if not link[1].isdigit()]
def write_city():
    """Scrape the cities of every province and save them to '市.csv'.

    Iterates the module-level ``province`` rows (province href, province
    name), pauses a random sub-second interval between provinces, and writes
    one row per city: province href, province name, city href, city name.

    Raises:
        requests.RequestException: Propagated from ``get_city``; there is no
            retry at this level.
    """
    rows = []
    for prov in province:
        cities = get_city(prov[0])
        print('正在抓取:' , prov[1], '共{}個(gè)城市'.format(len(cities)))
        time.sleep(random.random())
        rows.extend([prov[0], prov[1], item[0], item[1]] for item in cities)
    pd.DataFrame(rows).to_csv('市.csv', index=False)
    # The original ended with `return Non`, which raised NameError after the
    # file had been written; falling off the end returns None as intended.
3、抓取三級、四級(區縣、街道)地址
三級、四級地址的抓取方式與市級地址的抓取類似,后面的代碼幾乎等于復制前面的代碼,不同之處在于url與referer的構造,三級、四級地址的抓取代碼如下所示。
# Fetch the district/county-level entries (3068 in total per the original note).
def get_district(city_code, timeout=30):
    """Scrape one city page and return its district links.

    Args:
        city_code: Relative href of the city page as scraped from the
            province page (for example ``'13/1301.html'``). The part before
            the first ``/`` is reused to build the Referer header.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list[tuple[str, str]]: Unique ``(href, name)`` pairs in page order,
        with the purely numeric address-code anchors removed.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success status code.
    """
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/' + city_code
    headers = get_headers()
    headers['Referer'] = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/{}.html'.format(city_code.split('/')[0])
    # The context manager releases the connection even when an error is raised.
    with requests.get(url, headers=headers, timeout=timeout) as response:
        response.raise_for_status()
        response.encoding = 'gbk'
        html = response.text
    pattern = re.compile("<a href='(.*?)'>(.*?)<")
    # dict.fromkeys de-duplicates while keeping page order (set() did not).
    links = dict.fromkeys(pattern.findall(html))
    # Drop only the all-digit address codes. The previous test ("contains the
    # character 0") would also discard any real name that includes a zero.
    return [link for link in links if not link[1].isdigit()]
def write_district():
    """Scrape the districts of every city and save them to '區(qū).csv'.

    Reads the module-level ``city`` rows (province href, province name, city
    href, city name). For each city it fetches the district links, logs the
    count, sleeps a random sub-second interval, and appends one row per
    district consisting of the four city fields plus the district href and
    name. Every appended row is echoed to stdout.
    """
    rows = []
    for record in city:
        found = get_district(record[2])
        print('正在抓嚷鸺帷:', record[1], record[3], '共{}個(gè)區(qū)'.format(len(found)))
        time.sleep(random.random())
        for entry in found:
            rows.append([record[0], record[1], record[2], record[3], entry[0], entry[1]])
            print(rows[-1], '\n')
    frame = pd.DataFrame(rows)
    frame.to_csv('區(qū).csv', index=False)
# Fetch the street/township-level entries (43027 in total per the original note).
def get_road(province_code, city_code, district_code, timeout=30):
    """Scrape one district page and return its street links.

    Args:
        province_code: Province page name (for example ``'13.html'``); the
            part before the first ``.`` becomes the first URL path segment.
        city_code: Relative href of the parent city page; used only for the
            Referer header.
        district_code: Href of the district page as scraped from the city
            page; appended after the province segment.
            NOTE(review): presumably relative to the province directory
            (for example ``'01/130102.html'``) -- confirm against real data.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list[tuple[str, str]]: Unique ``(href, name)`` pairs in page order,
        with the purely numeric address-code anchors removed.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success status code.
    """
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/' + province_code.split('.')[0] + '/' + district_code
    headers = get_headers()
    headers['Referer'] = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/' + city_code
    # The context manager releases the connection even when an error is raised.
    with requests.get(url, headers=headers, timeout=timeout) as response:
        response.raise_for_status()
        response.encoding = 'gbk'
        html = response.text
    pattern = re.compile("<a href='(.*?)'>(.*?)<")
    # dict.fromkeys de-duplicates while keeping page order (set() did not).
    links = dict.fromkeys(pattern.findall(html))
    # Drop only the all-digit address codes. The previous test ("contains the
    # character 0") would also discard any real name that includes a zero.
    return [link for link in links if not link[1].isdigit()]
def write_road():
    """Scrape the streets of every district and save them to '路.csv'.

    Iterates the module-level ``district`` rows (province href/name, city
    href/name, district href/name). A failed request is retried until it
    succeeds. Each output row is the six district fields plus the street
    href and name, and every appended row is echoed to stdout.

    Only ``requests.RequestException`` is retried. Any other exception (for
    example a malformed row) propagates instead of being retried forever, as
    happened under the previous blanket ``except Exception``.
    """
    rows = []
    for rec in district:
        attempt = 0
        while True:
            try:
                streets = get_road(rec[0], rec[2], rec[4])
            except requests.RequestException as exc:
                attempt += 1
                print(exc)
                print(rec[1], rec[3], rec[5], '爬取失敗,重新爬取')
                # Back off (capped, jittered) before retrying instead of
                # hitting the server again immediately.
                time.sleep(min(2 ** attempt, 30) * random.random())
            else:
                print(rec[1], rec[3], rec[5], '爬取成功祈远,共{}個(gè)街道'.format(len(streets)))
                time.sleep(random.random() / 2)
                break
        for item in streets:
            rows.append([rec[0], rec[1], rec[2], rec[3], rec[4], rec[5], item[0], item[1]])
            print(rows[-1], '\n')
    pd.DataFrame(rows).to_csv('路.csv', index=False)
# Scrape the city level (342 entries per the original note) and reload the
# saved CSV as an array of rows for the next level.
write_city()
city = pd.read_csv('市.csv').values
# Scrape the district/county level (3068 entries per the original note).
write_district()
district = pd.read_csv('區(qū).csv').values
# Scrape the street level (43027 entries per the original note); the result
# is kept as a DataFrame because it is filtered per province further below.
write_road()
df = pd.read_csv('路.csv')
4、抓取五級地址
抓取五級地址則略有不同,不同之處有兩點。
- 五級地址所在的標簽有所變化
- 五級地址數量較大,需要加入一定的優化手段
五級地址的標簽多了兩個地址編碼,且標簽類型有所改變,實際爬取中適當修改正則表達式,并在結果中將地址編碼剔除即可
<td>130202002001</td>
<td>111</td>
<td>友誼里社區居委會</td>
此外,五級地址的抓取,增加了try,except機制,當某條數據抓取失敗時重新抓取該條數據,直至抓取成功。同時,為了保證抓取的穩定性,筆者采取逐省抓取、立即保存的方式抓取,最終利用pandas將數據合并。具體地,五級地址抓取代碼如下所示。
# Fetch the level-five (community / village committee) entries
# (656781 in total per the original note).
def get_community(province_code, district_code, road_code, timeout=30):
    """Scrape one street page and return the names of its communities.

    Args:
        province_code: Province page name (for example ``'13.html'``); the
            part before the first ``.`` becomes the first URL path segment.
        district_code: Href of the parent district page; the part before the
            first ``/`` becomes the second URL path segment, and the whole
            value is used in the Referer header.
        road_code: Href of the street page; appended as the final URL part.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list[str]: Unique non-numeric ``<td>`` cell texts in page order, with
        the header cell removed.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success status code.
        ValueError: If the header cell is not present among the cells (for
            example when the server returned an unexpected page).
    """
    base = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/'
    province_dir = province_code.split('.')[0]
    url = base + province_dir + '/' + district_code.split('/')[0] + '/' + road_code
    headers = get_headers()
    headers['Referer'] = base + province_dir + '/' + district_code
    # The context manager releases the connection even when an error is raised.
    with requests.get(url, headers=headers, timeout=timeout) as response:
        response.raise_for_status()
        response.encoding = 'gbk'
        html = response.text
    # dict.fromkeys de-duplicates while keeping page order (set() did not).
    cells = dict.fromkeys(re.findall('<td>(.*?)</td>', html))
    # Raw string: the non-raw '^\d*$' relied on an invalid escape sequence.
    numeric = re.compile(r'^\d*$')
    names = [cell for cell in cells if not numeric.match(cell)]
    # NOTE(review): this literal must equal the header cell exactly as the
    # site serves it -- verify the character variant against a live page.
    names.remove('名稱')
    return names
def write_community(filename):
    """Scrape the communities of every street in ``road`` and save them.

    Iterates the module-level ``road`` rows (province, city, district and
    street href/name pairs). A failed fetch is retried until it succeeds.
    Each output row holds the province, city, district and street names plus
    one community name.

    Args:
        filename: Path of the CSV file to write.

    Retries are limited to ``requests.RequestException`` and the
    ``ValueError`` that ``get_community`` raises when the header cell is
    missing. Any other exception propagates instead of being retried
    forever, as happened under the previous blanket ``except Exception``.
    """
    rows = []
    for rec in road:
        attempt = 0
        while True:
            try:
                community = get_community(rec[0], rec[4], rec[6])
            except (requests.RequestException, ValueError) as exc:
                attempt += 1
                print(exc)
                print(rec[1], rec[3], rec[5], rec[7], '\t------>爬取失敗,重新爬取')
                # Back off (capped, jittered) before retrying instead of
                # hitting the server again immediately.
                time.sleep(min(2 ** attempt, 30) * random.random())
            else:
                print(rec[1], rec[3], rec[5], rec[7], '\t------>爬取成功我磁,共{}個(gè)村委會(huì)'.format(len(community)))
                time.sleep(random.random() / 4)
                break
        rows.extend([rec[1], rec[3], rec[5], rec[7], name] for name in community)
    pd.DataFrame(rows).to_csv(filename, index=False)
# Merge the per-province level-five address files.
def merge(directory='address/'):
    """Concatenate every CSV file in ``directory`` into one DataFrame.

    Args:
        directory: Folder holding the per-province CSV files. Defaults to
            ``'address/'``, the location the original code hard-coded.

    Returns:
        pandas.DataFrame: All rows from all CSV files, read in sorted
        file-name order, with the positional column labels ``'0'``..``'4'``
        renamed to the five address-level labels. An empty DataFrame is
        returned when the folder contains no CSV file.
    """
    # Sort for a deterministic row order, and skip non-CSV entries, which
    # would otherwise be handed to read_csv.
    names = sorted(n for n in os.listdir(directory) if n.lower().endswith('.csv'))
    frames = [pd.read_csv(os.path.join(directory, n)) for n in names]
    # One concat replaces the per-file DataFrame.append, which was removed in
    # pandas 2.0 and re-copied the accumulated data on every iteration.
    data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    data.rename(columns={'0':'一級(jí)', '1':'二級(jí)', '2':'三級(jí)', '3':'四級(jí)', '4':'五級(jí)', }, inplace=True)
    return data
# Scrape the level-five addresses one province at a time
# (656781 entries in total per the original note).
# merge() reads its input from the 'address/' folder, so the per-province
# files must be written there. Previously they were written to the working
# directory and merge() could not find them.
os.makedirs('address', exist_ok=True)
lis = df['1'].unique()
for i in lis:
    # write_community() reads the module-level `road`: the street rows of the
    # province currently being processed.
    road = df[df['1']==i].values
    write_community(os.path.join('address', i + '.csv'))
# Merge the per-province files into a single table and save it.
address = merge()
address.to_csv('address.csv', index=0)
address.head()
5、數據展示
完整數(shù)據(jù):https://pan.baidu.com/s/1BAkVbjkJHipEArIrE7Ntwg
提取碼:z6dx