世界高峰數據可視化 (World's Highest Mountains)
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
%matplotlib inline
# Use the ggplot theme for every figure.
style.use('ggplot')
# Make matplotlib display Chinese text: use SimHei as the default sans-serif
# font, and keep the minus sign '-' from being drawn as a box in saved images.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Location of the mountains CSV, relative to the working directory.
dataset_path = './dataset/Mountains.csv'
定義一會要用的一些函數
def preview_data(data):
    """
    Print a quick overview of a DataFrame.

    Shows the first rows via ``head()``, followed by the column / dtype
    summary produced by ``info()``.

    Args:
        data: The pandas DataFrame to inspect.
    """
    # First few rows.
    print(data.head())
    # info() writes its summary itself and returns None, so it must not be
    # wrapped in print() -- that would emit a stray "None" line.
    data.info()
def proc_success(val):
    """
    Normalize a single value from the 'Ascents bef. 2004' column.

    Values whose text contains '>' (e.g. '>>145') are replaced by 200, and
    values whose text contains 'Many' are replaced by 160. Any other value
    is returned unchanged, so callers still have to handle missing values
    and convert the result to a numeric type themselves.

    Args:
        val: The raw cell value (string, number or missing value).

    Returns:
        200, 160, or ``val`` itself.
    """
    # Stringify once so non-string cells (numbers, NaN) can be searched too.
    text = str(val)
    if '>' in text:
        return 200
    if 'Many' in text:
        return 160
    return val
# Load the mountains dataset from the CSV file and print a quick overview of it.
data = pd.read_csv(dataset_path)
preview_data(data)
Rank Mountain Height (m) Height (ft) \
0 1 Mount Everest / Sagarmatha / Chomolungma 8848 29029
1 2 K2 / Qogir / Godwin Austen 8611 28251
2 3 Kangchenjunga 8586 28169
3 4 Lhotse 8516 27940
4 5 Makalu 8485 27838
Prominence (m) Range Coordinates \
0 8848 Mahalangur Himalaya 27°59′17″N 86°55′31″E
1 4017 Baltoro Karakoram 35°52′53″N 76°30′48″E
2 3922 Kangchenjunga Himalaya 27°42′12″N 88°08′51″E
3 610 Mahalangur Himalaya 27°57′42″N 86°55′59″E
4 2386 Mahalangur Himalaya 27°53′23″N 87°05′20″E
Parent mountain First ascent Ascents bef. 2004 Failed attempts bef. 2004
0 NaN 1953 >>145 121.0
1 Mount Everest 1954 45 44.0
2 Mount Everest 1955 38 24.0
3 Mount Everest 1956 26 26.0
4 Mount Everest 1955 45 52.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118 entries, 0 to 117
Data columns (total 11 columns):
Rank 118 non-null int64
Mountain 118 non-null object
Height (m) 118 non-null int64
Height (ft) 118 non-null int64
Prominence (m) 118 non-null int64
Range 118 non-null object
Coordinates 118 non-null object
Parent mountain 117 non-null object
First ascent 118 non-null object
Ascents bef. 2004 116 non-null object
Failed attempts bef. 2004 115 non-null float64
dtypes: float64(1), int64(4), object(6)
memory usage: 10.2+ KB
None
數據重構
重命名列名
# Shorten the verbose column headers used by the rest of the analysis.
short_names = {
    'Height (m)': 'Height',
    'Ascents bef. 2004': 'Success',
    'Failed attempts bef. 2004': 'Failed',
}
data.rename(columns=short_names, inplace=True)
數據清洗
# Missing failure counts are treated as zero.
data['Failed'] = data['Failed'].fillna(0).astype(int)
# Map textual ascent counts to numbers first, then treat missing ones as zero.
data['Success'] = data['Success'].apply(proc_success)
data['Success'] = data['Success'].fillna(0).astype(int)
# Drop peaks that have never been climbed. Take an explicit copy so that the
# column assignment below modifies an independent frame instead of a slice of
# the original (this avoids pandas' SettingWithCopyWarning).
data = data[data['First ascent'] != 'unclimbed'].copy()
data['First ascent'] = data['First ascent'].astype(int)
可視化數據
1. 登頂次數 vs 年份
# Histogram of first-ascent years, split into 20 bins.
plt.hist(data['First ascent'].astype(int), bins=20)
# The label strings previously contained stray "(shù)" conversion artifacts
# that would have been drawn verbatim on the figure.
plt.ylabel('高峰數量')
plt.xlabel('年份')
plt.title('登頂次數')
# Save before show(): the figure has to be written to disk while it is still current.
plt.savefig('./first_ascent_vs_year.png')
plt.show()
首次登頂
# Total number of climb attempts per peak (failed + successful).
data['Attempts'] = data['Failed'] + data['Success']

fig = plt.figure(figsize=(13, 7))
first_ascent = data['First ascent']
attempts = data['Attempts']

# Upper panel: height against first-ascent year, coloured by attempt count.
ax_height = fig.add_subplot(211)
ax_height.scatter(first_ascent, data['Height'], c=attempts, alpha=0.8, s=50)
ax_height.set_ylabel('海拔')
ax_height.set_xlabel('登頂')

# Lower panel: inverted rank (max rank minus rank) against first-ascent year,
# coloured the same way.
ax_rank = fig.add_subplot(212)
inverted_rank = data['Rank'].max() - data['Rank']
ax_rank.scatter(first_ascent, inverted_rank, c=attempts, alpha=0.8, s=50)
ax_rank.set_ylabel('排名')
ax_rank.set_xlabel('登頂')

plt.savefig('./mountain_vs_attempts.png')
plt.show()