本案例是利用書中的示例
美國農業部(USDA)制作了一份有關食物營養信息的數據庫。這邊提供一個JSON格式的文件 foods-2011-10-03.json
https://github.com/re4lfl0w/ipython/blob/master/books/python_data_analysis/ch07/foods-2011-10-03.json
這邊代碼運行的環境是Jupyter Notebook with Python2.7
#coding:utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series,DataFrame
import re
import json
%matplotlib inline #在juypter notebook 內(nèi)嵌圖形
# Parse the food database into `db`; the context manager closes the file
# handle as soon as parsing is done instead of leaking it.
with open('foods-2011-10-03.json') as fh:
    db = json.load(fh)

# Notebook exploration of the raw records (uncomment to inspect):
#db[:1]
#len(db)
#type(db)
#db[0].keys()
#db[0]['nutrients'][0]
#len(db[0]['nutrients'])

# Nutrient table of the first record only. The per-group count at the end of
# this block reads `nutrients`, which was previously never defined.
nutrients = DataFrame(db[0]['nutrients'])
#nutrients

# Columns to pull out of every record for the food-level table.
info_keys = ['description', 'group', 'id', 'manufacturer', 'tags']
info = DataFrame(db, columns=info_keys)
info.drop('tags', axis=1, inplace=True)  # drop the 'tags' column

# Frequency of each value in the 'group' column. The Series method replaces
# the deprecated top-level pd.value_counts helper.
info['group'].value_counts()
nutrients['group'].value_counts()
# Build one nutrient table per record in `db`, stamping every row with the
# record's id so the rows can later be joined back on 'id'.
Nutrients = [
    DataFrame(food['nutrients']).assign(id=food['id'])
    for food in db
]
#Nutrients[:10]

# Stack the per-record tables into a single DataFrame with a fresh index.
Nutrients = pd.concat(Nutrients, ignore_index=True)
#Nutrients[:10]
len(Nutrients)

# Number of fully duplicated rows: with no subset given, all columns are
# compared.
Nutrients.duplicated().sum()
# Number of rows whose 'group' value already appeared in an earlier row.
Nutrients.duplicated(subset='group').sum()

# De-duplicated copy kept under its own name; `Nutrients` is left untouched.
# NOTE(review): the later merge still uses `Nutrients` (with duplicates) --
# confirm that is intended.
Nutrients_with_no_duplicates = Nutrients.drop_duplicates()
len(Nutrients_with_no_duplicates)
# Both frames have 'description' and 'group' columns; rename them on each
# side so the two meanings stay distinguishable after the merge.
col_mapping = dict(description='food', group='fgroup')
info = info.rename(columns=col_mapping, copy=False)

col_mapping = dict(description='nutrients', group='nutgroup')
Nutrients = Nutrients.rename(columns=col_mapping, copy=False)

# Outer join of the nutrient rows with the food info on the shared 'id'.
ndata = Nutrients.merge(info, how='outer', on='id')

# Look at the row at integer position 3000.
ndata.iloc[3000]
# Median (0.5 quantile) of 'value' for every (nutrient, food group) pair.
values_by_nutrient_and_group = ndata.groupby(['nutrients', 'fgroup'])['value']
result = values_by_nutrient_and_group.quantile(0.5)
#result.head()

# Horizontal bar chart of the zinc medians per food group, sorted ascending.
zinc_medians = result['Zinc, Zn'].sort_values()
zinc_medians.plot(kind='barh')
# For every (nutrient group, nutrient) pair, find the food with the largest
# 'value'.
by_nutrients = ndata.groupby(['nutgroup', 'nutrients'])


def get_maximum(x):
    """Return the row of `x` whose 'value' entry is the largest."""
    return x.xs(x['value'].idxmax())


def get_minimum(x):
    """Return the row of `x` whose 'value' entry is the smallest."""
    return x.xs(x['value'].idxmin())


top_rows = by_nutrients.apply(get_maximum)
max_food = top_rows[['value', 'food']]

# Keep only the first 10 characters of each food name.
max_food['food'] = max_food['food'].str[:10]

# Food names for the nutrients in the 'Amino Acids' group.
amino_acid_rows = max_food.loc['Amino Acids']
amino_acid_rows['food']