%pwd#顯示路徑
'/Users/zhongyaode/pythonbook'
#讀取文件中的第一行數(shù)據(jù)
path = '/Users/zhongyaode/pythonbook/ch02/usagov_bitly_data2012-03-16-1331923249.txt'
open(path).readline()
'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'
#將json字符串轉(zhuǎn)換成python字典對象。用json模塊及其loads函數(shù)逐行加載已經(jīng)下載好的數(shù)據(jù)文件
import json
path = '/Users/zhongyaode/pythonbook/ch02/usagov_bitly_data2012-03-16-1331923249.txt'
records = [json.loads(line) for line in open(path)]#列表推導式，這是一種在一組字符串(或一組其他對象)上執(zhí)行一條相同操作(如json.loads)
#的簡潔方式，在一個打開的文件句柄上進行迭代即可獲得一個由行組成的序列，現(xiàn)在，records對象就成為一組Python字典了
import json
path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'
records = [json.loads(line) for line in open(path)]
records[0]
{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
'al': 'en-US,en;q=0.8',
'c': 'US',
'cy': 'Danvers',
'g': 'A6qOVH',
'gr': 'MA',
'h': 'wfLQtf',
'hc': 1331822918,
'hh': '1.usa.gov',
'l': 'orofrog',
'll': [42.576698, -70.954903],
'nk': 1,
'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
't': 1331923247,
'tz': 'America/New_York',
'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}
#現(xiàn)在只要以字符串形式給出想要訪問的鍵就可以得到當前記錄中相應的值
records[0]['u']
'http://www.ncbi.nlm.nih.gov/pubmed/22415991'
records[2]['u']
'http://boxer.senate.gov/en/press/releases/031612.cfm'
records[0]['tz']
'America/New_York'
print(records[0]['tz'])
America/New_York
#得到數(shù)據(jù)集中最常出現(xiàn)的是哪個時區(qū)(即tz字段)，有很多方法，下面用列表推導式取出一組時區(qū)
time_zones=[rec['tz']for rec in records]
#不是所有記錄都有時區(qū)字段，只需在列表推導式末尾加上一個 if 'tz' in rec 判斷即可
time_zones=[rec['tz'] for rec in records if 'tz' in rec]
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
time_zones[:10]#只看前十個時區(qū)
['America/New_York',
'America/Denver',
'America/New_York',
'America/Sao_Paulo',
'America/New_York',
'America/New_York',
'Europe/Warsaw',
'',
'',
'']
#對時區(qū)進行計數(shù),計數(shù)辦法之一是在遍歷時區(qū)的過程中將計數(shù)值保存在字典中
def get_counts(sequence)
counts={}
for x in sequence:
if x in counts:
counts[x] +=1
else:
counts[x] =1
return counts
File "<ipython-input-43-62431215ac18>", line 2
def get_counts(sequence)
^
SyntaxError: invalid syntax
def get_counts(sequence):
counts = {}
for x in sequence:
if x in counts:
counts[x] += 1
else:
counts[x] = 1
return counts
from collections import defaultdict
def get_counts2(sequence):
counts = defaultdict(int) # values will initialize to 0
for x in sequence:
counts[x] += 1
return counts
#對時區(qū)進行計數(shù)，只需將time_zones傳入即可:
counts = get_counts(time_zones)
counts['America/New_York']
1251
len(time_zones)
3440
#想獲得前10位的時區(qū)及其計數(shù)值，需要用到一些有關(guān)字典的處理技巧:
def top_counts(count_dict, n=10):
    """Return the *n* largest (count, key) pairs, in ascending count order.

    Pairs are ordered as (count, key) so that the default tuple sort
    ranks by count first; the last *n* entries are the most frequent.
    """
    ranked = sorted((count, key) for key, count in count_dict.items())
    return ranked[-n:]
top_counts(counts)
[(33, 'America/Sao_Paulo'),
(35, 'Europe/Madrid'),
(36, 'Pacific/Honolulu'),
(37, 'Asia/Tokyo'),
(74, 'Europe/London'),
(191, 'America/Denver'),
(382, 'America/Los_Angeles'),
(400, 'America/Chicago'),
(521, ''),
(1251, 'America/New_York')]
from collections import Counter
counts=Counter(time_zones)
counts.most_common(10)
[('America/New_York', 1251),
('', 521),
('America/Chicago', 400),
('America/Los_Angeles', 382),
('America/Denver', 191),
('Europe/London', 74),
('Asia/Tokyo', 37),
('Pacific/Honolulu', 36),
('Europe/Madrid', 35),
('America/Sao_Paulo', 33)]
#用pandas對時區(qū)進行計數(shù)；DataFrame是pandas中最重要的數(shù)據(jù)結(jié)構(gòu)，它用于表示一個表格
from pandas import DataFrame,Series
import pandas as pd;import numpy as np
frame=DataFrame(records)
frame
frame['tz'][:10]
0 America/New_York
1 America/Denver
2 America/New_York
3 America/Sao_Paulo
4 America/New_York
5 America/New_York
6 Europe/Warsaw
7
8
9
Name: tz, dtype: object
#frame['tz']所返回的Series對象有一個value_counts方法，該方法可以讓我們得到所需的信息
tz_counts=frame['tz'].value_counts()
tz_counts[:10]
clean_tz=frame['tz'].fillna('Missing')
clean_tz[clean_tz=='']='Unknown'
tz_counts=clean_tz.value_counts()
tz_counts[:10]
America/New_York 1251
Unknown 521
America/Chicago 400
America/Los_Angeles 382
America/Denver 191
Missing 120
Europe/London 74
Asia/Tokyo 37
Pacific/Honolulu 36
Europe/Madrid 35
Name: tz, dtype: int64
#利用counts對象的plot方法可得到一張水平條形圖
tz_counts[:10].plot(kind='barh', rot=0)
<matplotlib.axes._subplots.AxesSubplot at 0x10b5b67f0>
frame['a'][1]
'GoogleMaps/RochesterNY'
frame['a'][50]
'Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'
frame['a'][51]
'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P925/V10e Build/FRG83G) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'
results = Series([x.split()[0] for x in frame.a.dropna()])
results[:5]
results.value_counts()[:8]
cframe=frame[frame.a.notnull()]
operating_system=np.where(cframe['a'].str.contains('Windows'),
'Windows','Not Windows')
operating_system[:5]
array(['Windows', 'Not Windows', 'Windows', 'Not Windows', 'Windows'],
dtype='<U11')
by_tz_os=cframe.groupby(['tz',operating_system])
agg_counts=by_tz_os.size().unstack().fillna(0)
agg_counts[:10]
#選出最常出現(xiàn)的時區(qū),為了達到這個目的日熬,根據(jù)agg_counts中的行數(shù)構(gòu)造了一個間接索引數(shù)組
indexer=agg_counts.sum(1).argsort(0)
indexer[:10]
#通過take按照這個順序截取了最后10行
count_subset=agg_counts.take(indexer)[-10:]
count_subset
count_subset.plot(kind='barh', stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x10de0aac8>
normed_subset = count_subset.div(count_subset.sum(1), axis=0)
normed_subset.plot(kind='barh', stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x10dee09e8>