reading Foundations for Analytics with Python Day 7
前面已经学习了基础知识和几种数据分析师最常用文件的读写与处理。因此《Python数据分析基础》讲完这些内容后,用一章的篇幅讲了几个基于现实需求的项目,每个项目代码量都挺大的。我们就进入书的第五章“小型应用”的学习,如果在理解下面的代码过程中遇到了问题,可以和我讨论,或试着在之前的笔记中找到答案,或者上网搜索。
目錄
- 在一個文件夾中的多個文件中搜索特定內(nèi)容
- 為CSV文件中數(shù)據(jù)的任意數(shù)目分類計算統(tǒng)計量
- 為文本文件中數(shù)據(jù)的任意數(shù)目分類計算統(tǒng)計量
- csv文件轉(zhuǎn)換為字典
在一個文件夾中的多個文件中搜索特定內(nèi)容
这一章的第一个应用需求是:在有大量历史数据的时候,要找到真正需要的数据。
具體例子:
在有 300 个 Excel 工作簿和 200 个 CSV 文件的文件夹里找到需要的数据。输入:一个存放着数据表达形式相同的多个文件的文件夹;要查找的数据列表;输出文件名。
import csv
import glob
import os
import re
import sys
from datetime import date

from xlrd import open_workbook, xldate_as_tuple
def searchItemFromFolder(path_to_folder, item_numbers_to_find, output_file):
    """Search every CSV/Excel file in a folder for rows whose first-column
    item number is in item_numbers_to_find, appending the matches to a CSV.

    path_to_folder       -- folder containing the input data files
    item_numbers_to_find -- collection of item-number strings to look for
    output_file          -- path of the CSV file matching rows are appended to

    Prints counts of files scanned, lines examined and items found.
    """
    file_counter = 0
    line_counter = 0
    count_of_item_numbers = 0
    # 'a' so repeated runs accumulate; 'with' guarantees the handle is closed
    # (the original left the writer's file open forever).
    with open(output_file, 'a', newline='') as out_csv_file:
        filewriter = csv.writer(out_csv_file)
        for input_file in glob.glob(os.path.join(path_to_folder, '*.*')):
            file_counter += 1
            f_type = fileType(input_file)
            if f_type == 'csv':
                with open(input_file, 'r', newline='') as csv_in_file:
                    filereader = csv.reader(csv_in_file)
                    header = next(filereader)
                    for row in filereader:
                        row_of_output = []
                        for column in range(len(header)):
                            if column == 3:
                                # Cost column: strip '$' and thousands commas,
                                # keep only the integer part.
                                cell_value = str(row[column]).lstrip('$').replace(',', '').split('.')[0].strip()
                            else:
                                cell_value = str(row[column]).strip()
                            row_of_output.append(cell_value)
                        row_of_output.append(os.path.basename(input_file))
                        if row[0] in item_numbers_to_find:
                            filewriter.writerow(row_of_output)
                            count_of_item_numbers += 1
                        line_counter += 1
            elif f_type == 'Excel':
                workbook = open_workbook(input_file)
                for worksheet in workbook.sheets():
                    try:
                        header = worksheet.row_values(0)
                    except IndexError:
                        # Empty sheet: skip it. The original 'pass' would fall
                        # through with a stale (or undefined) header.
                        continue
                    for row in range(1, worksheet.nrows):
                        row_of_output = []
                        for column in range(len(header)):
                            if column < 3:
                                cell_value = str(worksheet.cell_value(row, column)).strip()
                            elif column == 3:
                                # Numeric cost cell: drop the float's fractional part.
                                cell_value = str(worksheet.cell_value(row, column)).split('.')[0].strip()
                            else:
                                # Date cell stored as an Excel serial number.
                                cell_value = xldate_as_tuple(worksheet.cell(row, column).value, workbook.datemode)
                                cell_value = str(date(*cell_value[0:3])).strip()
                            row_of_output.append(cell_value)
                        row_of_output.append(os.path.basename(input_file))
                        row_of_output.append(worksheet.name)  # record the sheet name as well
                        if str(worksheet.cell(row, 0).value).split('.')[0].strip() in item_numbers_to_find:
                            filewriter.writerow(row_of_output)
                            count_of_item_numbers += 1
                        line_counter += 1
    print('Number of files: {}'.format(file_counter))
    print('Number of lines: {}'.format(line_counter))
    print('Number of item numbers: {}'.format(count_of_item_numbers))
def fileType(in_file):  # classify a file by its extension
    """Return 'csv' for .csv files, 'Excel' for .xls/.xlsx files,
    and 'fileTypeNotSupport' for anything else.
    """
    if re.search(r'\.csv$', in_file) is not None:
        return 'csv'
    # The original pattern r'\.(xls)|(xlsx)$' was mis-grouped: the alternation
    # matched '.xls' ANYWHERE in the name, or a bare 'xlsx' suffix, so e.g.
    # 'data.xlsfoo' was wrongly classified as Excel. Group the suffixes so
    # only a real .xls/.xlsx extension qualifies.
    if re.search(r'\.(xls|xlsx)$', in_file) is not None:
        return 'Excel'
    return 'fileTypeNotSupport'
# Example invocation: path_to_folder, item_numbers_to_find and output_file
# must be defined by the caller. The original line used a full-width comma
# (,) between the last two arguments, which is a SyntaxError in Python.
searchItemFromFolder(path_to_folder, item_numbers_to_find, output_file)
我们之前没有接触过这么长的代码,但整体思路不难理解,很多片段是我们之前的笔记中写过的。
為CSV文件中數(shù)據(jù)的任意數(shù)目分類計算統(tǒng)計量
應(yīng)用場景:
在很多商业分析中,需要为一个特定时间段内的未知数目的分类计算统计量。举例来说,假设我们销售 5 种不同种类的产品,你想计算一下在某一年中对于所有客户的按产品种类分类的总销售额。因为客户具有不同的品味和偏好,他们在一年中购买的产品也是不同的。有些客户购买了所有 5 种产品,有些客户则只购买了一种产品。在这种购买习惯之下,与每个客户相关的产品分类数目都是不同的。
import csv
import sys
from datetime import date, datetime
#計算輸入csv文件的統(tǒng)計量
def calcStatistic(input_file, output_file):
    """Aggregate, per customer and package category, the total number of days
    spent in each category, then write the totals to a CSV file.

    input_file  -- CSV whose rows hold (name, package, ..., package date);
                   assumes rows are grouped by customer and ordered by date
                   (TODO confirm against the sample data)
    output_file -- CSV receiving one (customer, category, total days) row
    """
    packages = {}                 # {customer name: {category: total days}}
    previous_name = 'N/A'
    previous_package = 'N/A'
    previous_package_date = 'N/A'
    first_row = True              # are we still processing the input file's first data row?
    today = date.today().strftime('%m/%d/%Y')
    with open(input_file, 'r', newline='') as input_csv_file:
        filereader = csv.reader(input_csv_file)
        header = next(filereader)
        for row in filereader:
            current_name = row[0]
            current_package = row[1]
            current_package_date = row[3]
            if current_name not in packages:
                packages[current_name] = {}
            if current_package not in packages[current_name]:
                packages[current_name][current_package] = 0
            if current_name != previous_name:
                if first_row:
                    first_row = False
                else:
                    # New customer: close out the previous customer's last
                    # package by counting days from its start date to today.
                    diff = date_diff(today, previous_package_date)
                    if previous_package not in packages[previous_name]:
                        packages[previous_name][previous_package] = int(diff)
                    else:
                        packages[previous_name][previous_package] += int(diff)
            else:
                # Same customer: the previous package ran from its start date
                # to the start date of the current package.
                diff = date_diff(current_package_date, previous_package_date)
                packages[previous_name][previous_package] += int(diff)
            previous_name = current_name
            previous_package = current_package
            previous_package_date = current_package_date
    # NOTE(review): the very last customer's final package appears never to be
    # closed out after the loop — verify against the expected output.
    header = ['Customer Name', 'Category', 'Total Time (in Days)']
    with open(output_file, 'w', newline='') as output_csv_file:
        filewriter = csv.writer(output_csv_file)
        filewriter.writerow(header)
        for customer_name, customer_name_value in packages.items():
            for package_category, package_category_value in packages[customer_name].items():
                row_of_output = []
                print(customer_name, package_category, package_category_value)
                row_of_output.append(customer_name)
                row_of_output.append(package_category)
                row_of_output.append(package_category_value)
                filewriter.writerow(row_of_output)
def date_diff(date1, date2):  # timestamp helper
    """Return the number of whole days between two 'MM/DD/YYYY' date strings.

    Matches the original str(timedelta)-based behaviour: a non-zero
    difference comes back as a string (e.g. '9', '-1'), while a zero-day
    difference or an unparseable input returns the integer 0. Callers
    wrap the result in int().
    """
    try:
        delta = datetime.strptime(date1, '%m/%d/%Y') - datetime.strptime(date2, '%m/%d/%Y')
    except (ValueError, TypeError):
        # Narrowed from the original bare 'except:', which silently
        # swallowed every exception, including KeyboardInterrupt.
        return 0
    return str(delta.days) if delta.days != 0 else 0
# Example invocation: input_file and output_file must be defined first.
calcStatistic(input_file,output_file)
從文本文件中提取數(shù)據(jù)并根據(jù)這些數(shù)據(jù)計算統(tǒng)計量
文本文件也是数据的重要保存媒介。和 CSV 文件相比,文本文件有一定结构,但结构不是简单的行与列,所以处理起来相对复杂些。像日志这种文本文件经常保存着一些分散的数据,可以对其进行分析、聚集和解释,以产生一些新的知识。
具體的應(yīng)用場景:
MySQL 错误日志文件记录着错误的类型和发生时间。在它的原始状态下,我们很难看出是否某种错误比其他错误发生得更频繁,或者某种错误的频率是否随时间发生变化。通过解析这个文本文件,将相关信息聚集起来,然后以合适的形式写入一个输出文件,就可以从数据中获得知识,促使我们采取正确的行动。
对于这个应用,思路是:因为文件结构化程度不高,所以逐行读取,利用每行的标志区分是什么数据,并通过空格切分数据。还可以将输出的数据进行可视化,更形象,也能反映出一些规律。可视化会在后面的笔记中专门讲解。
def parseTextFile(input_file, output_file):
    """Parse a MySQL error log, count each '[Note]' message per day, write the
    counts as CSV rows to output_file, then plot them via visualizeData().

    Plain text I/O only — for a loosely structured log like this, heavy tools
    such as numpy/pandas are not the wiser choice.
    """
    # Nested dict: {error date: {error message: occurrences on that date}}
    messages = {}
    notes = []  # distinct messages in first-seen order (becomes the CSV header)
    with open(input_file, 'r', newline='', encoding='UTF-8') as text_file:
        for row in text_file:
            if '[Note]' in row:  # lines containing '[Note]' carry the error messages
                row_list = row.split(' ', 4)  # date, time, pid, level, message
                day = row_list[0].strip()
                note = row_list[4].strip('\n').strip()
                if note not in notes:
                    notes.append(note)
                if day not in messages:
                    messages[day] = {}
                if note not in messages[day]:  # first occurrence of this message today?
                    messages[day][note] = 1
                else:
                    messages[day][note] += 1
    n_lst = []
    # 'with' replaces the original manual open/close, so the output file is
    # closed even if a write raises.
    with open(output_file, 'w', newline='') as filewriter:
        header = ['Date']
        header.extend(notes)
        header = ','.join(map(str, header)) + '\n'
        print(header)
        filewriter.write(header)
        for day, day_value in messages.items():
            row_of_output = []
            row_of_output.append(day)
            for index in range(len(notes)):
                if notes[index] in day_value.keys():
                    row_of_output.append(day_value[notes[index]])
                else:
                    row_of_output.append(0)
            output = ','.join(map(str, row_of_output)) + '\n'
            n_lst.append(row_of_output)
            print(output)
            filewriter.write(output)
    visualizeData(n_lst)  # quick visualization of the collected counts
# Simple visualization helper
def visualizeData(nlst):  # expects a 3*n-style 2-D list
    """Plot the first three rows of nlst as labelled lines.

    Each row is [label, v1, v2, ...]: element 0 labels the line, the
    remaining elements are the y-values.
    """
    import matplotlib.pyplot as plt
    # The original body contained the Jupyter magic '%matplotlib inline',
    # which is a SyntaxError outside a notebook; it has been removed.
    plt.style.use('ggplot')
    fig = plt.figure()
    ax1 = fig.add_subplot(1, 1, 1)
    ax1.plot(nlst[0][1:], marker='o', color='blue', linestyle='-', label=nlst[0][0])
    ax1.plot(nlst[1][1:], marker='+', color='red', linestyle='--', label=nlst[1][0])
    ax1.plot(nlst[2][1:], marker='*', color='green', linestyle='-.', label=nlst[2][0])
    ax1.xaxis.set_ticks_position('bottom')
    ax1.yaxis.set_ticks_position('left')
    ax1.set_title('Six Types')
    plt.xlabel('Error Type')
    plt.ylabel('Occur Num')
    plt.legend(loc='best')
# Example run: parse the bundled MySQL error log into a CSV summary.
in_f='./mysql_server_error_log.txt'
out_f='error_log_data_out.csv'
parseTextFile(in_f,out_f) # call
輸出:
Date,InnoDB: Compressed tables use zlib 1.2.3,InnoDB: Using atomics to ref count buffer pool pages,InnoDB: 5.6.16 started; log sequence number 1234567,/usr/local/mysql/bin/mysqld: Shutdown complete,InnoDB: Completed initialization of buffer pool,InnoDB: IPv6 is available.
2014-02-03,2,2,1,1,0,0
2014-03-07,3,1,1,1,0,0
2014-10-27,0,0,1,1,2,2
原书第5章到这里就结束了,但是练习题中有个题目挺有趣,也比较有用:上面处理 MySQL 错误日志时,进行了嵌套字典到 CSV 行列数据的转换,而这个练习输出的字典就是简化的 JSON 格式。JSON 是前端用得非常广泛的数据格式,所以这里补充到这篇笔记里。题目描述:
练习使用字典来为数据分类。例如,将 Excel 工作表或 CSV 文件中的数据解析成一个字典,使输入文件中每一列都是字典中的一个键-值对。也就是说,每列标题是字典中的键,每个键对应的值就是相应列中的数据。
下面给出两种实现方式,各有优势。
# Implementation using the csv library
def rowColToDict(fname):
    """Read a CSV file and return a dict mapping each column header to the
    list of (string) values in that column.

    Prints every data row and the resulting dict, as the original did.
    The dict is now also returned (the original 'return' was commented out),
    and an empty input file yields {} instead of the original NameError on
    the never-assigned l_k.
    """
    import csv
    d_key = []   # header row (dict keys)
    d_val = []   # one value-list per column
    with open(fname, 'r', newline='') as csv_file:
        filereader = csv.reader(csv_file, delimiter=',')
        at_key = True  # still waiting for the header row
        for r_lst in filereader:
            if at_key:  # this iteration is the header row
                d_key = r_lst
                d_val = [[] for _ in range(len(d_key))]
                at_key = False
            else:
                for i in range(len(d_key)):
                    d_val[i].append(r_lst[i])
            print(r_lst)
    d_out = {d_key[i]: d_val[i] for i in range(len(d_key))}
    print('轉(zhuǎn)換后的字典:\n', d_out)
    return d_out
# Example: convert the sample supplier CSV into a column dict.
f='./supplier_data.csv'
rowColToDict(f)
輸出:
['Supplier Name', 'Invoice Number', 'Part Number', 'Cost', 'Purchase Date']
['Supplier X', '001-1001', '2341', '$500.00', '1/20/14']
['Supplier X', '001-1001', '2341', '$500.00', '1/20/14']
['Supplier X', '001-1001', '5467', '$750.00', '1/20/14']
['Supplier X', '001-1001', '5467', '$750.00', '1/20/14']
['Supplier Y', '50-9501', '7009', '$250.00', '1/30/14']
['Supplier Y', '50-9501', '7009', '$250.00', '1/30/14']
['Supplier Y', '50-9505', '6650', '$125.00', '2/3/14']
['Supplier Y', '50-9505', '6650', '$125.00', '2/3/14']
['Supplier Z', '920-4803', '3321', '$615.00', '2/3/14']
['Supplier Z', '920-4804', '3321', '$615.00', '2/10/14']
['Supplier Z', '920-4805', '3321', '$615.00', '2/17/14']
['Supplier Z', '920-4806', '3321', '$615.00', '2/24/14']
轉(zhuǎn)換后的字典:
{'Supplier Name': ['Supplier X', 'Supplier X', 'Supplier X', 'Supplier X', 'Supplier Y', 'Supplier Y', 'Supplier Y', 'Supplier Y', 'Supplier Z', 'Supplier Z', 'Supplier Z', 'Supplier Z'], 'Invoice Number': ['001-1001', '001-1001', '001-1001', '001-1001', '50-9501', '50-9501', '50-9505', '50-9505', '920-4803', '920-4804', '920-4805', '920-4806'], 'Part Number': ['2341', '2341', '5467', '5467', '7009', '7009', '6650', '6650', '3321', '3321', '3321', '3321'], 'Cost': ['$500.00', '$500.00', '$750.00', '$750.00', '$250.00', '$250.00', '$125.00', '$125.00', '$615.00', '$615.00', '$615.00', '$615.00'], 'Purchase Date': ['1/20/14', '1/20/14', '1/20/14', '1/20/14', '1/30/14', '1/30/14', '2/3/14', '2/3/14', '2/3/14', '2/10/14', '2/17/14', '2/24/14']}
用pandas庫實現(xiàn):
def rowColToDictpd(fname):  # pandas-based implementation
    """Read a CSV with pandas and return {column name: list of column values}.

    Because pandas infers dtypes when reading the CSV, numeric columns come
    back as numbers rather than strings (unlike the csv-based version).
    Prints the DataFrame and the resulting dict, as the original did; the
    dict is now also returned (the original 'return' was commented out).
    """
    import pandas as pd
    dataf = pd.read_csv(fname)
    print(dataf)
    d_pd = {}
    for col in dataf.columns:
        d_pd[col] = list(dataf.loc[:, col])
    print('轉(zhuǎn)換后字典:\n', d_pd)
    # pd.DataFrame(d_pd) converts back; modern dicts preserve insertion
    # order, so the column order survives the round trip.
    return d_pd
# Example: run the pandas-based conversion on the same sample file.
f='./supplier_data.csv'
rowColToDictpd(f)
輸出:
Supplier Name Invoice Number Part Number Cost Purchase Date
0 Supplier X 001-1001 2341 $500.00 1/20/14
1 Supplier X 001-1001 2341 $500.00 1/20/14
2 Supplier X 001-1001 5467 $750.00 1/20/14
3 Supplier X 001-1001 5467 $750.00 1/20/14
4 Supplier Y 50-9501 7009 $250.00 1/30/14
5 Supplier Y 50-9501 7009 $250.00 1/30/14
6 Supplier Y 50-9505 6650 $125.00 2/3/14
7 Supplier Y 50-9505 6650 $125.00 2/3/14
8 Supplier Z 920-4803 3321 $615.00 2/3/14
9 Supplier Z 920-4804 3321 $615.00 2/10/14
10 Supplier Z 920-4805 3321 $615.00 2/17/14
11 Supplier Z 920-4806 3321 $615.00 2/24/14
轉(zhuǎn)換后字典:
{'Supplier Name': ['Supplier X', 'Supplier X', 'Supplier X', 'Supplier X', 'Supplier Y', 'Supplier Y', 'Supplier Y', 'Supplier Y', 'Supplier Z', 'Supplier Z', 'Supplier Z', 'Supplier Z'], 'Invoice Number': ['001-1001', '001-1001', '001-1001', '001-1001', '50-9501', '50-9501', '50-9505', '50-9505', '920-4803', '920-4804', '920-4805', '920-4806'], 'Part Number': [2341, 2341, 5467, 5467, 7009, 7009, 6650, 6650, 3321, 3321, 3321, 3321], 'Cost': ['$500.00', '$500.00', '$750.00', '$750.00', '$250.00', '$250.00', '$125.00', '$125.00', '$615.00', '$615.00', '$615.00', '$615.00'], 'Purchase Date': ['1/20/14', '1/20/14', '1/20/14', '1/20/14', '1/30/14', '1/30/14', '2/3/14', '2/3/14', '2/3/14', '2/10/14', '2/17/14', '2/24/14']}
上面 4 个例子的代码都挺长,也很实用,一些相关的项目可以根据这些代码进行更改和拓展,因此理解和实践这部分代码对提高 Python 技能很有帮助。下一篇笔记会进入可视化的内容,用图表表现数据。
本篇笔记的 GitHub 同步项目为 readingForDS。关于本系列笔记有任何建议,欢迎留言讨论。