pip install xlrd 可以用來(lái)讀取excel
import pandas as pd
import datetime
import matplotlib
import matplotlib.pyplot as plt
import os? ? ? #os 適用于windows的路徑镊尺,包括路徑拼接等
cur_dir = os.path.dirname(__file__)
# __file__? 為當(dāng)前.py file 的位置
df = pd.read_csv(os.path.join(cur_dir,"employees.csv"))
#print(cur_dir)
#print(df)
#
# cur_dir is C:/Users/T596494/FZPython/Insideout2_dist
# os.path.join(cur_dir,"employees.csv") is C:/Users/T596494/FZPython/Insideout2_dist\employees.csv
# it can automatically add \
# when print df, it will print the file details
# df will show up the whole document's details - "employees.csv"
print(df.shape)
# shape will show the number of rows and columns of the data. - (1000, 4) 1000 rows, 4 columns 不含title的數(shù)據(jù)行數(shù)
# print(df.dtypes)
# dtypes check each columns' data type
# employee_id? ? ? ? int64
# annual_inc? ? ? ? float64
# employee_title? ? object
# home_ownership? ? object
# dtype: object
print(df.columns)
# Index(['employee_id', 'annual_inc', 'employee_title', 'home_ownership'], dtype='object')
# show each columns' title
print(df.head())
#df.head() 可以顯示前面幾行的數(shù)據(jù)互躬,如果在括號(hào)里面輸入3,print(df.head(3)) 則顯示三行數(shù)據(jù)(包括title則為4行)
print(df.sample(10))
#隨機(jī)顯示10行數(shù)據(jù)蕊肥,若不寫(xiě),則只顯示1行
print(df.tail(5))
#顯示最后5行的數(shù)據(jù)
#把title為annual_inc的列儲(chǔ)存到annual_incs的變量中债蜜。annual_incs的屬性同df? ? ?
#Thisis known as a Series.
annual_incs = df['annual_inc']
print(annual_incs)
print(type(annual_incs))
print(annual_incs.head())
print(annual_incs.shape)
#將第二行的數(shù)據(jù)存到row_2晴埂,并且將相應(yīng)數(shù)據(jù)顯示出來(lái)
row_2 = df.iloc[1]
print(row_2)
emp_id = row_2['employee_id']
emp_title = row_2['employee_title']
print('The employee with ID {emp_id} has the job title {emp_title}.' .format(emp_id = emp_id,emp_title = emp_title))
#篩選
Filtering by ONE condition, numeric
To perform filtering by numeric values, first check the column is a numeric column (int64 or float64).
Then use the following formula to do so:
df[df['column_name']<conditional operator><value>]
#篩選 df中年收入大于 300000的所有行
print(df[df['annual_inc'] >=300000])
#對(duì)字符串類型的數(shù)據(jù)的篩選
#選出所有home_ownership 是OWN的數(shù)據(jù)
print(df[df['home_ownership'] =='OWN'].head())
#選出所有employee_title 是‘Accountant'或者'Sales'的數(shù)據(jù)
print(df[df['employee_title'].isin(['Accountant','Sales'])])
#多條件篩選
#選出employee_title 是President, 并且annual_inc 超過(guò)225000的數(shù)據(jù)
print(df[ (df['employee_title'] =='President') & (df['annual_inc'] >=225000)])
#選出employee_title是Sales寻定, 并且annual_inc 超過(guò)100000 或者小于 80000
print(df[(df['employee_title']=='Sales') & ((df['annual_inc']>=100000) | (df['annual_inc']<=80000))])
#保存儒洛,用df的id對(duì)df2的id做vlookup,并且保存到output.csv
df2 = pd.read_csv(os.path.join(cur_dir,"employees2.csv"))
output = df[df['employee_id'].isin(df2['employee_id'])]
output.to_csv("output.csv",index=False)
#顯示文件的信息
print(df.info())
#Draw graph with matplotlib
df.groupby('home_ownership')['home_ownership'].value_counts().unstack().plot(kind='bar',figsize=(10,5))
plt.show()