from pandas import Series, DataFrame
d = {'name':Series(['Braud','Cummings','Heikkinen','Allen'], index = ['a', 'b', 'c', 'd']),\
'age': Series([22, 38, 26, 35], index = ['a', 'b', 'c', 'd']),\
'fare': Series([7.25, 71.83, 8.05], index = ['a', 'b', 'd']),\
'survived?': Series([False, True, True, False], index = ['a', 'b', 'c', 'd'])}
df = DataFrame(d)
df['name']
Out[12]:
a Braud
b Cummings
c Heikkinen
d Allen
Name: name, dtype: object
df.loc['a']
Out[14]:
age 22
fare 7.25
name Braud
survived? False
Name: a, dtype: object
df[df['age'] >= 30]
Out[15]:
age fare name survived?
b 38 71.83 Cummings True
d 35 8.05 Allen False
d2 = {'one': Series([1,2,3], index = ['a', 'b', 'c']),\
'two': Series([1,2,3,4], index = ['a', 'b', 'c', 'd'])}
df2 = DataFrame(d2)
df2.apply(numpy.mean)
Out[17]:
one 2.0
two 2.5
dtype: float64
df2['one'].map(lambda x: x>1)
Out[18]:
a False
b True
c True
d False
Name: one, dtype: bool
df2.applymap(lambda x: x>1)
Out[19]:
one two
a False False
b True True
c True True
d False True
Difference between Python and numpy(Numerical Python)
python2 使用 & |
python3 使用and or
import pandas as pd
countries = [
'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
'Belize', 'Benin', 'Bhutan', 'Bolivia',
'Bosnia and Herzegovina'
]
employment_values = [
55.70000076, 51.40000153, 50.5 , 75.69999695,
58.40000153, 40.09999847, 61.5 , 57.09999847,
60.90000153, 66.59999847, 60.40000153, 68.09999847,
66.90000153, 53.40000153, 48.59999847, 56.79999924,
71.59999847, 58.40000153, 70.40000153, 41.20000076
]
# Employment data in 2007 for 20 countries
employment = pd.Series(employment_values, index=countries)
def max_employment(employment):
'''
Fill in this function to return the name of the country
with the highest employment in the given employment
data, and the employment in that country.
The input will be a Pandas series where the values
are employment and the index is country names.
Try using the Pandas argmax() function. Documention is
here: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.argmax.html
'''
max_country_index = employment.argmax(axis = employment_values)
max_country = max_country_index # Replace this with your code
max_value = employment[max_country_index] # Replace this with your code
print max_country, max_value
max_employment(employment)
Angola 75.69999695
pandas dataframe exercise
# Subway ridership for 5 stations on 10 different days
ridership_df = pd.DataFrame(
data=[[ 0, 0, 2, 5, 0],
[1478, 3877, 3674, 2328, 2539],
[1613, 4088, 3991, 6461, 2691],
[1560, 3392, 3826, 4787, 2613],
[1608, 4802, 3932, 4477, 2705],
[1576, 3933, 3909, 4979, 2685],
[ 95, 229, 255, 496, 201],
[ 2, 0, 1, 27, 0],
[1438, 3785, 3589, 4174, 2215],
[1342, 4043, 4009, 4665, 3033]],
index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
'05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
columns=['R003', 'R004', 'R005', 'R006', 'R007']
)
# Change False to True for each block of code to see what it does
# DataFrame creation
if False:
# You can create a DataFrame out of a dictionary mapping column names to values
df_1 = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
print df_1
# You can also use a list of lists or a 2D NumPy array
df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])
print df_2
# Accessing elements
if False:
print ridership_df.iloc[0]
print ridership_df.loc['05-05-11']
print ridership_df['R003']
print ridership_df.iloc[1, 3]
# Accessing multiple rows
if False:
print ridership_df.iloc[1:4]
# Accessing multiple columns
if False:
print ridership_df[['R003', 'R005']]
# Pandas axis
if False:
df = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
print df.sum()
print df.sum(axis=1)
print df.values.sum()
def mean_riders_for_max_station(ridership):
'''
Fill in this function to find the station with the maximum riders on the
first day, then return the mean riders per day for that station. Also
return the mean ridership overall for comparsion.
This is the same as a previous exercise, but this time the
input is a Pandas DataFrame rather than a 2D NumPy array.
'''
overall_mean = None # Replace this with your code
mean_for_max = None # Replace this with your code
return (overall_mean, mean_for_max)
Summing up, apply
works on a row / column basis of a DataFrame, applymap
works element-wise on a DataFrame, and map
works element-wise on a Series.