pandas DataFrame + Series addition experiments
import pandas as pd
# Change False to True for each block of code to see what it does
# Adding a Series to a square DataFrame.
# `df + s` matches the Series index (0-3) against the DataFrame's column
# labels (0-3), so s[i] is added to every value in column i.
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({
        0: [10, 20, 30, 40],
        1: [50, 60, 70, 80],
        2: [90, 100, 110, 120],
        3: [130, 140, 150, 160],
    })
    # Parenthesised single-argument print works on both Python 2 and 3.
    print(df)
    print('')  # Create a blank line between outputs
    print(df + s)
# Adding a Series to a one-row DataFrame.
# Each column label (0-3) still finds its matching Series label, so the
# single row gets 1, 2, 3 and 4 added across its four columns.
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({0: [10], 1: [20], 2: [30], 3: [40]})
    # Parenthesised single-argument print works on both Python 2 and 3.
    print(df)
    print('')  # Create a blank line between outputs
    print(df + s)
# Adding a Series to a one-column DataFrame.
# Only column 0 has a matching Series label; Series labels 1-3 have no
# matching column, so they appear in the result as all-NaN columns.
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({0: [10, 20, 30, 40]})
    # Parenthesised single-argument print works on both Python 2 and 3.
    print(df)
    print('')  # Create a blank line between outputs
    print(df + s)
# Adding when DataFrame column names match Series index.
# Alignment is by label, not position: s['a'] is added to column 'a',
# s['b'] to column 'b', and so on.
if False:
    s = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    df = pd.DataFrame({
        'a': [10, 20, 30, 40],
        'b': [50, 60, 70, 80],
        'c': [90, 100, 110, 120],
        'd': [130, 140, 150, 160],
    })
    # Parenthesised single-argument print works on both Python 2 and 3.
    print(df)
    print('')  # Create a blank line between outputs
    print(df + s)
# Adding when DataFrame column names don't match Series index.
# The Series is labelled 0-3 while the columns are 'a'-'d'; no label is
# shared, so every value in the result is NaN.
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({
        'a': [10, 20, 30, 40],
        'b': [50, 60, 70, 80],
        'c': [90, 100, 110, 120],
        'd': [130, 140, 150, 160],
    })
    # Parenthesised single-argument print works on both Python 2 and 3.
    print(df)
    print('')  # Create a blank line between outputs
    print(df + s)
pandas加減乘除的方向確定方法:如果要對行操作,那么就是對同一組index進行操作,那么就改用add、sub、mul、div的方法,并把axis調成'index';默認的 +、-、*、/ 都是按照列來,也就是對同一組column操作(把Series的index與DataFrame的column名稱對齊)。例如df.mean()返回一個Series,其中每個元素代表某一列的平均值,即 df.mean() 等價於 df.mean(axis='index')。
- 求每一行的平均值
df.mean(axis = 'columns')
- 將某個df減去每一行的平均值
df.sub(df.mean(axis = 'columns'), axis = 'index')
Exercise
import pandas as pd
# Adding using +
# The + operator matches the Series index against the DataFrame's column
# labels, so s[i] is added to every value in column i.
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({
        0: [10, 20, 30, 40],
        1: [50, 60, 70, 80],
        2: [90, 100, 110, 120],
        3: [130, 140, 150, 160],
    })
    # Parenthesised single-argument print works on both Python 2 and 3.
    print(df)
    print('')  # Create a blank line between outputs
    print(df + s)
# Adding with axis='index'
# Here the Series index is matched against the DataFrame's row index
# instead of its columns, so s[i] is added to every value in row i.
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({
        0: [10, 20, 30, 40],
        1: [50, 60, 70, 80],
        2: [90, 100, 110, 120],
        3: [130, 140, 150, 160],
    })
    # Parenthesised single-argument print works on both Python 2 and 3.
    print(df)
    print('')  # Create a blank line between outputs
    print(df.add(s, axis='index'))
    # The functions sub(), mul(), and div() work similarly to add()
# Adding with axis='columns'
# The Series index is matched against the DataFrame's column labels, which
# is the same alignment the + operator uses.
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({
        0: [10, 20, 30, 40],
        1: [50, 60, 70, 80],
        2: [90, 100, 110, 120],
        3: [130, 140, 150, 160],
    })
    # Parenthesised single-argument print works on both Python 2 and 3.
    print(df)
    print('')  # Create a blank line between outputs
    print(df.add(s, axis='columns'))
    # The functions sub(), mul(), and div() work similarly to add()
# Sample data for the standardize exercises: two exam scores per student,
# with the student names used as the row index.
grades_df = pd.DataFrame(
    {
        'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
        'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60],
    },
    index=[
        'Andre', 'Barry', 'Chris', 'Dan', 'Emilio',
        'Fred', 'Greta', 'Humbert', 'Ivan', 'James',
    ],
)
def standardize(df):
    '''
    Standardize each column of the given DataFrame.

    Every value is converted to the number of standard deviations it lies
    above (positive) or below (negative) the mean of its own column. The
    computation is fully vectorized; apply() is not used.

    Parameters:
        df: DataFrame whose columns are all numeric.

    Returns:
        A new DataFrame with the same index and columns as `df`.

    Note: the standard deviation is pandas' default sample standard
    deviation (ddof=1). A column whose values are all equal has a standard
    deviation of 0 and therefore standardizes to NaN.
    '''
    # df.mean() and df.std() each return a Series indexed by column label;
    # the arithmetic operators align that Series with the columns of df.
    column_means = df.mean()
    column_stds = df.std()
    return (df - column_means) / column_stds
def standardize_rows(df):
    '''
    Standardize each row of the given DataFrame.

    Every value is converted to the number of standard deviations it lies
    above (positive) or below (negative) the mean of its own row. The
    computation is fully vectorized; apply() is not used.

    Parameters:
        df: DataFrame whose columns are all numeric.

    Returns:
        A new DataFrame with the same index and columns as `df`.

    Note: the standard deviation is pandas' default sample standard
    deviation (ddof=1). A row whose values are all equal has a standard
    deviation of 0 and therefore standardizes to NaN.
    '''
    # axis='columns' collapses across the columns, giving one value per row
    # (a Series indexed like df's rows).
    row_means = df.mean(axis='columns')
    row_stds = df.std(axis='columns')
    # The plain - and / operators would match these Series against df's
    # column labels; axis='index' matches them against the row index.
    centered = df.sub(row_means, axis='index')
    return centered.div(row_stds, axis='index')
Exercise about groupby()
import numpy as np
import pandas as pd
# Small example frame for the groupby() experiments: the raw values plus
# two boolean columns derived from them that can serve as grouping keys.
values = np.array([1, 3, 2, 4, 1, 6, 4])
example_df = pd.DataFrame(
    data={
        'value': values,
        'even': (values % 2) == 0,
        'above_three': values > 3,
    },
    index=['a', 'b', 'c', 'd', 'e', 'f', 'g'],
)
# Change False to True for each block of code to see what it does

# Standardize each group
# NOTE: when enabled, this rebinds the module-level name `standardize`.
if False:
    def standardize(xs):
        # xs is the 'value' Series of a single group.
        return (xs - xs.mean()) / xs.std()
    grouped_data = example_df.groupby('even')
    # Parenthesised single-argument print works on both Python 2 and 3.
    print(grouped_data['value'].apply(standardize))
# Find second largest value in each group
if False:
    def second_largest(xs):
        # Series.sort() is no longer available in current pandas;
        # sort_values() returns a new, sorted Series and leaves xs untouched.
        sorted_xs = xs.sort_values(ascending=False)
        # Position 1 of the descending order is the second largest value
        # (raises IndexError for a group with fewer than two values).
        return sorted_xs.iloc[1]
    grouped_data = example_df.groupby('even')
    # Parenthesised single-argument print works on both Python 2 and 3.
    print(grouped_data['value'].apply(second_largest))
# --- Quiz ---
# Sample turnstile data: cumulative entry and exit counters for two
# stations ('UNIT' values R051 and R079) whose rows are interleaved.
ridership_df = pd.DataFrame(data={
    'UNIT': [
        'R051', 'R079', 'R051', 'R079', 'R051',
        'R079', 'R051', 'R079', 'R051',
    ],
    'TIMEn': [
        '00:00:00', '02:00:00', '04:00:00', '06:00:00', '08:00:00',
        '10:00:00', '12:00:00', '14:00:00', '16:00:00',
    ],
    'ENTRIESn': [
        3144312, 8936644, 3144335, 8936658, 3144353,
        8936687, 3144424, 8936819, 3144594,
    ],
    'EXITSn': [
        1088151, 13755385, 1088159, 13755393, 1088177,
        13755598, 1088231, 13756191, 1088275,
    ],
})
def get_hourly_entries_and_exits(entries_and_exits):
    '''
    Convert cumulative entry/exit counters into per-reading counts,
    computed separately for each station (the 'UNIT' column).

    Parameters:
        entries_and_exits: DataFrame with a 'UNIT' column identifying the
            station and cumulative 'ENTRIESn' and 'EXITSn' columns. Rows
            are assumed to already be in chronological order within each
            unit; this function does not sort them.

    Returns:
        A DataFrame with the same index as the input and the columns
        'ENTRIESn' and 'EXITSn'. Each value is the increase since the
        previous row of the same unit. The first row of every unit has no
        predecessor and is therefore NaN.
    '''
    # Only the two counter columns are selected; differencing is then done
    # within each UNIT group so one station's counters are never subtracted
    # from another's.
    counters_by_unit = entries_and_exits.groupby('UNIT')[['ENTRIESn', 'EXITSn']]
    return counters_by_unit.diff()