Pandas: Comparison with SQL

??這是今年年初在學(xué)習(xí) Pandas 時(shí),所寫(xiě)的一篇筆記叶圃,當(dāng)時(shí)對(duì) Pandas 還不夠熟悉森书,便借助與 SQL (以 SQLite 為例)進(jìn)行對(duì)比, 理解并熟悉 Pandas 的使用融痛。

# 關(guān)于 Pandas 可參看之前的文章——Pandas

0 Prepping

import pandas as pd
import numpy as np
tips = pd.read_csv(r'../data/tips.csv')
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
total_bill 244 non-null float64
tip 244 non-null float64
sex 244 non-null object
smoker 244 non-null object
day 244 non-null object
time 244 non-null object
size 244 non-null int64
dtypes: float64(2), int64(1), object(4)
memory usage: 13.4+ KB

tips.head()
import sqlite3

path = '../data/data.db'

# 保存 DataFrame 到 SQLite
def save_sql(df, tablename):
    '''
         Save Dataframe into SQLite
    '''
    conn = sqlite3.connect(path)
    
    try:
        df.to_sql(tablename, conn, index=False, if_exists='replace')
        print('Save Successful.')
    except Exception as e:
        print('Error:\n\n{0}'.format(e))
        
    conn.close()

#  連接 SQLite 并進(jìn)行查詢操作
def query_sql(query):
    '''
        Connect SQLite and Query
    '''
    conn = sqlite3.connect(path)
    
    try:
        result = pd.read_sql(query, conn)
        print('\nQuery Results:')
        return result
    except Exception as e:
        print('Error:\n\n{0}'.format(e))
    
    conn.close()

# 在 SQLite 中執(zhí)行游標(biāo)    
def cursor_sql(query):
    '''
         Executing Cursors in SQLite
    '''
    conn = sqlite3.connect(path)
    
    cursor = conn.cursor()
    cursor.execute(query)
    cursor.close()
    
    conn.commit()
    conn.close()
# 保存 tips 到數(shù)據(jù)庫(kù)
save_sql(tips, 'tips')

Save Successful.

1 SELECT

query = '''
SELECT total_bill, tip, smoker, time
FROM tips
LIMIT 5;
'''

query_sql(query)

Query Results:

tips[['total_bill', 'tip', 'smoker', 'time']].head(5)

2 WHERE

query = '''
SELECT *
FROM tips
WHERE time = 'Dinner'
LIMIT 5;
'''

query_sql(query)

Query Results:

tips[tips['time'] == 'Dinner'].head(5)
is_dinner = tips['time'] == 'Dinner'
is_dinner.value_counts()

True 176
False 68
Name: time, dtype: int64

tips[is_dinner].head(5)
query = '''
-- tips of more than $5.00 at Dinner meals
SELECT *
FROM tips
WHERE time = 'Dinner' AND tip > 5.00;
'''

query_sql(query)

Query Results:

# tips of more than $5.00 at Dinner meals
tips[(tips['time'] == 'Dinner') & (tips['tip'] > 5.00)] 
query = '''
-- tips by parties of at least 5 diners OR bill total was more than $45
SELECT *
FROM tips
WHERE size >= 5 OR total_bill > 45;
'''

query_sql(query)

Query Results:

# tips by parties of at least 5 diners OR bill total was more than $45
tips[(tips['size'] >= 5) | (tips['total_bill'] > 45)]
# Create a DataFrame
frame = pd.DataFrame({'col1': ['A', 'B', np.NaN, 'C', 'D'],
                      'col2': ['F', np.NaN, 'G', 'H', 'I']})
frame
save_sql(frame, 'frame')
Save Successful.
query = '''
-- where col2 IS NULL
SELECT *
FROM frame
WHERE col2 IS NULL;
'''

query_sql(query)

Query Results:

# where col2 IS NULL
frame[frame['col2'].isna()]
query = '''
-- where col1 IS NOT NULL
SELECT *
FROM frame
WHERE col1 IS NOT NULL;
'''

query_sql(query)

Query Results:

# where col1 IS NOT NULL
frame[frame['col1'].notna()]

3 GROUP BY

query = '''
SELECT sex
    ,count(*)
FROM tips
GROUP BY sex;
'''

query_sql(query)

Query Results:

tips.groupby('sex').size()

sex
Female 87
Male 157
dtype: int64

tips.groupby('sex').count()
tips.groupby('sex')['total_bill'].count()

sex
Female 87
Male 157
Name: total_bill, dtype: int64

query = '''
SELECT day
    ,AVG(tip)
    ,COUNT(*)
FROM tips
GROUP BY day;
'''

query_sql(query)

Query Results:

tips.groupby('day').agg({'tip': np.mean, 'day': np.size})
query = '''
SELECT smoker
    ,day
    ,COUNT(*)
    ,AVG(tip)
FROM tips
GROUP BY smoker
    ,day;
'''

query_sql(query)

Query Results:

tips.groupby(['smoker', 'day']).agg({'tip': [np.size, np.mean]})

4 JOIN

4.0 Prepping

df1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'],
                    'value': np.random.randn(4)})
df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'],
                    'value': np.random.randn(4)})
save_sql(df1, 'df1')

Save Successful.

save_sql(df2, 'df2')

Save Successful.

4.1 INNER JOIN

query = '''
SELECT *
FROM df1
INNER JOIN df2
    ON df1.key = df2.key;
'''

query_sql(query)

Query Results:

# merge performs an INNER JOIN by default
pd.merge(df1, df2, on='key')
indexed_df2 = df2.set_index('key')
pd.merge(df1, indexed_df2, left_on='key', right_index=True)

4.2 LEFT OUTER JOIN

query = '''
-- show all records from df1
SELECT *
FROM df1
LEFT OUTER JOIN df2
    ON df1.key = df2.key;
'''

query_sql(query)

Query Results:

# show all records from df1
pd.merge(df1, df2, on='key', how='left')

4.3 RIGHT OUTER JOIN

# SQLite cannot RIGHT OUTER JOIN
query = '''
-- show all records from df2
SELECT *
FROM df1
RIGHT OUTER JOIN df2
    ON df1.key = df2.key;
'''

query_sql(query)

Error:

Execution failed on sql '
-- show all records from df2
SELECT *
FROM df1
RIGHT OUTER JOIN df2
ON df1.key = df2.key;
': RIGHT and FULL OUTER JOINs are not currently supported

# show all records from df2
pd.merge(df1, df2, on='key', how='right')

4.4 FULL JOIN

# SQLite cannot FULL OUTER JOIN
query = '''
-- show all records from both tables
SELECT *
FROM df1
FULL OUTER JOIN df2
  ON df1.key = df2.key;
'''

query_sql(query)

Error:

Execution failed on sql '
-- show all records from both tables
SELECT *
FROM df1
FULL OUTER JOIN df2
ON df1.key = df2.key;
': RIGHT and FULL OUTER JOINs are not currently supported

# show all records from both frames
pd.merge(df1, df2, on='key', how='outer')

5 UNION

city1 = pd.DataFrame({'city': ['Chicago', 'San Francisco', 'New York City'],
                    'rank': range(1, 4)})
city2 = pd.DataFrame({'city': ['Chicago', 'Boston', 'Los Angeles'],
                    'rank': [1, 4, 5]})
save_sql(city1, 'city1')
save_sql(city2, 'city2')

Save Successful.
Save Successful.

query = '''
SELECT city
    ,rank
FROM city1
UNION ALL
SELECT city
    ,rank
FROM city2;
'''

query_sql(query)

Query Results:

pd.concat([city1, city2])
query = '''
SELECT city
    ,rank
FROM city1
UNION
SELECT city
    ,rank
FROM city2;
'''

query_sql(query)

Query Results:

pd.concat([city1, city2]).drop_duplicates()

6 Pandas equivalents for some SQL analytic and aggregate functions

6.1 Top N rows with offset

-- MySQL
SELECT * FROM tips
ORDER BY tip DESC
LIMIT 10 OFFSET 5;
tips.nlargest(10+5, columns='tip').tail(10)

6.2 Top N rows per group

-- Oracle's ROW_NUMBER() analytic function
SELECT * FROM (
  SELECT
    t.*,
    ROW_NUMBER() OVER(PARTITION BY day ORDER BY total_bill DESC) AS rn
  FROM tips t
)
WHERE rn < 3
ORDER BY day, rn;
(tips.assign(rn=tips.sort_values(['total_bill'], ascending=False)
                         .groupby(['day'])
                         .cumcount() + 1)
          .query('rn < 3')
          .sort_values(['day','rn'])
    )
 (tips.assign(rnk=tips.groupby(['day'])['total_bill']
                          .rank(method='first', ascending=False))
          .query('rnk < 3')
          .sort_values(['day','rnk'])
     )
-- Oracle's RANK() analytic function
SELECT * FROM (
  SELECT
    t.*,
    RANK() OVER(PARTITION BY sex ORDER BY tip) AS rnk
  FROM tips t
  WHERE tip < 2
)
WHERE rnk < 3
ORDER BY sex, rnk;
(tips[tips['tip'] < 2]
          .assign(rnk_min=tips.groupby(['sex'])['tip']
                              .rank(method='min'))
          .query('rnk_min < 3')
          .sort_values(['sex','rnk_min'])
     )

7 UPDATE

query = '''
UPDATE tips
SET tip = tip*2
WHERE tip < 2;
'''

cursor_sql(query)
tips.loc[tips['tip'] < 2, 'tip'] *= 2

8 DELETE

query = '''
DELETE FROM tips
WHERE tip > 9;
'''

cursor_sql(query)
tips = tips.loc[tips['tip'] <= 9]
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請(qǐng)聯(lián)系作者
  • 序言:七十年代末,一起剝皮案震驚了整個(gè)濱河市神僵,隨后出現(xiàn)的幾起案子雁刷,更是在濱河造成了極大的恐慌,老刑警劉巖保礼,帶你破解...
    沈念sama閱讀 212,454評(píng)論 6 493
  • 序言:濱河連續(xù)發(fā)生了三起死亡事件沛励,死亡現(xiàn)場(chǎng)離奇詭異,居然都是意外死亡炮障,警方通過(guò)查閱死者的電腦和手機(jī)目派,發(fā)現(xiàn)死者居然都...
    沈念sama閱讀 90,553評(píng)論 3 385
  • 文/潘曉璐 我一進(jìn)店門(mén),熙熙樓的掌柜王于貴愁眉苦臉地迎上來(lái)胁赢,“玉大人址貌,你說(shuō)我怎么就攤上這事。” “怎么了练对?”我有些...
    開(kāi)封第一講書(shū)人閱讀 157,921評(píng)論 0 348
  • 文/不壞的土叔 我叫張陵,是天一觀的道長(zhǎng)吹害。 經(jīng)常有香客問(wèn)我螟凭,道長(zhǎng),這世上最難降的妖魔是什么它呀? 我笑而不...
    開(kāi)封第一講書(shū)人閱讀 56,648評(píng)論 1 284
  • 正文 為了忘掉前任螺男,我火速辦了婚禮,結(jié)果婚禮上纵穿,老公的妹妹穿的比我還像新娘下隧。我一直安慰自己,他們只是感情好谓媒,可當(dāng)我...
    茶點(diǎn)故事閱讀 65,770評(píng)論 6 386
  • 文/花漫 我一把揭開(kāi)白布淆院。 她就那樣靜靜地躺著,像睡著了一般句惯。 火紅的嫁衣襯著肌膚如雪土辩。 梳的紋絲不亂的頭發(fā)上,一...
    開(kāi)封第一講書(shū)人閱讀 49,950評(píng)論 1 291
  • 那天抢野,我揣著相機(jī)與錄音拷淘,去河邊找鬼。 笑死指孤,一個(gè)胖子當(dāng)著我的面吹牛启涯,可吹牛的內(nèi)容都是我干的。 我是一名探鬼主播恃轩,決...
    沈念sama閱讀 39,090評(píng)論 3 410
  • 文/蒼蘭香墨 我猛地睜開(kāi)眼结洼,長(zhǎng)吁一口氣:“原來(lái)是場(chǎng)噩夢(mèng)啊……” “哼!你這毒婦竟也來(lái)了详恼?” 一聲冷哼從身側(cè)響起补君,我...
    開(kāi)封第一講書(shū)人閱讀 37,817評(píng)論 0 268
  • 序言:老撾萬(wàn)榮一對(duì)情侶失蹤,失蹤者是張志新(化名)和其女友劉穎昧互,沒(méi)想到半個(gè)月后挽铁,有當(dāng)?shù)厝嗽跇?shù)林里發(fā)現(xiàn)了一具尸體,經(jīng)...
    沈念sama閱讀 44,275評(píng)論 1 303
  • 正文 獨(dú)居荒郊野嶺守林人離奇死亡敞掘,尸身上長(zhǎng)有42處帶血的膿包…… 初始之章·張勛 以下內(nèi)容為張勛視角 年9月15日...
    茶點(diǎn)故事閱讀 36,592評(píng)論 2 327
  • 正文 我和宋清朗相戀三年叽掘,在試婚紗的時(shí)候發(fā)現(xiàn)自己被綠了。 大學(xué)時(shí)的朋友給我發(fā)了我未婚夫和他白月光在一起吃飯的照片玖雁。...
    茶點(diǎn)故事閱讀 38,724評(píng)論 1 341
  • 序言:一個(gè)原本活蹦亂跳的男人離奇死亡更扁,死狀恐怖,靈堂內(nèi)的尸體忽然破棺而出,到底是詐尸還是另有隱情浓镜,我是刑警寧澤溃列,帶...
    沈念sama閱讀 34,409評(píng)論 4 333
  • 正文 年R本政府宣布,位于F島的核電站膛薛,受9級(jí)特大地震影響听隐,放射性物質(zhì)發(fā)生泄漏。R本人自食惡果不足惜哄啄,卻給世界環(huán)境...
    茶點(diǎn)故事閱讀 40,052評(píng)論 3 316
  • 文/蒙蒙 一雅任、第九天 我趴在偏房一處隱蔽的房頂上張望。 院中可真熱鬧咨跌,春花似錦沪么、人聲如沸。這莊子的主人今日做“春日...
    開(kāi)封第一講書(shū)人閱讀 30,815評(píng)論 0 21
  • 文/蒼蘭香墨 我抬頭看了看天上的太陽(yáng)。三九已至拳喻,卻和暖如春哭当,著一層夾襖步出監(jiān)牢的瞬間,已是汗流浹背冗澈。 一陣腳步聲響...
    開(kāi)封第一講書(shū)人閱讀 32,043評(píng)論 1 266
  • 我被黑心中介騙來(lái)泰國(guó)打工钦勘, 沒(méi)想到剛下飛機(jī)就差點(diǎn)兒被人妖公主榨干…… 1. 我叫王不留,地道東北人亚亲。 一個(gè)月前我還...
    沈念sama閱讀 46,503評(píng)論 2 361
  • 正文 我出身青樓彻采,卻偏偏與公主長(zhǎng)得像,于是被迫代替她去往敵國(guó)和親捌归。 傳聞我的和親對(duì)象是個(gè)殘疾皇子肛响,可洞房花燭夜當(dāng)晚...
    茶點(diǎn)故事閱讀 43,627評(píng)論 2 350

推薦閱讀更多精彩內(nèi)容