介紹
-- Candidates: one row per candidate, keyed by an integer id; party is mandatory.
CREATE TABLE "candidates" (
    "id"          INTEGER PRIMARY KEY NOT NULL,
    "first_name"  VARCHAR,
    "last_name"   VARCHAR,
    "middle_name" VARCHAR,
    "party"       VARCHAR NOT NULL
);
-- Contributors: one row per contribution. Every row must reference a
-- candidate through candidate_id.
CREATE TABLE "contributors" (
    "id"           INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
    "last_name"    VARCHAR,
    "first_name"   VARCHAR,
    "middle_name"  VARCHAR,
    "street_1"     VARCHAR,
    "street_2"     VARCHAR,
    "city"         VARCHAR,
    "state"        VARCHAR,
    "zip"          VARCHAR,
    "amount"       INTEGER,
    "date"         DATETIME,
    "candidate_id" INTEGER NOT NULL,
    -- Previously declared as a bare, type-less "name" on the candidate_id line;
    -- given an explicit type like every other text column in this schema.
    "name"         VARCHAR,
    FOREIGN KEY(candidate_id) REFERENCES candidates(id)
);
dfcond對應的是contributors表,dfuser對應的是candidates表。
數據查詢
- 單條件
查找first_name為‘John’的數據
# Three equivalent ways to select the rows whose first_name is 'John':
# a query() expression string, a boolean mask, and a boolean mask through .loc.
dfcond.query("first_name=='John'")
dfcond[dfcond.first_name=='John']
dfcond.loc[dfcond.first_name=='John']
上面三個語句等價
###SQL
-- Rows whose first_name is 'John'. Standard SQL spells equality with a
-- single "="; "==" is accepted by SQLite but not by most other engines.
select * from contributors where first_name = 'John'
- 多條件
查找last_name為Ahrens,訂單大于500的數據
dfcond.query("last_name=='Ahrens' and amount>500")
###SQL
-- Rows whose last_name is 'Ahrens' and whose amount exceeds 500.
-- Uses the standard "=" equality operator instead of "==".
select * from contributors where last_name = 'Ahrens' and amount > 500
- 空值
查找state為空的數據
dfcond[dfcond.state.isnull()]
###SQL
-- Rows that have no state recorded.
SELECT *
FROM contributors
WHERE state IS NULL;
- 多值選擇
查找state為VA或者WA的數據
dfcond[dfcond.state.isin(['VA','WA'])]
###SQL
-- Rows whose state is one of the listed values.
SELECT *
FROM contributors
WHERE state IN ('VA', 'WA');
- 區間查找
查找訂單介于10到50之間的數據
# Chained comparison inside query(): keeps rows with 10 <= amount <= 50
# (both bounds included).
dfcond.query("10<=amount<=50")
###SQL
-- Rows whose amount lies in the closed range [10, 50].
SELECT *
FROM contributors
WHERE amount BETWEEN 10 AND 50;
- 重複值
查找first_name,last_name不重複的組合
# Distinct (first_name, last_name) pairs. Each column name must be its own
# list element: 'first_name,last_name' as a single string names one
# non-existent column and raises KeyError.
dfcond[['first_name','last_name']].drop_duplicates()
# Same de-duplication on the name pair, but every column of the kept row is
# returned; keep='first' retains the first row of each duplicate group.
dfcond.drop_duplicates(subset=['first_name','last_name'],keep='first')
keep還可以選擇'last',或者False刪除所有重複項
###SQL
-- Distinct (first_name, last_name) pairs.
SELECT DISTINCT
    first_name,
    last_name
FROM contributors;
- 數據返回量控制
返回三條記錄
dfcond[0:3]
dfcond.iloc[0:3]
#SQL
-- Return at most three rows.
SELECT *
FROM contributors
LIMIT 3;
- 帶有函數的例子
訂單值大于最大訂單值減去2000
dfcond[dfcond.amount>dfcond.amount.max()-2000]
###SQL
-- Rows whose amount is greater than the largest amount minus 2000.
-- Without the "- 2000" the comparison is "amount > max(amount)", which can
-- never be true and always returns an empty result.
select * from contributors where amount > (select max(amount) - 2000 from contributors)
- 聯合查找
從candidates表中查找last_name為Obama的記錄,然后去contributors中查找與其有關的記錄。
# Take the id of the first dfuser row whose last_name is 'Obama'
# (.values[0] raises IndexError when no row matches) ...
cid = dfuser.query("last_name=='Obama'")['id'].values[0]
# ... then select the dfcond rows whose candidate_id equals that id.
dfcond.query('candidate_id=={}'.format(cid))
###SQL
-- Contributions to the candidate whose last_name is 'Obama', using a
-- scalar subquery to find the candidate id.
select * from contributors where candidate_id =
(select id from candidates where last_name='Obama');
-- The same lookup written as an implicit (comma) join. The table is named
-- "candidates"; the earlier spelling "condidates" referenced no known table.
select contributors.last_name,contributors.amount from contributors,candidates where
candidates.last_name='Obama' and candidates.id = contributors.candidate_id;
- 內聯
dfcond.merge(dfuser,left_on='candidate_id',right_on='id')
###SQL
-- Inner join written as an implicit (comma) join with the match in WHERE.
select * from contributors,candidates where contributors.candidate_id =
candidates.id;
-- The same join with explicit syntax. The keyword is INNER: the earlier
-- spelling "inter" was read by SQLite as a table alias for contributors,
-- which is why qualifying the column as contributors.candidate_id was
-- rejected as an unknown column. With INNER JOIN the qualified name works.
select * from contributors inner join candidates on contributors.candidate_id=candidates.id;
- 左聯/右聯/全聯
# `how` takes exactly one join type per call; the combined string
# 'left/right/outer' is not an accepted value, so each variant is spelled out.
dfcond.merge(dfuser,left_on='candidate_id',right_on='id',how='left')
dfcond.merge(dfuser,left_on='candidate_id',right_on='id',how='right')
dfcond.merge(dfuser,left_on='candidate_id',right_on='id',how='outer')
###SQL
-- LEFT JOIN: every contributors row, with candidate columns NULL when unmatched.
SELECT *
FROM contributors
LEFT JOIN candidates
    ON candidate_id = candidates.id;
-- RIGHT JOIN: every candidates row, with contributor columns NULL when unmatched.
SELECT *
FROM contributors
RIGHT JOIN candidates
    ON candidate_id = candidates.id;
-- FULL JOIN: every row from both tables, NULL-padded on the unmatched side.
SELECT *
FROM contributors
FULL JOIN candidates
    ON candidate_id = candidates.id;
left、right、full的區別?left保留左表的所有行,right保留右表的所有行,full保留兩張表的所有行;未匹配到的一側以NULL填充。
行列操作
- 列選擇
選擇first_name列
dfcond['first_name']
dfcond[['first_name','last_name']]#多列選擇
###SQL
-- Single-column projection.
SELECT first_name
FROM contributors;
-- Multi-column projection.
SELECT
    first_name,
    last_name
FROM contributors;
- 新增加一列
增加一列name
# Add a "name" column in place, built as "<last_name>,<first_name>".
dfcond['name'] = dfcond['last_name']+','+dfcond['first_name']
# assign() variant; note it joins the parts with ":" rather than ",", and
# its result is not stored anywhere on this line.
dfcond.assign(name=dfcond.last_name+":"+dfcond.first_name)
###對于sql要修改數據表
-- Add a text column called name to contributors.
ALTER TABLE contributors
    ADD COLUMN name VARCHAR(255);
- 列刪除
刪除name列
del dfcond['name']
###SQL
-- Remove the name column from contributors.
ALTER TABLE contributors
    DROP COLUMN name;
- 行刪除
刪除所有last_name為Ahrens的行
# Label-based deletion is a little involved: make last_name the index, drop
# the unwanted labels, then turn the index back into a regular column.
# Works on a copy of dfcond (the contributors frame); the previous name
# "dfcwdi" is not defined anywhere in this document.
df2 = dfcond.copy()
df2.set_index('last_name', inplace=True)
df2.drop(['Ahrens','Akin'], inplace=True)  # accepts a single label or a list of labels
# reset_index() returns a new frame, so the result has to be kept.
df2 = df2.reset_index()
# Simpler alternative: just keep the rows whose last_name is not 'Ahrens'.
ndf = dfcond.query('last_name!="Ahrens"')
###SQL
-- Delete the rows for one last_name.
delete from contributors where last_name='Ahrens';
-- Delete the rows for several last_names at once. The table is
-- "contributors"; the earlier spelling "contributes" named no known table.
delete from contributors where last_name in ('Ahrens','Akin');
數據修改
- 多行數據修改
dfcond.loc[dfcond.state=='VA','name'] = "Junk"
###SQL
-- Set name to 'Junk' on every row whose state is 'VA'. String literals use
-- single quotes; double quotes denote identifiers in standard SQL.
update contributors set name = 'Junk' where state = 'VA'
聚合Aggregate
- 極值
# Largest amount
dfcond.describe()  # summary statistics such as the mean and the extremes (appears to cover numeric columns only)
dfcond.amount.max()  # the maximum amount itself
dfcond[dfcond.amount.max()==dfcond.amount]  # the row(s) whose amount equals that maximum
###SQL
-- The largest amount on its own.
select max(amount) as maxcol from contributors;
-- The row(s) holding that largest amount. Mixing "*" with an aggregate and
-- no GROUP BY is not standard SQL (most engines reject it), so the maximum
-- is computed in a subquery and used as a filter instead.
select * from contributors where amount = (select max(amount) from contributors);
- 計數(shù)
# Per-column count of values; nulls are not counted.
dfcond.count()
# info() reports the same non-null count for every column.
dfcond.info()
# Frequency of each distinct value in a column, similar to
# collections.Counter. XX stands for a column name. The pandas method is
# value_counts(); count_values() does not exist.
dfcond.XX.value_counts()
###SQL
-- Number of rows with a non-null amount.
SELECT COUNT(amount) AS countcol
FROM contributors;
-- Average amount.
SELECT AVG(amount) AS avgcol
FROM contributors;
- groupby
根據state分類匯總
# Group by state and sum within each group.
dfcond.groupby('state').sum()
# Keep the grouping object so individual groups can be pulled out of it.
gb = dfcond.groupby('state')
# Fetch the rows of a single group, here state 'CA'.
CA = gb.get_group('CA')
###SQL
-- Total amount per state.
SELECT
    state,
    SUM(amount)
FROM contributors
GROUP BY state;
排序
df = dfcond.sort_values(by=['last_name'],ascending=False)#采用降序排列
###SQL
-- All rows, sorted by last_name in descending order.
SELECT *
FROM contributors
ORDER BY last_name DESC;
pandas與數據庫之間的轉換
from sqlite3 import dbapi2 as sq3
import os
PATHSTART="."
def get_db(dbfile):
    """Open and return an SQLite connection to ``dbfile`` located under ``PATHSTART``."""
    db_path = os.path.join(PATHSTART, dbfile)
    return sq3.connect(db_path)
def init_db(dbfile, schema):
    """Create the database tables.

    Opens ``dbfile`` through ``get_db``, runs the ``schema`` script on it,
    commits, and returns the open connection.
    """
    connection = get_db(dbfile)
    cursor = connection.cursor()
    cursor.executescript(schema)
    connection.commit()
    return connection
# Create cancont.db from the schema script, then append both frames to
# their tables. Frame names follow the convention stated earlier in this
# document: dfuser holds candidates and dfcond holds contributors (the
# previous names dfusers/dfcand contradicted that convention).
db=init_db("cancont.db", ourschema)
dfuser.to_sql("candidates", db, if_exists="append", index=False)
dfcond.to_sql("contributors", db, if_exists="append", index=False)
另一種數據插入方法
# Parameterized INSERT for one candidates row; the "?" placeholders are
# bound by sqlite3 when the statement is executed.
ins="""
INSERT INTO candidates (id, first_name, last_name, middle_name, party) \
VALUES (?,?,?,?,?);
"""
# candidates.txt is pipe-delimited and its first line is the header row.
with open('./candidates.txt') as f:
    lines = f.readlines()
cur = db.cursor()  # one cursor for the whole load instead of one per row
for line in lines[1:]:
    if not line.strip():
        continue  # a blank (e.g. trailing) line would break the 5-way unpack
    zid,first_name,last_name,middle_name,party = line.strip().split('|')
    print(zid,first_name,last_name,middle_name,party)
    vals = (int(zid),first_name,last_name,middle_name,party)
    print(vals)
    cur.execute(ins,vals)
# Without a commit the inserted rows are not persisted to the database file.
db.commit()
sql語句執行函數
def make_query(sql):
    """Execute ``sql`` on the module-level ``db`` connection and return every result row."""
    cursor = db.cursor()
    cursor.execute(sql)
    return cursor.fetchall()
把查詢結果轉換為dataframe對象
def make_frame(data,col_names):
    """Build a DataFrame from row tuples.

    Parameters:
        data: sequence of rows (each indexable by position), e.g. the list
            returned by ``cursor.fetchall()``.
        col_names: column names, in the same order as the row positions.

    Returns:
        A ``pd.DataFrame`` with one column per entry of ``col_names``, in
        that order, and one row per entry of ``data``.
    """
    # Turn the rows into columns: column i collects position i of every row.
    # DataFrame.from_items was removed in pandas 1.0; a dict keeps insertion
    # order, so the column order still follows col_names.
    columns = {
        name: [row[i] for row in data]
        for i, name in enumerate(col_names)
    }
    return pd.DataFrame(columns)
上面的函數需要列名
# Column names of contributors, taken from element 1 of each row returned
# by the table_info pragma.
col_names = [field[1] for field in make_query("pragma table_info(contributors)")]
# This approach is specific to SQLite.
使用例子
make_frame(make_query("select * from contributors where state is null"),col_names)
如果是sqlite數據庫,可以直接如下讀取
pd.read_sql("SELECT * FROM candidates WHERE party= 'D';", db)