1.對DataFram進行合并操作
data = pd.merge(df1,df2,left_on = "名稱",right_on = " 姓名") #對數(shù)據進行左右合并
data = pd.concat([df1,df2,df3,df4],axis = 0) #對數(shù)據進行上下合并
df_inner=pd.merge(df,df1,how='inner')
df_left=pd.merge(df,df1,how='left')
df_right=pd.merge(df,df1,how='right')
df_outer=pd.merge(df,df1,how='outer')
2.在字段中對數(shù)據進行模糊匹配疚颊,相當于excel中的關鍵字查找
篩選出姓名列中包含"張"的數(shù)據:
data = df[df["姓名"].str.contains(r'.*?張.*')]
data = df[df["姓名"].isin(["張"])]
data = df[~df["姓名"].isin(["張"])] #通過~取反惠猿,篩選出姓名列中不包含"張"的數(shù)據
3.將"地址"字段所有數(shù)據保留前5個字符(注意:前閉后開)
df[" 地址"] = df["地址"].str[0:5]
4.將[{'a':'int','b':'str'},{'a':'cat','b':'dog'}]變成datafram
df = pd.DataFrame(newlist)
5.將dataframe寫到表格中,可以保存為csv和xlsx格式(注意:encoding在中文顯示亂碼的情況下要修改編碼格式)
pandas往csv里寫文件時0丟失的問題:在字符串前面加上"\t"就能正常作為文本寫入csv
data[代碼] = "\t" + data[代碼] #可以保留代碼前的0
df.to_csv('data.csv',encoding = 'utf-8-sig',index = None) #不要行號
df.to_csv('data.csv',encoding = 'utf-8-sig') #要行號
df.to_excel('data.xlsx',encoding = 'utf-8')
6.刪除姓名列中含有空值的那一行
df1 = df1[df1["姓名"].notna()] #刪除None所在的行
df1 = df1.dropna()
df1 = df1.dropna(axis = 0,how = 'all') #刪除整行都是空的
df1 = df1.fillna(" ") #把空值填成字符串
7.更改列名及更改列字段的值
df1.rename(columns = {"name":"姓名","id":"序號"}贸桶,inplace = True)
data['是否在職'] = data['是否在職'].replace('1','在職')
data['是否在職'] = data['是否在職'].replace('0','離職')
用判斷語句進行修改,1為在職0為離職,因為序列中的多個值不能和單個值比較侄刽,所以定義一個變量a
def zhiwei(a):
if (a == '1'):
return ' 在職'
else:
return ' 離職'
data['是否在職'] = data['是否在職'].apply(zhiwei)
8.刪除數(shù)據
df1.drop(df[["姓名","地址"]],axis = 1,inplace = True) #刪除某幾列
df1 = df1.drop_duplicates() #刪除重復的行
#按列位置刪除一列或多列
sql_data.drop(sql_data.columns[[0,1]],axis=1,inplace=True)
df_pledgee = df2[df2['p_stock2220_F006N'].notna()]
# 刪除None所在的 行
df= df.dropna()
#刪除重復的某行保留第一條
df = df.drop_duplicates(subset=['fund_id'],keep='first',inplace=True)
9.用groupby做分組運算(注:按“姓名”進行分類衷模,將同一組的“次數(shù)”求和)
df1 = df1[[" 次數(shù)"]].grouby(df1["姓名"]).sum()
df1 = df1.groupby(["姓名","地址","次數(shù)"]).sum()
df1 = df1.reset_index() #把索引變成列名
df1 = df1.sort_values(by = "次數(shù)",ascending = True) #排序
#篩選每組中最大的
df.groupby('fund_code').apply(lambda t: t[t.market_date==t.market_date.max()])
#groupby后保留原來的列
df_company = df[['customerId']].groupby(df['companyCode']).count().reset_index()
df_data.groupby(['FirstIndustryName'])['price_limit'].mean().reset_index()
df.groupby(['fund_code', 'market_date', 'subject_id']).sum()['market_value'].reset_index()
#分組移動求某一時間段的最大值
data_base['quotationLocal_max'] = data_base.groupby(['subjectName', 'companyCode', 'fundCode'])['quotationLocal'].cummax()
#分組滾動求值
fund_data['stock_sale_earn'] = fund_data.groupby(['companyCode', 'fundCode', 'subjectCode']).apply(
lambda x: (x['number'].shift(1) - x['number']).map(lambda x: x if x > 0 else 0) * (
x['quotation'] - x['quotation'].shift(1))).values
10.多列合并及拆分
df1["信息"] = df1[["姓名","地址"]].apply(lambda x:'/'.join(x),axis = 1)
df1["姓名"] = df1["信息"].map(lambda x:x.split('/')[0])
df1["地址"] = df1["信息"].map(lambda x:x.split('/')[1])
拆分字符串(rsplit從后往前)
df1["信息"] = df1["信息"].str.rsplit('',1)
#對category字段的值依次進行分列鹊汛,并創(chuàng)建數(shù)據表,索引值為df_inner的索引列阱冶,列名稱為category和size
pd.DataFrame((x.split('-') for x in df_inner['category']),index=d
f_inner.index,columns=['category','size'])
11.條件篩選(時間)
df1 = df1[(df1['data'] >= start_day) &
(df1['data'] <= end_day) &
(df1['seccode'] == seccode) ]['df1num']
12.處理缺失值
def remove_the_null(data,data1,a): #第二個參數(shù):當缺失率達到多少時刁憋,直接刪除
t = []
for col_name in data.columns:
changdu = len(data[col_name])
cnt = list(data[col_name].isna()).count(True)
if (cnt / changdu > a):
del data[col_name]
del data1[col_name]
t.append(col_name)
return data,data1,t
df.fillna(value = 0) #使用0對NA進行填充
df['price'].fillna(df['price'].mean()) #使用price均值對NA進行填充
df.isnull() #檢查數(shù)據空值
df['price'].isnull() #檢查特定列空值
13.Series取成一個值
df1 = df1.iloc[0]
df_inner.iloc[:3,:2] #使用iloc按位置區(qū)域提取數(shù)據
df_inner.iloc[[0,2,5],[4,5]] #使用iloc按位置單獨提取數(shù)據
14.dataframe轉置
df1 = pd.DataFrame(df.values.T, index = df.columns, columns = df.index)
15.將DataFrame類型的數(shù)據按某一列的值拆分成多個DataFrame類型的數(shù)據
class = data["地址"].unique()
for i in class:
data = data[data["地址"].isin([class])]
16.利用數(shù)據框df的name列中的非空值去填充df的features列中對應的NaN
df.loc[df['features'].isnull(),'features'] = df[df['features'].isnull()]['name']
csv_data = csv_data.fillna(' ') #把空值填成空字符串
17.轉變數(shù)據類型astype
df['代碼'] = df['代碼'].astype(str)
df['代碼'] = df['代碼'].astype(str).str[0:10].str.replace('-','.') #轉化類型、截取字段木蹬、替換數(shù)據
18.讀數(shù)據
cavPath1 = 'excel_name.xlsx'
excel_name = pd.read_excel(cavPath1,encoding = 'utf-8-sig',converters={u'代碼':str}) #可以把以0開頭的字段轉為字符串
19.把list中每個元素加1
l = map(lambda n:n+1,l)
20.pandas DataFrame取列名
DataFrame.columns.values.tolist()
21.給數(shù)據前面補0
s = n.zfill(5) #用來給字符串補0
s = "%05d" % n #純數(shù)字至耻,我們也可以通過格式化的方式來補0
22.過濾掉非int的數(shù)
if isinstance(days,int):
if days < 32:
return f'離{product}付息日 ({timeData})還差{days}天'
else:
return ' '
else:
return ' '
23.轉成json文件
class MyEncoder(json.JSONEncoder):
def default(self,obj):
if isinstance(obj,numpy.integer):
return int(obj)
elif isinstance(obj,numpy.floating):
return float(obj)
elif isinstance(obj,numpy.ndarray):
return obj.tolist()
else:
return super(MyEncoder,self).default(obj)
filename = 'data.json'
with open(filename,'w') as f:
json.dump(ret,f,cls = MyEncoder,ensure_ascii = False)
24.對數(shù)據進行替換
df["TermIndex"] = df["TermIndex"].raplace([1,2],['一','二']) #替換"TermIndex"的值,將數(shù)字轉為中文
translations = {r'(.*)<(.*)' : r'\l<\2' , r'(.*)>(.*) : r'\l>\2'}
df = df.replace(translations,regex = True) #替換某個字符:數(shù)據值中含有<>的使用(只替換最后出現(xiàn)的那個)
data = data.replace('[<]','<',regex = True) #替換某個字符:數(shù)據值中含有<>的使用(替換所有出現(xiàn)的)
data = data.replace('[>]','>',regex = True)
25.合并多個表格到一個sheet中,startcol從第幾列開始拼接镊叁,startrow第幾行開始拼接
df1 = data()
df2 = data()
df3 = data()
writer = pd.ExcelWriter('匯總表.xlsx')
df1.to_excel(writer)
df2.to_excel(writer,startcol = 4)
df1.to_excel(writer,startcol = 8,startrow = 4)
writer.save()
26.對日期的處理
time1['起始日'] = pd.to_datatime(time1['起始日'],format = "%Y-%m -%d") #轉換為日期形式
time1['起始日'] += datatime.timedelta(days = 1) #日期天數(shù)加1
rightTime = pd.datetime.now() - dateutil.relativedelta.relativedalta(months=6) #從今天開始往前推半年
27.數(shù)據表檢查
df.shape #數(shù)據維度(行列):Excel中可以通過CTRL+向下的光標鍵尘颓,和CTRL+向右的光標鍵
df.info() #數(shù)據表信息:數(shù)據維度、列名稱晦譬、數(shù)據格式和所占空間等信息
df.dtypes #查看數(shù)據表各列格式
df['B'].dtype #查看單列格式
df.values #查看數(shù)據表的值
df['name'].unique #查看name中的唯一值
df.columns #查看列名稱
df.head() #查看前10行
df.tail() #查看最后10行
28.數(shù)據清洗
df['city']=df['city'].map(str.strip) #清除city字段中的字符空格
df['city']=df['city'].str.lower() #city列大小寫轉換
df['price'].astype('int') #更改數(shù)據格式
df.rename(columns = {"name":"姓名","id":"序號"}疤苹,inplace = True) #更改列名稱
df['city'].drop_duplicates() #刪除后出現(xiàn)的重復值
df['city'].drop_duplicates(keep='last') #刪除先出現(xiàn)的重復值
df['city'].replace('sh', 'shanghai') #數(shù)據替換
df = df.sort_values(by="日期",ascending=True) #按某列的值排序
df['a'] = df['mean'].round(decimals=2) #保留兩位小數(shù)
df['a'] = df['mean'].map(lambda x:("%.2f)%x) #不為數(shù)值型字段
df['a'] = df['mean'].map(lambda x:format(x,".2%))
df['a'] = df['mean'].apply(lambda x:round(x,2))
29.數(shù)據預處理
df_inner=pd.merge(df,df1,how='inner') #數(shù)據表匹配合并,合并的方式還有l(wèi)eft,right和outer方式
df_inner.set_index('id') #設置索引列
df_inner.sort_values(by=['age']) #按特定列的值排序
df_inner.sort_index() #按索引列排序
df_inner['group'] = np.where(df_inner['price'] > 3000,'high','low
') #使用Where函數(shù)用來對數(shù)據進行判斷和分組
df_inner.loc[(df_inner['city'] == 'beijing') & (df_inner['price']
>= 4000), 'sign']=1 #對復合多個條件的數(shù)據進行分組標記
30.數(shù)據提取
df_inner.loc[3] #按索引提取單行的數(shù)值
df_inner.loc[0:5] #按索引提取區(qū)域行數(shù)值
df_inner.reset_index() #重設索引
df_inner=df_inner.set_index('date') #設置日期為索引
31.數(shù)據篩選
#使用“與”條件進行篩選
df_inner.loc[(df_inner['age'] > 25) & (df_inner['city'] == 'beiji
ng'), ['id','city','age','category','gender']]
#使用“或”條件篩選
df_inner.loc[(df_inner['age'] > 25) | (df_inner['city'] == 'beiji
ng'), ['id','city','age','category','gender']].sort(['age'])
#使用“非”條件進行篩選
df_inner.loc[(df_inner['city'] != 'beijing'), ['id','city','age',
'category','gender']].sort(['id'])
#對篩選后的數(shù)據按city列進行計數(shù)
df_inner.loc[(df_inner['city'] != 'beijing'), ['id','city','age',
'category','gender']].sort(['id']).city.count()
#使用query函數(shù)進行篩選
df_inner.query('city == ["beijing", "shanghai"]')
#對篩選后的結果按price進行求和
df_inner.query('city == ["beijing", "shanghai"]').price.sum()
32.數(shù)據匯總
df_inner.groupby('city').count() #對所有列進行計數(shù)匯總
df_inner.groupby('city')['id'].count() #對特定的ID列進行計數(shù)匯總
df_inner.groupby(['city','size'])['id'].count()#對兩個字段進行匯總計數(shù)
#對city字段進行匯總并計算price的合計和均值敛腌。
df_inner.groupby('city')['price'].agg([len,np.sum, np.mean])
33.Python中通過pivot_table函數(shù)實現(xiàn)數(shù)據透視
#設定city為行字段卧土,size為列字段惫皱,price為值字段。分別計算price的數(shù)量和金額并且按行與列進行匯總尤莺。
pd.pivot_table(df_inner,index=["city"],values=["price"],columns=[
"size"],aggfunc=[len,np.sum],fill_value=0,margins=True)
34.數(shù)據采樣
df_inner.sample(n=3) #簡單的數(shù)據采樣
weights = [0, 0, 0, 0, 0.5, 0.5] #手動設置采樣權重
df_inner.sample(n=2, weights=weights)
df_inner.sample(n=6, replace=False) #采樣后不放回
df_inner.sample(n=6, replace=True) #采樣后放回
df_inner.describe().round(2).T #數(shù)據表描述性統(tǒng)計
df_inner['price'].corr(df_inner['m-point']) #相關性分析
df_inner.corr() #數(shù)據表相關性分析
35.時間與今天對比
把數(shù)據表中給出的時間與今天對比旅敷,劃分一個時間范圍用0和1來判斷取值
today = pd.datetime.now()
def dfApply01(d):
d = datetime.datetime.strptime(d,"%Y-%m-%d") #
theDay = today + dateutil.relativedelta.relativedelta(days=7)
#time1['起始日'] += datatime.timedelta(days = 1)
if d <=theDay and d>today:
return 1
else:
return 0
dataAll["bool"] = dataAll["最遲時間"].apply(dfApply01)
36.遍歷每一個值來讀取
def mainStar(data):
report = data["日期"]
seccode = data["代碼"]
secname = data["簡稱"]
year = data["年度"]
year01 = int(year)+1
if report == "年報":
message = f'根據提交的{report}披露時間,{seccode}{secname}需要在{year01}發(fā)送報告'
else:
message = " "
return message
df["notice"] = df.apply(mainStar,axis=1)
37.引用模塊
import sys, os
base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) #os.path.dirname比文件夾個數(shù)多1
sys.path.append(base_path)
from test3 import c
38.數(shù)值設置千分位、保留小數(shù)點
data["數(shù)量"] = data["數(shù)量"].map(lambda x:format(x,",")) #設置千分位
data["數(shù)量"] = data["數(shù)量"].round(decimals = 2)
data["數(shù)量"] = data["數(shù)量"].map(lambda x:("%.2f")%x)
data["數(shù)量"] = data["數(shù)量"].map(lambda x:format(x,".2%"))
data["數(shù)量"] = data["數(shù)量"].apply(lambda x:round(x,2))
39.合并兩個字典
from collections import ChainMap
context = dict(ChainMap(data_dic,pic_dic))
40.取出A與B之間的數(shù)據
data["max"] = data["tag"].apply(lambda x:(x.partition("A")[2].partition("B"))[0][:])
41.讀取json
import json
with open("1.json","r",encoding = "utf-8") as f:
content = json.load(f)
print(content)
42.創(chuàng)建excel表格
import numpy as np
import pandas as pd
import openpyxl
data = np.random.randint(10,40,size=(20,4))
columns = ["a","b","c","d"]
df = pd.DataFrame(data = data,columns = columns)
df.to_excel("df.xlsx",encoding="utf-8-sig",index=None)
print(df)
43.忽略警告錯誤信息
import warnings
warnings.filterwarnings('ignore')