在進行內存優化之前,可以使用如下函數對進程使用的內存進行統計。
import os

import numpy as np
import pandas as pd
import psutil
def cpu_stats():
    """Return the memory used by the current process as a short string.

    Despite its name, this helper reports memory, not CPU usage.

    Returns:
        str: ``'memory GB:'`` followed by the resident set size of the
        current process divided by 2**30, rounded to two decimals.
    """
    pid = os.getpid()
    py = psutil.Process(pid)
    # rss is the first field of the tuple returned by memory_info().
    memory_use = py.memory_info().rss / 2. ** 30
    # The built-in round() is enough for one scalar and avoids needing numpy.
    return 'memory GB:' + str(round(memory_use, 2))
# Overall memory usage of the DataFrame
df.info(memory_usage="deep")
# Memory usage of each column
df.memory_usage()
對于應用程序,可以使用filprofiler函數查看內存峰值。
https://pythonspeed.com/fil/docs/
Pandas內存優化
分批讀取
如果數據文件非常大,可以在讀取時分批次讀取,通過設置chunksize來控制批大小。
# With chunksize set, read_csv returns an iterator that yields DataFrames of
# up to 1,000,000 rows each, so `df` here is not a single DataFrame.
df = pd.read_csv(path, chunksize=1000000)
for chunk in df:
    # Process the data one batch at a time.
    pass
選擇讀取部分列
# Read only column "a" from the file instead of every column.
df = pd.read_csv(path, usecols=["a"])
提前設置列類型
# Declare column "a" as int8 while reading, instead of converting it afterwards.
df = pd.read_csv(path, dtype={"a":"int8"})
將類別列設為category類型
# Convert column 'a' to the pandas category dtype.
df['a'] = df['a'].astype('category')
自動識別類型并進行轉換
def _smallest_int_dtype(lo, hi):
    """Return the narrowest numpy integer type holding [lo, hi], or None."""
    if lo >= 0:
        for candidate in (np.uint8, np.uint16, np.uint32, np.uint64):
            # Strict bound: a value equal to a type's maximum goes to the
            # next wider type.
            if hi < np.iinfo(candidate).max:
                return candidate
        return None
    for candidate in (np.int8, np.int16, np.int32, np.int64):
        info = np.iinfo(candidate)
        if lo > info.min and hi < info.max:
            return candidate
    return None


def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` to smaller dtypes in place.

    Only columns backed by a numpy integer, unsigned or float dtype are
    touched; object/string, bool, datetime, categorical and pandas
    extension-dtype columns are left as they are. For each processed column:

    * missing values are replaced by ``column.min() - 1`` and the column
      name is recorded (a column that is entirely missing is left unfilled
      and is not recorded);
    * if every value is a whole number, the column is cast to the narrowest
      integer type whose range contains the values, including the fill
      value;
    * otherwise a float column wider than 32 bits is cast to ``float32``.

    Progress and before/after memory figures are printed to stdout.

    Args:
        props: pandas DataFrame to shrink. It is modified in place.

    Returns:
        tuple: ``(props, NAlist)`` where ``props`` is the same DataFrame
        object and ``NAlist`` is the list of column names whose missing
        values were filled.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = []  # Keeps track of columns that have missing values filled in.
    for col in props.columns:
        dtype = props[col].dtype
        # Skip anything that is not a plain numpy int/uint/float column.
        if not (isinstance(dtype, np.dtype) and dtype.kind in "iuf"):
            continue

        # Print current column type
        print("******************************")
        print("Column: ",col)
        print("dtype before: ",dtype)

        mx = props[col].max()
        mn = props[col].min()

        # Integer dtypes cannot hold NA, so NA is replaced by a sentinel one
        # below the minimum. mn is updated as well so the dtype chosen below
        # also covers the sentinel (otherwise -1 could be cast to unsigned).
        if props[col].isna().any() and not np.isnan(mn):
            NAlist.append(col)
            mn = mn - 1
            # Assign back instead of a chained inplace fill, which does not
            # modify the frame under pandas Copy-on-Write.
            props[col] = props[col].fillna(mn)

        # A float column is integer-valued only if every element is finite
        # and equal to its truncation; checking element-wise prevents
        # positive and negative fractions from cancelling out.
        if dtype.kind == "f":
            values = props[col].to_numpy()
            IsInt = bool(
                np.isfinite(values).all()
                and (values == np.trunc(values)).all()
            )
        else:
            IsInt = True

        if IsInt:
            # Compare as Python scalars so the bounds checks are exact.
            lo = mn.item() if hasattr(mn, "item") else mn
            hi = mx.item() if hasattr(mx, "item") else mx
            target = _smallest_int_dtype(lo, hi)
            # No fitting integer type: leave the column unchanged.
            if target is not None:
                props[col] = props[col].astype(target)
        elif dtype.itemsize > 4:
            # Make float datatypes 32 bit; narrower floats are not widened.
            props[col] = props[col].astype(np.float32)

        # Print new column type
        print("dtype after: ",props[col].dtype)
        print("******************************")

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist
# Load the properties dataset and shrink its column dtypes.
props = pd.read_csv(r"../input/properties_2016.csv")
props, NAlist = reduce_mem_usage(props)

# Emit the warning banner in one call (one argument per output line), then
# list the columns whose missing values were filled.
print(
    "_________________",
    "",
    "Warning: the following columns have missing values filled with 'df['column_name'].min() -1': ",
    "_________________",
    "",
    sep="\n",
)
print(NAlist)
模型內存優化