1 怎么多进程处理
比如,对于一个 pd.DataFrame 数据,需要对其中的每一行进行一个 gen_new_df 操作,然后将所有的结果拼接起来。
import pandas as pd
import multiprocessing as mp
import numpy as np
# test_df: the DataFrame you want to process
# pro_count: number of worker processes
# Split by row position instead of handing the DataFrame itself to
# np.array_split, which relies on the deprecated DataFrame.swapaxes.
# The resulting chunks are the same contiguous, near-equal row groups.
row_groups = np.array_split(np.arange(len(test_df)), pro_count)
test_df_split = [test_df.iloc[rows] for rows in row_groups]
# The context manager shuts the pool down even if gen_new_df raises in a
# worker; pool.map blocks, so every result is collected before exit.
with mp.Pool(pro_count) as pool:
    df = pd.concat(pool.map(gen_new_df, test_df_split))
2 实例
2.1 生成测试数据:
import pandas as pd
from faker import Faker
from datetime import datetime
# Number of synthetic rows to generate.
n = 10000
fake = Faker()

# One random person name per row.
name_data = [fake.name() for _row in range(n)]

# One formatted wall-clock timestamp per row, taken after all names exist.
timestamp = [f"{datetime.now():%Y-%m-%d %H:%M:%S}" for _row in range(n)]

# Pair the two lists row by row into a two-column table.
df = pd.DataFrame(list(zip(name_data, timestamp)), columns=["Name", "Time"])

# Persist the table as CSV without the index column.
df.to_csv("test.csv", index=False)
2.2 定义处理函数:
比如我们这里,让 1 行变成 5 行
def gen_new_df(input_df: pd.DataFrame, *, copies: int = 5) -> pd.DataFrame:
    """Expand every row of *input_df* into ``copies`` output rows.

    Each output row repeats the source row's ``Name`` and ``Time`` values,
    gets a freshly generated random UUID string in ``uuid``, and stores the
    copy number (``"0"`` .. ``str(copies - 1)``) as a string in ``index``.

    Args:
        input_df: Frame providing ``Name`` and ``Time`` columns; any other
            columns are ignored.
        copies: Number of output rows produced per input row (default 5).

    Returns:
        A new DataFrame with columns ``Name``, ``Time``, ``uuid``, ``index``
        and a fresh 0-based integer index. An empty input yields an empty
        frame that still carries those four columns.

    Raises:
        KeyError: If ``input_df`` lacks a ``Name`` or ``Time`` column.
    """
    # Imported locally because the file never imports ``uuid`` at module level.
    import uuid

    # Collect plain dicts and build the frame once: concatenating onto a
    # growing DataFrame inside the loop copies all earlier rows every time.
    rows = [
        {
            "Name": name,
            "Time": stamp,
            "uuid": str(uuid.uuid4()),
            "index": str(j),
        }
        for name, stamp in zip(input_df["Name"], input_df["Time"])
        for j in range(copies)
    ]
    return pd.DataFrame(rows, columns=['Name', 'Time', 'uuid', 'index'])
2.3 多进程处理
可以先查看我们的 CPU 核数:
import multiprocessing as mp
# Ask multiprocessing how many CPUs the system reports, then print that number.
num_cores = mp.cpu_count()
print(num_cores)
用不同的进程数量来处理:
if __name__ == '__main__':
    # Benchmark gen_new_df on test.csv: once in-process, then with a pool of
    # each size listed below, printing the elapsed seconds for every run.

    # Imported locally because the file never imports ``time`` at module level.
    import time

    # Pool sizes to benchmark.
    test_pro_count = [1, 2, 4, 7, 8, 9, 10, 15, 20, 25, 30]
    test_df = pd.read_csv("test.csv")
    time_con_list = []

    # Baseline: process the whole frame in this process, without a pool.
    # perf_counter is a monotonic clock, so it suits interval measurement.
    begin_time = time.perf_counter()
    gen_new_df(test_df)
    time_con = time.perf_counter() - begin_time
    print("不使用多進(jìn)程: ", "---", time_con)

    for pro_count in test_pro_count:
        begin_time = time.perf_counter()
        # Split by row position instead of handing the DataFrame itself to
        # np.array_split, which relies on the deprecated DataFrame.swapaxes.
        row_groups = np.array_split(np.arange(len(test_df)), pro_count)
        test_df_split = [test_df.iloc[rows] for rows in row_groups]
        # The context manager shuts the pool down even if a worker raises;
        # pool.map blocks, so every chunk result is collected before exit.
        with mp.Pool(pro_count) as pool:
            df = pd.concat(pool.map(gen_new_df, test_df_split))
        # Measured after the pool is torn down, so startup and shutdown
        # overhead count toward each run.
        time_con = time.perf_counter() - begin_time
        time_con_list.append(time_con)
        print("進(jìn)程數(shù)量:", pro_count, "消耗時(shí)間: ", time_con)
    print(time_con_list)
3 效率分析
进程数量 --- 消耗时间(进程数量为 0 表示不使用多进程)
0 --- 27.70097780227661
1 --- 28.131190061569214
2 --- 9.749696016311646
4 --- 4.519064903259277
7 --- 2.705703020095825
8 --- 2.702981948852539
9 --- 3.6096010208129883
10 --- 2.7417500019073486
15 --- 4.223258018493652
20 --- 3.8286550045013428
25 --- 5.2411949634552
30 --- 6.33648681640625