- [纯python和pandas速度比较]
- [一百万数据做测试, 六个统计量]
- [或许多计算几个不是放到list里再计算一遍会更快点? 算了, 反正纯python太慢都用pandas了, 不再测试了]
- [那么多几个统计量呢? 比如20个? 30个? 会有追平的情况么?]
- [是只有基本类型会这样还是所有的统计都是pandas快?]
- [尝试使用纯python多次循环, 单次循环与pandas对比, go 单次与多次循环对比]
- [一百万数据做测试, 六个统计量]
纯python和pandas速度比较
有个数据分析的任务, 本来打算用pandas做, 后来想的是要遍历一个大列表(十万左右, 4000组)几十遍
使用pandas会不会因为一直在遍历导致速度比较慢
尝试使用纯python做, 争取一次遍历把所有的统计项计算出来
所以先试试看速度会有多少差距
先说结论: 不要用纯python来写数据分析, 使用pandas会快得多
一百万数据做测试, 六个统计量
python:
from pandas import DataFrame
import time
import random
def create_test_data(nums=1000*1000, path="data.txt"):
    """Write ``nums`` random integers to ``path``, one per line.

    Each value comes from ``random.randint(1, 1000)``, so both 1 and 1000
    can occur. An existing file at ``path`` is overwritten.

    Args:
        nums: Number of lines to generate; ``0`` produces an empty file.
        path: Destination file. Defaults to ``"data.txt"`` so existing
            callers keep writing to the same place.
    """
    with open(path, "w", encoding="utf-8") as f:
        # Hand the whole stream to writelines() instead of issuing one
        # write() call per value.
        f.writelines(f"{random.randint(1, 1000)}\n" for _ in range(nums))
if __name__ == "__main__":
    # Benchmark: split the integers from ./data.txt into six threshold
    # buckets, once with a single pure-Python pass and once with pandas
    # boolean masks. Each approach prints three bucket sizes followed by its
    # elapsed time in seconds.
    with open("./data.txt", encoding="utf-8") as f:
        # Parse the file once, outside both timed sections. Blank lines are
        # skipped so a stray empty line does not raise ValueError.
        call_times_py = [int(line) for line in f if line.strip()]
    call_times_pd = DataFrame({"call_times": call_times_py})

    # Pure Python: one traversal feeds all six lists.
    # perf_counter() is used because it is monotonic and high resolution;
    # time.time() can jump when the system clock is adjusted.
    st = time.perf_counter()
    lte_200_py = []
    lt_500_py = []
    eq_600_py = []
    xx = []
    yy = []
    zz = []
    for i in call_times_py:
        # Independent checks (not elif): one value may land in several lists.
        if i <= 200:
            lte_200_py.append(i)
        if i < 500:
            lt_500_py.append(i)
        if i == 600:
            eq_600_py.append(i)
        if i <= 100:
            xx.append(i)
        if i <= 300:
            yy.append(i)
        if i > 800:
            zz.append(i)
    print(len(lte_200_py), len(lt_500_py), len(eq_600_py))
    print(time.perf_counter() - st)

    # pandas: one boolean-mask selection per bucket.
    stt = time.perf_counter()
    lte_200_pd = call_times_pd[call_times_pd.call_times <= 200]
    lt_500_pd = call_times_pd[call_times_pd.call_times < 500]
    eq_600_pd = call_times_pd[call_times_pd.call_times == 600]
    xxx = call_times_pd[call_times_pd.call_times <= 100]
    yyy = call_times_pd[call_times_pd.call_times <= 300]
    zzz = call_times_pd[call_times_pd.call_times > 800]
    print(len(lte_200_pd), len(lt_500_pd), len(eq_600_pd))
    print(time.perf_counter() - stt)
最后测试发现不使用pandas会比使用pandas慢一个数量级
为什么用list而不是直接用int, 因为需求中会要求获取次数, 平均值
例如, 小于200的总次数, 平均值, 中位数等等
或许多计算几个不是放到list里再计算一遍会更快点? 算了, 反正纯python太慢都用pandas了, 不再测试了
那么多几个统计量呢? 比如20个? 30个? 会有追平的情况么?
测试了10, 20, 30, 50, 100, 200, 500, 都是纯python慢很多
时间消耗大概都是7倍左右, 没有缩短的迹象
有可能是因为耗时的操作不是循环, 而是列表操作
将操作中的列表, 改为数字, 会少一些, 但是耗时比还是不会变动, 一直是5~6倍
def run_py_code():
    """Sketch of the counter-based variant of the pure-Python benchmark.

    The six list appends of the list-based version are replaced by integer
    counters. Every update is ``+= 0``, so all six counters stay at 0.

    NOTE(review): ``i`` is never assigned in this function or in
    ``inner_def``, so as written the lookup falls through to module scope.
    Presumably this was lifted from inside a ``for i in call_times_py:``
    loop -- confirm against the full benchmark.
    NOTE(review): the source lost its indentation; the nesting below is the
    reviewer's reading and should be confirmed.
    """
    xxx, yyy, zzz, lt_500_py_int, lte_200_py_int, eq_600_py_int = 0, 0, 0, 0, 0, 0

    def inner_def():
        # Rebind the enclosing function's counters rather than creating locals.
        nonlocal xxx, yyy, zzz, lt_500_py_int, lte_200_py_int, eq_600_py_int
        # Independent checks (not elif): several branches may run for one value.
        if i <= 200:
            # was: lte_200_py.append(i)
            lte_200_py_int += 0
        if i < 500:
            # was: lt_500_py.append(i)
            lt_500_py_int += 0
        if i == 600:
            # was: eq_600_py.append(i)
            eq_600_py_int += 0
        if i <= 100:
            # was: xx.append(i)
            xxx += 0
        if i <= 300:
            # was: yy.append(i)
            yyy += 0
        if i > 800:
            # was: zz.append(i)
            zzz += 0

    # Single invocation; nothing is returned.
    inner_def()
这个值和使用go语言来做(分别是单循环做全部统计, 每次循环统计一个量)差不太多
TODO 是只有基本类型会这样还是所有的统计都是pandas快?
暂时不做测试, 一般不会这么用, 需要的时候再说
尝试使用纯python多次循环, 单次循环与pandas对比, go 单次与多次循环对比
from pandas import DataFrame
import time
import random
def create_test_data(nums=1000*1000, path="data.txt"):
    """Write ``nums`` random integers to ``path``, one per line.

    Each value comes from ``random.randint(1, 1000)``, so both 1 and 1000
    can occur. An existing file at ``path`` is overwritten.

    Args:
        nums: Number of lines to generate; ``0`` produces an empty file.
        path: Destination file. Defaults to ``"data.txt"`` so existing
            callers keep writing to the same place.
    """
    with open(path, "w", encoding="utf-8") as f:
        # Hand the whole stream to writelines() instead of issuing one
        # write() call per value.
        f.writelines(f"{random.randint(1, 1000)}\n" for _ in range(nums))
def py_code2(call_times_py, times=10):
    """Time ``times`` rounds of six separate counting passes over the data.

    Each round walks ``call_times_py`` six times, once per threshold
    (``<= 200``, ``< 500``, ``== 600``, ``<= 100``, ``<= 300``, ``> 800``).
    Every element triggers one call to a small closure that bumps the
    matching counter. The counters are workload only and are not returned.

    Args:
        call_times_py: Integers to scan; traversed six times per round, so
            it must be re-iterable (e.g. a list).
        times: Number of rounds to run.

    Returns:
        Elapsed time in seconds as a float.
    """
    lte_200_py_int = 0
    lt_500_py_int = 0
    eq_600_py_int = 0
    xxx = 0
    yyy = 0
    zzz = 0

    # Each helper reads the current element (``value``) from the enclosing
    # scope, so the per-element cost is one argument-less call.
    def inner_200():
        nonlocal lte_200_py_int
        if value <= 200:
            lte_200_py_int += 1

    def inner_500():
        nonlocal lt_500_py_int
        if value < 500:
            lt_500_py_int += 1

    def inner_600():
        nonlocal eq_600_py_int
        if value == 600:
            eq_600_py_int += 1

    def inner_xxx():
        nonlocal xxx
        if value <= 100:
            xxx += 1

    def inner_yyy():
        nonlocal yyy
        if value <= 300:
            yyy += 1

    def inner_zzz():
        nonlocal zzz
        if value > 800:
            zzz += 1

    # perf_counter() is monotonic, so the measured interval cannot go
    # negative when the wall clock is adjusted.
    st = time.perf_counter()
    # The round counter is unused; it no longer shares a name with the
    # element variable as it did before.
    for _ in range(times):
        for value in call_times_py:
            inner_200()
        for value in call_times_py:
            inner_500()
        for value in call_times_py:
            inner_600()
        for value in call_times_py:
            inner_xxx()
        for value in call_times_py:
            inner_yyy()
        for value in call_times_py:
            inner_zzz()
    return time.perf_counter() - st
def py_code(call_times_py, times=10):
    """Time one pass over the data that applies six filters per element.

    For every element of ``call_times_py`` the six threshold checks
    (``<= 200``, ``< 500``, ``== 600``, ``<= 100``, ``<= 300``, ``> 800``)
    are repeated ``times`` times, appending matches to per-threshold lists.
    The lists are workload only and are not returned.

    NOTE(review): the source lost its indentation, so where the original
    reset the six lists is ambiguous. Here they are emptied after each
    element's repetitions, which keeps every list at no more than ``times``
    entries. Confirm against the original benchmark.

    Args:
        call_times_py: Iterable of integers to scan.
        times: How many times the six checks run per element.

    Returns:
        Elapsed time in seconds as a float.
    """
    lte_200_py = []
    lt_500_py = []
    eq_600_py = []
    xx = []
    yy = []
    zz = []
    buckets = (lte_200_py, lt_500_py, eq_600_py, xx, yy, zz)

    def inner_def():
        # Reads the current element (``value``) from the enclosing scope.
        # Independent checks (not elif): one value may match several lists.
        if value <= 200:
            lte_200_py.append(value)
        if value < 500:
            lt_500_py.append(value)
        if value == 600:
            eq_600_py.append(value)
        if value <= 100:
            xx.append(value)
        if value <= 300:
            yy.append(value)
        if value > 800:
            zz.append(value)

    # perf_counter() is monotonic, so the measured interval cannot go
    # negative when the wall clock is adjusted.
    st = time.perf_counter()
    for value in call_times_py:
        for _ in range(times):
            inner_def()
        # Empty in place; the closure keeps referring to the same lists.
        for bucket in buckets:
            bucket.clear()
    return time.perf_counter() - st
def pd_code(call_times_pd, times=10):
    """Time ``times`` rounds of six pandas boolean-mask selections.

    Each round filters ``call_times_pd`` on its ``call_times`` column with
    the thresholds ``<= 200``, ``< 500``, ``== 600``, ``<= 100``, ``<= 300``
    and ``> 800``. The filtered frames are discarded; building them is the
    workload being measured.

    Args:
        call_times_pd: DataFrame with a ``call_times`` column.
        times: Number of rounds to run.

    Returns:
        Elapsed time in seconds as a float.
    """
    def inner_def():
        lte_200_pd = call_times_pd[call_times_pd.call_times <= 200]
        lt_500_pd = call_times_pd[call_times_pd.call_times < 500]
        eq_600_pd = call_times_pd[call_times_pd.call_times == 600]
        xxx = call_times_pd[call_times_pd.call_times <= 100]
        yyy = call_times_pd[call_times_pd.call_times <= 300]
        zzz = call_times_pd[call_times_pd.call_times > 800]

    # Start the clock after the helper is defined so only the selections are
    # timed. perf_counter() is monotonic, unlike time.time().
    stt = time.perf_counter()
    for _ in range(times):
        inner_def()
    return time.perf_counter() - stt
if __name__ == "__main__":
    # Driver: for each repetition count, run the three benchmarks three times
    # and print the pure-Python/pandas time ratio plus the raw timings.
    with open("./data.txt", encoding="utf-8") as f:
        # Parse the file once. Blank lines are skipped so a stray empty line
        # does not raise ValueError.
        call_times_py = [int(line) for line in f if line.strip()]
    # Build the frame from the parsed integers instead of building it from
    # raw strings and converting the column afterwards.
    call_times_pd = DataFrame({"call_times": call_times_py})

    for i in [10, 20, 30, 50, 100, 200, 500]:
        for j in range(1, 4):
            py_time = py_code(call_times_py, i)
            pd_time = pd_code(call_times_pd, i)
            py_time2 = py_code2(call_times_py, i)
            print(f"統(tǒng)計(jì)量:{i} 次數(shù):{j} 時(shí)間比: {py_time/pd_time} 詳細(xì)時(shí)長(zhǎng): py: {py_time} pd: {pd_time} py2: {py_time2}")
統(tǒng)計(jì)量:10 次數(shù):1 時(shí)間比: 8.446309077857356 詳細(xì)時(shí)長(zhǎng): py: 5.8574230670928955 pd: 0.6934890747070312 py2: 10.970935583114624
統(tǒng)計(jì)量:10 次數(shù):2 時(shí)間比: 8.19909197864528 詳細(xì)時(shí)長(zhǎng): py: 5.218891143798828 pd: 0.6365206241607666 py2: 9.700028657913208
統(tǒng)計(jì)量:10 次數(shù):3 時(shí)間比: 8.45852420498877 詳細(xì)時(shí)長(zhǎng): py: 5.458881855010986 pd: 0.6453704833984375 py2: 10.23711371421814
統(tǒng)計(jì)量:20 次數(shù):1 時(shí)間比: 7.564866761185951 詳細(xì)時(shí)長(zhǎng): py: 9.94292688369751 pd: 1.3143558502197266 py2: 19.783536195755005
統(tǒng)計(jì)量:20 次數(shù):2 時(shí)間比: 7.360162732624824 詳細(xì)時(shí)長(zhǎng): py: 9.758050680160522 pd: 1.3257927894592285 py2: 19.761258363723755
統(tǒng)計(jì)量:20 次數(shù):3 時(shí)間比: 7.508303230230394 詳細(xì)時(shí)長(zhǎng): py: 9.933343410491943 pd: 1.3229811191558838 py2: 19.530989408493042
統(tǒng)計(jì)量:30 次數(shù):1 時(shí)間比: 7.833986134979155 詳細(xì)時(shí)長(zhǎng): py: 14.523708581924438 pd: 1.853935956954956 py2: 32.46154570579529
統(tǒng)計(jì)量:30 次數(shù):2 時(shí)間比: 7.621607390748258 詳細(xì)時(shí)長(zhǎng): py: 14.912060499191284 pd: 1.9565505981445312 py2: 29.734895944595337
統(tǒng)計(jì)量:30 次數(shù):3 時(shí)間比: 7.275279705214964 詳細(xì)時(shí)長(zhǎng): py: 14.678386211395264 pd: 2.0175700187683105 py2: 30.317322492599487
統(tǒng)計(jì)量:50 次數(shù):1 時(shí)間比: 7.725853759509482 詳細(xì)時(shí)長(zhǎng): py: 23.88418436050415 pd: 3.0914621353149414 py2: 50.774309158325195
統(tǒng)計(jì)量:50 次數(shù):2 時(shí)間比: 7.254025451527843 詳細(xì)時(shí)長(zhǎng): py: 23.80484104156494 pd: 3.281604290008545 py2: 54.80711579322815
統(tǒng)計(jì)量:50 次數(shù):3 時(shí)間比: 8.11059675328286 詳細(xì)時(shí)長(zhǎng): py: 26.42996597290039 pd: 3.258695602416992 py2: 49.66933298110962
統(tǒng)計(jì)量:100 次數(shù):1 時(shí)間比: 7.395159931692905 詳細(xì)時(shí)長(zhǎng): py: 47.11621284484863 pd: 6.371222972869873 py2: 100.95593667030334
統(tǒng)計(jì)量:100 次數(shù):2 時(shí)間比: 7.285887516396374 詳細(xì)時(shí)長(zhǎng): py: 47.732889890670776 pd: 6.551417350769043 py2: 100.88967823982239
統(tǒng)計(jì)量:100 次數(shù):3 時(shí)間比: 7.5608031699271185 詳細(xì)時(shí)長(zhǎng): py: 48.61383056640625 pd: 6.429717779159546 py2: 108.31028723716736
統(tǒng)計(jì)量:200 次數(shù):1 時(shí)間比: 7.168724536572008 詳細(xì)時(shí)長(zhǎng): py: 94.84511423110962 pd: 13.230402946472168 py2: 203.78652048110962
統(tǒng)計(jì)量:200 次數(shù):2 時(shí)間比: 7.64953061578108 詳細(xì)時(shí)長(zhǎng): py: 101.12934279441833 pd: 13.220333099365234 py2: 208.26342511177063
統(tǒng)計(jì)量:200 次數(shù):3 時(shí)間比: 7.221915299871638 詳細(xì)時(shí)長(zhǎng): py: 94.99437260627747 pd: 13.153625965118408 py2: 208.55407118797302
統(tǒng)計(jì)量:500 次數(shù):1 時(shí)間比: 7.733304171085339 詳細(xì)時(shí)長(zhǎng): py: 249.97211408615112 pd: 32.32410216331482 py2: 507.01753211021423
統(tǒng)計(jì)量:500 次數(shù):2 時(shí)間比: 7.523341562433384 詳細(xì)時(shí)長(zhǎng): py: 245.09230089187622 pd: 32.577585220336914 py2: 521.6754240989685
統(tǒng)計(jì)量:500 次數(shù):3 時(shí)間比: 9.62083059863606 詳細(xì)時(shí)長(zhǎng): py: 243.66297483444214 pd: 25.32660484313965 py2: 448.2097132205963
package main
import (
"bufio"
"fmt"
"io"
"os"
"strconv"
"strings"
"time"
)
// once_run measures the single-pass strategy: in each of `times` rounds the
// data is traversed once and every element is tested against all six
// thresholds, with matches appended to per-threshold slices. The slices are
// workload only; the function returns the elapsed wall time in seconds.
func once_run(new_files *[1000000]int, times int) float64 {
	started := time.Now()
	for round := 0; round < times; round++ {
		// Every round starts from empty buckets.
		below200 := []int{}
		below500 := []int{}
		equal600 := []int{}
		below100 := []int{}
		below300 := []int{}
		above800 := []int{}
		for _, v := range new_files {
			// Independent checks: one value may match several thresholds.
			if v <= 200 {
				below200 = append(below200, v)
			}
			if v < 500 {
				below500 = append(below500, v)
			}
			if v == 600 {
				equal600 = append(equal600, v)
			}
			if v <= 100 {
				below100 = append(below100, v)
			}
			if v <= 300 {
				below300 = append(below300, v)
			}
			if v > 800 {
				above800 = append(above800, v)
			}
		}
	}
	return time.Since(started).Seconds()
}
// many_run measures the multi-pass strategy: in each of `times` rounds the
// data is traversed six times, once per threshold, with matches appended to
// that threshold's slice. The slices are workload only; the function returns
// the elapsed wall time in seconds.
func many_run(new_files *[1000000]int, times int) float64 {
	started := time.Now()
	for round := 0; round < times; round++ {
		// Every round starts from empty buckets.
		below200 := []int{}
		below500 := []int{}
		equal600 := []int{}
		below100 := []int{}
		below300 := []int{}
		above800 := []int{}

		for _, v := range new_files {
			if v <= 200 {
				below200 = append(below200, v)
			}
		}
		for _, v := range new_files {
			if v < 500 {
				below500 = append(below500, v)
			}
		}
		for _, v := range new_files {
			if v == 600 {
				equal600 = append(equal600, v)
			}
		}
		for _, v := range new_files {
			if v <= 100 {
				below100 = append(below100, v)
			}
		}
		for _, v := range new_files {
			if v <= 300 {
				below300 = append(below300, v)
			}
		}
		for _, v := range new_files {
			if v > 800 {
				above800 = append(above800, v)
			}
		}
	}
	return time.Since(started).Seconds()
}
// loadCallTimes reads one integer per line from path into dst and returns
// how many values were stored. Blank lines are skipped. It fails on an
// unreadable file, a line that is not an integer, or more values than dst
// can hold; slots beyond the returned count are left untouched.
func loadCallTimes(path string, dst *[1000000]int) (int, error) {
	file, err := os.Open(path)
	if err != nil {
		return 0, err
	}
	defer file.Close()

	reader := bufio.NewReader(file)
	count := 0
	lineNo := 0
	for {
		line, readErr := reader.ReadString('\n')
		if readErr != nil && readErr != io.EOF {
			// Any other read error is fatal; retrying would loop forever.
			return count, readErr
		}
		lineNo++
		// ReadString returns whatever it read together with io.EOF, so a
		// final line without a trailing newline is handled before leaving.
		if text := strings.TrimSpace(line); text != "" {
			if count >= len(dst) {
				return count, fmt.Errorf("%s: more than %d values", path, len(dst))
			}
			value, convErr := strconv.Atoi(text)
			if convErr != nil {
				return count, fmt.Errorf("%s: line %d: %v", path, lineNo, convErr)
			}
			dst[count] = value
			count++
		}
		if readErr == io.EOF {
			return count, nil
		}
	}
}

// main loads ./data.txt and, for each repetition count, runs the single-pass
// and multi-pass benchmarks four times, printing both timings in seconds.
func main() {
	filepath := "./data.txt"
	var new_files [1000000]int
	if _, err := loadCallTimes(filepath, &new_files); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}

	for _, i := range []int{10, 20, 30, 50, 100, 200, 500} {
		for j := 0; j < 4; j++ {
			time_used_once := once_run(&new_files, i)
			time_used_many := many_run(&new_files, i)
			fmt.Println(i, " ", j, ":--> ", "once", time_used_once, "many", time_used_many)
		}
	}
}
可以看到, golang的单次循环和多次循环的差距没有那么大, 所以除了特殊情况, 最主要的还是循环体里的内容
10 0 :–> once 0.270043332 many 0.363176675
10 1 :–> once 0.276643586 many 0.352971341
10 2 :–> once 0.25890732 many 0.35560765
10 3 :–> once 0.245769677 many 0.370233654
20 0 :–> once 0.589726051 many 0.769544436
20 1 :–> once 0.580975835 many 0.753842546
20 2 :–> once 0.600672759 many 0.721600799
20 3 :–> once 0.596183651 many 0.766575763
30 0 :–> once 0.882573551 many 1.113109061
30 1 :–> once 0.880121284 many 1.160065195
30 2 :–> once 0.867385259 many 1.133052454
30 3 :–> once 0.889980355 many 1.136251198
50 0 :–> once 1.427478098 many 1.850872163
50 1 :–> once 1.501920479 many 1.898363955
50 2 :–> once 1.502236307 many 1.8533934570000001
50 3 :–> once 1.5080970219999998 many 1.884553839
100 0 :–> once 2.934879984 many 3.7428952300000002
100 1 :–> once 2.949133964 many 3.768129714
100 2 :–> once 2.879401105 many 3.774616441
100 3 :–> once 2.908388572 many 3.75524067
200 0 :–> once 5.265939245 many 7.3770994420000005
200 1 :–> once 5.676770116 many 7.516143486
200 2 :–> once 5.835790768 many 7.420399405
200 3 :–> once 5.757892503 many 7.498774821
500 0 :–> once 13.946337923 many 18.661140954
500 1 :–> once 13.558824813 many 18.278247642
500 2 :–> once 13.671900874 many 18.228428328
500 3 :–> once 12.964145702 many 18.017974093