0x01
在用python寫excel時贷币,發(fā)現(xiàn)寫10000行數(shù)據(jù)昔脯,每行50列時笑陈, 最快的excel庫也要10秒快集,而在我的項目中用的xlsxwriter耗時10多秒贡羔,測試代碼如下。這對于一個web服務(wù)來說碍讨,耗時實在是太長了治力。
測試
import sys
from time import clock
import openpyxl
import pyexcelerate
import xlsxwriter
import xlwt
from openpyxl.utils import get_column_letter
# Default to 1000 rows x 50 cols.
if len(sys.argv) > 1:
row_max = int(sys.argv[1])
col_max = 50
else:
row_max = 10000
col_max = 50
if len(sys.argv) > 2:
col_max = int(sys.argv[2])
def print_elapsed_time(module_name, elapsed):
""" Print module run times in a consistent format. """
print(" %-22s: %6.2f" % (module_name, elapsed))
def time_xlsxwriter():
""" Run XlsxWriter in default mode. """
start_time = clock()
workbook = xlsxwriter.Workbook('xlsxwriter.xlsx')
worksheet = workbook.add_worksheet()
for row in range(row_max // 2):
for col in range(col_max):
worksheet.write_string(row * 2, col, "Row: %d Col: %d" % (row, col))
for col in range(col_max):
worksheet.write_number(row * 2 + 1, col, row + col)
workbook.close()
elapsed = clock() - start_time
print_elapsed_time('xlsxwriter', elapsed)
def time_xlsxwriter_optimised():
""" Run XlsxWriter in optimised/constant memory mode. """
start_time = clock()
workbook = xlsxwriter.Workbook('xlsxwriter_opt.xlsx',
{'constant_memory': True})
worksheet = workbook.add_worksheet()
for row in range(row_max // 2):
for col in range(col_max):
worksheet.write_string(row * 2, col, "Row: %d Col: %d" % (row, col))
for col in range(col_max):
worksheet.write_number(row * 2 + 1, col, row + col)
workbook.close()
elapsed = clock() - start_time
print_elapsed_time('xlsxwriter (optimised)', elapsed)
def time_openpyxl():
""" Run OpenPyXL in default mode. """
start_time = clock()
workbook = openpyxl.workbook.Workbook()
worksheet = workbook.active
for row in range(row_max // 2):
for col in range(col_max):
colletter = get_column_letter(col + 1)
worksheet.cell('%s%s' % (colletter, row * 2 + 1)).value = "Row: %d Col: %d" % (row, col)
for col in range(col_max):
colletter = get_column_letter(col + 1)
worksheet.cell('%s%s' % (colletter, row * 2 + 2)).value = row + col
workbook.save('openpyxl.xlsx')
elapsed = clock() - start_time
print_elapsed_time('openpyxl', elapsed)
def time_openpyxl_optimised():
""" Run OpenPyXL in optimised mode. """
start_time = clock()
workbook = openpyxl.workbook.Workbook()
worksheet = workbook.create_sheet()
for row in range(row_max // 2):
string_data = ["Row: %d Col: %d" % (row, col) for col in range(col_max)]
worksheet.append(string_data)
num_data = [row + col for col in range(col_max)]
worksheet.append(num_data)
workbook.save('openpyxl_opt.xlsx')
elapsed = clock() - start_time
print_elapsed_time('openpyxl (optimised)', elapsed)
def time_pyexcelerate():
""" Run pyexcelerate in "faster" mode. """
start_time = clock()
workbook = pyexcelerate.Workbook()
worksheet = workbook.new_sheet('Sheet1')
for row in range(row_max // 2):
for col in range(col_max):
worksheet.set_cell_value(row * 2 + 1, col + 1, "Row: %d Col: %d" % (row, col))
for col in range(col_max):
worksheet.set_cell_value(row * 2 + 2, col + 1, row + col)
workbook.save('pyexcelerate.xlsx')
elapsed = clock() - start_time
print_elapsed_time('pyexcelerate', elapsed)
def time_xlwt():
""" Run xlwt in default mode. """
start_time = clock()
workbook = xlwt.Workbook()
worksheet = workbook.add_sheet('Sheet1')
for row in range(row_max // 2):
for col in range(col_max):
worksheet.write(row * 2, col, "Row: %d Col: %d" % (row, col))
for col in range(col_max):
worksheet.write(row * 2 + 1, col, row + col)
workbook.save('xlwt.xls')
elapsed = clock() - start_time
print_elapsed_time('xlwt', elapsed)
print("")
print("Versions:")
print(" %-12s: %s" % ('python', sys.version[:5]))
print(" %-12s: %s" % ('openpyxl', openpyxl.__version__))
print(" %-12s: %s" % ('pyexcelerate', pyexcelerate.__version__))
print(" %-12s: %s" % ('xlsxwriter', xlsxwriter.__version__))
print(" %-12s: %s" % ('xlwt', xlwt.__VERSION__))
print("")
print("Dimensions:")
print(" Rows = %d" % row_max)
print(" Cols = %d" % col_max)
print("")
print("Times:")
time_pyexcelerate()
time_xlwt()
time_xlsxwriter_optimised()
time_xlsxwriter()
time_openpyxl_optimised()
time_openpyxl()
print("")
0x02
為了提高python的效率,很自然的就想到了c勃黍, 查詢相關(guān)資料后宵统,發(fā)現(xiàn)寫c代碼的方式有幾種。
- 第一種, 直接利用ctype 調(diào)用動態(tài)鏈接庫
from ctypes import windll # 首先導(dǎo)入 ctypes 模塊的 windll 子模塊
somelibc = windll.LoadLibrary(some.dll) # 使用 windll 模塊的 LoadLibrary 導(dǎo)入動態(tài)鏈接庫
第二種 利用ctypes來寫搭起c與python的橋梁
第三種直接用c來封裝c代碼马澈,并生成動態(tài)鏈接庫瓢省。
#include <Python.h>
#include <string.h>
/* module functions */
static PyObject * /* returns object */
message(PyObject *self, PyObject *args) /* self unused in modules */
{ /* args from Python call */
char *fromPython, result[64];
if (! PyArg_Parse(args, "(s)", &fromPython)) /* convert Python -> C */
return NULL; /* null=raise exception */
else {
strcpy(result, "Hello, "); /* build up C string */
strcat(result, fromPython); /* add passed Python string */
return Py_BuildValue("s", result); /* convert C -> Python */
}
}
/* registration table */
static struct PyMethodDef hello_methods[] = {
{"message", message, 1}, /* method name, C func ptr, always-tuple */
{NULL, NULL} /* end of table marker */
};
/* module initializer */
void inithello( ) /* called on first import */
{ /* name matters if loaded dynamically */
(void) Py_InitModule3("hello", hello_methods); /* mod name, table ptr */
}
這種方式代碼效率最高,缺點是與py版本不兼容
- 第四種痊班,利用cython生成c代碼勤婚,這種方式是最先進(jìn)的,也是最推薦的涤伐。
0x03
用cython可以參考cython的官網(wǎng)馒胆。
首先寫.pxd文件,類似于c語言的.h頭文件凝果,定義函數(shù)簽名等
這里我直接調(diào)用了c的excel庫libxlsxwriter祝迂,并且安裝libxlsxwriter到系統(tǒng)路徑中去了。c中的函數(shù)簽名直接copy到.pxd文件中就可以了器净,需要注意的是型雳,如果c定于的是一個struct,如lxw_error山害,那么在.pxd中直接寫上pass就好了纠俭,cython在生成代碼的時候會自動幫我們找到這個struct。
#cexcel.pyx
cdef extern from "xlsxwriter/format.h":
ctypedef struct lxw_error:
pass
cdef extern from "xlsxwriter/common.h":
ctypedef int lxw_col_t
ctypedef int lxw_row_t
ctypedef struct lxw_format:
pass
cdef extern from "xlsxwriter/worksheet.h":
ctypedef struct lxw_worksheet:
pass
lxw_error worksheet_write_string(lxw_worksheet *worksheet,
lxw_row_t row,
lxw_col_t col,
const char *string,
lxw_format *cformat);
lxw_error worksheet_write_number(lxw_worksheet *worksheet,
lxw_row_t row,
lxw_col_t col,
double number,
lxw_format *cformat);
lxw_error worksheet_set_column(lxw_worksheet *worksheet,
lxw_col_t first_col,
lxw_col_t last_col,
double width, lxw_format *format);
cdef extern from "xlsxwriter/workbook.h":
ctypedef struct lxw_workbook:
pass
ctypedef struct lxw_workbookoptions:
pass
lxw_workbook *new_workbook(const char *filename);
lxw_worksheet *workbook_add_worksheet(lxw_workbook *workbook,
const char *sheetname);
lxw_error workbook_close(lxw_workbook *workbook);
cdef extern from "xlsxwriter/custom.h":
lxw_format *get_my_style(lxw_workbook *workbook, int name)
定義好.pxd文件后浪慌,下面就開始寫我們的代碼邏輯了冤荆,定義在.pyx文件中代碼如下,其中cexcel就是之前我們定義的cexcel.pxd文件权纤,在WorkBook類中匙赞,如果需要使用一個c變量, 那么我們需要用cdef語句先聲明這個變量的類型妖碉。
#excel.pyx
cimport cexcel
cdef class WorkBook:
cdef cexcel.lxw_workbook *_c_workbook
cdef cexcel.lxw_worksheet *_c_worksheet
cdef cexcel.lxw_format *_c_header
cdef cexcel.lxw_format *_c_str
cdef cexcel.lxw_format *_c_num
cdef cexcel.lxw_format *cformat
def __cinit__(self, const char *filename):
self._c_workbook = cexcel.new_workbook(filename)
self._c_header = cexcel.get_my_style(self._c_workbook, 0)
self._c_str = cexcel.get_my_style(self._c_workbook, 1)
self._c_num = cexcel.get_my_style(self._c_workbook, 2)
def add_worksheet(self, const char *sheetname):
self._c_worksheet = cexcel.workbook_add_worksheet(self._c_workbook, sheetname)
return self
def write_header(self, cexcel.lxw_row_t row, cexcel.lxw_col_t col, const char *string):
cexcel.worksheet_write_string(self._c_worksheet, row, col, string, self._c_header)
def write_string(self, cexcel.lxw_row_t row, cexcel.lxw_col_t col, const char *string):
cexcel.worksheet_write_string(self._c_worksheet, row, col, string, self._c_str)
def write_number(self, cexcel.lxw_row_t row, cexcel.lxw_col_t col, number):
if number in {'--', ''}:
cexcel.worksheet_write_string(self._c_worksheet, row, col, number, self._c_num)
else:
cexcel.worksheet_write_number(self._c_worksheet, row, col, number, self._c_num)
def write_percent(self, cexcel.lxw_row_t row, cexcel.lxw_col_t col, const char *string):
cexcel.worksheet_write_string(self._c_worksheet, row, col, string, self._c_num)
def close(self):
cexcel.workbook_close(self._c_workbook)
def set_column(self, cexcel.lxw_col_t first_col, cexcel.lxw_col_t last_col, double width):
cexcel.worksheet_set_column(self._c_worksheet, first_col, last_col, width, NULL)
0x04
寫完模塊的業(yè)務(wù)邏輯之后涌庭,我們只需要編寫setup.py文件,利用disutls把我們的cython模塊安裝到系統(tǒng)路徑或者虛擬環(huán)境中
代碼如下
from Cython.Build import cythonize
from setuptools import setup, find_packages, Extension
ext_modules = cythonize([
Extension("py_c_xlsxwriter", ["excel.pyx"],
libraries=["xlsxwriter"], include_dirs=['/home/linl/Desktop/py_c_xlsxwriter/libxlsxwriter/lib'])])
setup(
name = "cpexcel",
version = '0.0.4',
keywords = 'c xlsxwriter cython',
license = 'MIT License',
url = 'https://github.com/drinksober',
install_requires = ['Cython'],
author = 'drinksober',
author_email = 'drinksober@foxmail.com',
packages = find_packages(),
platforms = 'any',
ext_modules = cythonize(ext_modules)
)
然后執(zhí)行python setup.py install欧宜,一個完成的cython模塊就完成了坐榆。