為了不在某個平臺開通會員、充錢,我借鑒了 github 某位大神的代碼 https://github.com/python-fan/pdf2word ,成功實現 pdf 轉換為 word,各位可以參考借鑒下。
1.在使用該代碼時,需要安裝對應的第三方庫包:pdf轉換word(即提取pdf文字寫入到word中)需要兩個庫包:pdfminer3k和python-docx;需要提取pdf中的圖片時,需要用pymupdf這個庫進行提取操作。
2.實現代碼如下:
import os
import re
import sys
import time
from concurrent.futures import ProcessPoolExecutor
from configparser import ConfigParser
from io import StringIO
from io import open

import fitz
from docx import Document
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import process_pdf
# Image extraction
def pdf2pic(path, pic_path):
    """Extract the embedded images of a PDF and save each one as a PNG file.

    Every object in the document's cross-reference table is inspected; an
    object whose definition contains both ``/Type /XObject`` and
    ``/Subtype /Image`` is turned into a pixmap and written to ``pic_path``.
    File names are derived from ``path`` with path separators flattened to
    ``_`` and colons removed, followed by ``_img<N>.png``.

    :param path: path of the PDF file to read
    :param pic_path: directory the PNG files are written to; it is created
        when it does not exist yet
    """
    # Look-ahead patterns that recognise an image XObject definition.
    # Compiled once instead of being re-parsed for every object.
    xobject_pattern = re.compile(r"/Type(?= */XObject)")
    image_pattern = re.compile(r"/Subtype(?= */Image)")

    # Flatten the PDF path into a single file-name component. '/' must be
    # replaced as well: a stem that still contains a separator would make
    # os.path.join() ignore or bypass pic_path.
    name_stem = path.replace('\\', '_').replace('/', '_').replace(':', '')

    if pic_path:
        os.makedirs(pic_path, exist_ok=True)

    doc = fitz.open(path)
    try:
        # PyMuPDF renamed its camelCase methods to snake_case; use whichever
        # spelling the installed version provides.
        xref_length = doc.xref_length if hasattr(doc, "xref_length") else doc.xrefLength
        xref_object = doc.xref_object if hasattr(doc, "xref_object") else doc.xrefObject

        total_xrefs = xref_length()
        # Print basic information about the PDF.
        print("文件名:{}, 頁數: {}, 對象: {}".format(path, len(doc), total_xrefs - 1))

        image_count = 0
        # xref 0 is skipped, as in the original loop.
        for xref in range(1, total_xrefs):
            definition = xref_object(xref)
            is_image = (xobject_pattern.search(definition)
                        and image_pattern.search(definition))
            if not is_image:
                continue

            image_count += 1
            pix = fitz.Pixmap(doc, xref)
            target = os.path.join(
                pic_path, "{}_img{}.png".format(name_stem, image_count))

            # PNG can only hold gray or RGB data. Four or more colour
            # components (alpha excluded) means CMYK, which has to be
            # converted to RGB first.
            if pix.n - pix.alpha >= 4:
                pix = fitz.Pixmap(fitz.csRGB, pix)

            save_png = pix.save if hasattr(pix, "save") else pix.writePNG
            save_png(target)

            # Drop the pixmap right away so its buffer can be released.
            pix = None
            # Progress message after each saved image.
            print("提取了{}張圖片".format(image_count))
    finally:
        doc.close()
# Read the text of a PDF file
def read_from_pdf(file_path):
    """Return the text that pdfminer extracts from the PDF at ``file_path``.

    :param file_path: path of the PDF file to read
    :return: the extracted text as a single string
    """
    resource_manager = PDFResourceManager()
    with open(file_path, 'rb') as pdf_file, StringIO() as text_buffer:
        device = TextConverter(
            resource_manager, text_buffer, laparams=LAParams())
        try:
            process_pdf(resource_manager, device, pdf_file)
        finally:
            # Close the converter even when parsing fails.
            device.close()
        # Read the buffer before the ``with`` block closes it.
        return text_buffer.getvalue()
# Save text as a Word file
def save_text_to_word(content, file_path):
    """Write ``content`` to a .docx file, one paragraph per input line.

    :param content: text to store; it is split on ``'\\n'``
    :param file_path: path of the Word file to create
    """
    doc = Document()
    # One paragraph per line of the source text.
    for line in content.split('\n'):
        paragraph = doc.add_paragraph()
        # Control characters are stripped from each line before it is added.
        paragraph.add_run(remove_control_characters(line))
    doc.save(file_path)
# Strip control characters
# Translation table mapping code points 0-31 to None (i.e. "delete").
# Built once at import time instead of on every call.
_CONTROL_CHARACTER_TABLE = dict.fromkeys(range(32))


def remove_control_characters(content):
    """Return ``content`` with every character below U+0020 removed.

    This covers all code points 0-31, so tabs, line feeds and carriage
    returns are removed as well. U+007F and above are left untouched.

    :param content: the string to clean
    :return: the cleaned string
    """
    return content.translate(_CONTROL_CHARACTER_TABLE)
# Convert one PDF by chaining read_from_pdf() and save_text_to_word()
def pdf_to_word(pdf_file_path, word_file_path):
    """Extract the text of one PDF and store it in a Word file.

    :param pdf_file_path: path of the PDF file to read
    :param word_file_path: path of the .docx file to write
    """
    content = read_from_pdf(pdf_file_path)
    save_text_to_word(content, word_file_path)
# Main routine: read the configuration file and convert PDFs in parallel
def main():
    """Convert every PDF in the configured folder to a Word file.

    Reads ``config.cfg`` from the current working directory and uses three
    keys of its ``default`` section:

    * ``max_worker`` - number of worker processes
    * ``pdf_folder`` - folder that is scanned for ``.pdf`` files
    * ``word_folder`` - folder the ``.docx`` files are written to

    Each PDF is handed to ``pdf_to_word`` in a process pool. A conversion
    that raised is reported on stdout instead of being ignored. The function
    always ends the process with exit status 0, as the original did.

    :raises KeyError: if the ``default`` section or one of its keys is missing
    """
    # Read the configuration.
    config_parser = ConfigParser()
    config_parser.read('config.cfg')
    config = config_parser['default']

    # Submit one task per PDF so several files are converted at the same time.
    tasks = []
    with ProcessPoolExecutor(max_workers=int(config['max_worker'])) as executor:
        for file in os.listdir(config['pdf_folder']):
            file_name, extension_name = os.path.splitext(file)
            # Accept '.PDF' as well as '.pdf'.
            if extension_name.lower() != '.pdf':
                continue
            pdf_file = os.path.join(config['pdf_folder'], file)
            word_file = os.path.join(config['word_folder'], file_name + '.docx')
            print('正在處理: ', file)
            tasks.append((file, executor.submit(pdf_to_word, pdf_file, word_file)))

    # Leaving the ``with`` block waits for every submitted task, so no
    # polling loop is needed; all futures are finished at this point.
    for file, task in tasks:
        error = task.exception()
        if error is not None:
            print('處理失敗: ', file, error)

    print('完成')
    sys.exit(0)
3.執行代碼運行
if __name__ == '__main__':
    fpath = os.getcwd()
    # Text-to-Word alternative:
    # pdf_to_word(os.path.join(fpath, 'pdf', 'test.pdf'), os.path.join(fpath, 'word', 'test.docx'))
    # Extract the images of ./pdf/test.pdf into ./word; os.path.join keeps
    # the paths valid on every platform.
    pdf2pic(os.path.join(fpath, 'pdf', 'test.pdf'), os.path.join(fpath, 'word'))