# A short Python script that reads PDF documents and writes the results to a CSV file.
import pdfplumber
import pandas as pd
import os
import csv
# Usage: create a folder named data_pdf in the working directory, put the PDF files in it, then run this script.
# Substrings searched for on each page; a hit on any one of them counts as a match.
KEYWORDS = ('a', 'b', 'c', 'd', 'e', 'f')


def _is_pdf(name):
    """Return True when the file name carries a ``.pdf`` extension (any case).

    ``os.path.splitext`` looks only at the final extension, so names with
    several dots (``report.v2.pdf``) are accepted, ``archive.pdf.bak`` is
    rejected, and names without a dot are rejected instead of raising
    ``IndexError``.
    """
    return os.path.splitext(name)[1].lower() == '.pdf'


def _page_has_keyword(page, keywords):
    """Return True when the text extracted from *page* contains any keyword.

    ``extract_text()`` may yield ``None`` for a page with no text; that is
    treated as empty text so the scan moves on to the next page instead of
    failing the whole file.
    """
    text = page.extract_text() or ''
    return any(keyword in text for keyword in keywords)


def scan_pdf_folder(folder="./data_pdf/", csv_path='pdf_read.csv', keywords=KEYWORDS):
    """Scan every PDF under *folder* for keywords and append results to a CSV.

    The folder is walked recursively. For each PDF the pages are checked in
    order and scanning stops at the first page containing a keyword.

    Rows appended to *csv_path*:
        ``[name, 1, page_number, 'T']`` -- first page (1-based) with a keyword.
        ``[name, 'F']`` -- opening or reading the PDF raised an exception.
        No row is written when the PDF was read but no page matched.

    Args:
        folder: Directory to walk; defaults to ``./data_pdf/``.
        csv_path: CSV file opened in append mode; defaults to ``pdf_read.csv``.
        keywords: Iterable of substrings to look for on each page.
    """
    with open(csv_path, 'a+', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        for root, _dirs, files in os.walk(folder):
            # The progress counter is relative to all files in the directory,
            # not only the PDFs.
            total = len(files)
            for file_index, name in enumerate(files):
                if not _is_pdf(name):
                    continue
                path = os.path.join(root, name)
                try:
                    with pdfplumber.open(path) as pdf:
                        for page_index, page in enumerate(pdf.pages):
                            if _page_has_keyword(page, keywords):
                                print("{}/{},{}在第{}頁存在關(guān)鍵詞,處理完畢".format(file_index + 1, total, name, page_index + 1))
                                # The constant 1 keeps the original row layout
                                # (presumably a match count -- confirm with the CSV consumer).
                                writer.writerow([name, 1, page_index + 1, 'T'])
                                break
                            print("{}/{},{}的第{}頁正在處理".format(file_index + 1, total, name, page_index + 1))
                except Exception:
                    # Best effort: record the failure and carry on with the next
                    # file. KeyboardInterrupt/SystemExit are deliberately not
                    # caught so the script can still be stopped.
                    writer.writerow([name, 'F'])


# Run the scan with the default folder and output file whenever this file is
# executed, as the original flat script did.
scan_pdf_folder()