學習Caffe第一件想干的事就是中文OCR。因為雖然數字OCR與英文OCR已經非常成熟(特別是印刷體的),但是如果說到中文OCR,那能搜索到的信息也是寥寥。但不斷聽說有人用Caffe做了不錯的中文OCR識別,因此,內心也是癢癢。這篇文章,想先用Caffe作一個簡單的中文OCR的測試。這裡,將百家姓作為測試對象,並且只看黑體。希望可以先一窺Caffe識別的整體流程。
0. 訓練集與測試集的準備
0.1 準備字符集
首先準備百家姓字符集:我在維基百科中,選擇了2007年排名前一百的姓氏:
將它存儲在family_name.txt中:王李張劉陳楊黃趙吳周徐孫馬朱胡郭何高林羅鄭梁謝宋唐許韓馮鄧曹彭曾蕭田董袁潘于蔣蔡余杜葉程蘇魏呂丁任沈姚盧姜崔鍾譚陸汪范金石廖賈夏韋傅方白鄒孟熊秦邱江尹薛閆段雷侯龍史陶黎賀顧毛郝龔邵萬錢嚴覃武戴莫孔向湯
一共100個字。
首先在我的圖片庫下創建文件夾family_name_db,然后在family_name_db下分別創建db,train_set和test_set三個文件夾
E:\Pictures\ImageDataBase\family_name_db>tree
E:.
├─db
├─test_set
└─train_set
0.2 轉換成圖片
然后,調用一段python代碼察郁,來生成最原始的圖片集:
import os
import pygame
import Image
import StringIO
from os.path import join
def paste_word(word, dir_name, font_size, canvas_size):
    '''Render one character as a PNG under dir_name/<word>/.

    The image file name is the current count of files already in the
    character's sub-directory, so repeated runs append new samples
    (0.png, 1.png, ...) instead of overwriting.
    '''
    # NOTE(review): pygame.init() and the font load run once per character;
    # harmless, but they could be hoisted out by the caller.
    pygame.init()
    font = pygame.font.Font(join("./fonts", "simhei.ttf"), font_size)
    # Nothing to do when the target database directory is missing.
    if not os.path.isdir(dir_name):
        return
    sub_dir = join(dir_name, word)
    if not os.path.isdir(sub_dir):
        os.mkdir(sub_dir)
    img_name = join(sub_dir, str(len(os.listdir(sub_dir))) + ".png")
    paste(word, font, img_name, canvas_size, (0,0))
def paste(text, font, imgName, canvas_size, area = (3, 3)):
    '''Render `text` with `font` onto a white square canvas and save it to imgName.

    The pygame glyph surface is serialised into an in-memory buffer so that
    PIL can re-open it as an image and paste it at offset `area`.
    '''
    canvas = Image.new("RGB", (canvas_size, canvas_size), (255, 255, 255))
    glyph = font.render(text, True, (0, 0, 0), (255, 255, 255))
    buf = StringIO.StringIO()
    pygame.image.save(glyph, buf)
    buf.seek(0)
    rendered = Image.open(buf)
    canvas.paste(rendered, area)
    canvas.save(imgName)
if __name__ == "__main__":
file_path = raw_input('Please input the chinese characters text file: ')
while not os.path.isfile(file_path):
print file_path, ' not exist'
file_path = raw_input('Please re-input the chinese characters text file: ')
dir_path = raw_input('Please input the database dir: ')
while not os.path.isdir(dir_path):
print dir_path, ' not exist'
dir_path = raw_input('Please re-input the database dir: ')
f = open(file_path, 'r')
words = f.read()
words = words.decode('utf-8')
for w in words:
paste_word(w, dir_path, 48, 48)
由于我決定最終將文字都歸一化到40x40大小。因此,我實際上運行了這個程序三次,第一次最後一行為:
paste_word(w, dir_path, 40, 40)
第二次為
paste_word(w, dir_path, 44, 44)
第三次為
paste_word(w, dir_path, 48, 48)
如此,在每個字符對應的文件夾中就得到了這樣的三張圖片:
0.3 生成訓練集與測試集
接下來,利用上述生成的每個字三個樣本,利用亞像素精度的位移,生成更多樣本的訓練集和測試集:
#!/usr/bin/python
# -*- coding:utf-8 -*-
import os
import shutil
import filecmp
from ctypes import *
from os.path import join
def dir_copytree(src, dst):
    '''Recursively mirror `src` into `dst`, copying only new or changed files.

    Directory entry names are decoded from GBK so that Chinese folder names
    survive the walk.  # NOTE(review): Python-2/Windows-specific behaviour.
    '''
    entries = [raw.decode('gbk') for raw in os.listdir(src)]
    # Create the destination folder on first visit.
    if not os.path.exists(dst):
        os.mkdir(dst)
    for entry in entries:
        src_path = os.path.join(src, entry)
        dst_path = os.path.join(dst, entry)
        if os.path.isdir(src_path):
            # Recurse into sub-directories.
            dir_copytree(src_path, dst_path)
        elif not os.path.exists(dst_path) or not filecmp.cmp(src_path, dst_path):
            # Copy when the file is missing or its content differs;
            # filecmp.cmp only runs when the target already exists.
            shutil.copy2(src_path, dst)
def sub_pixel_move(from_dir, to_dir, orients, steps):
    '''Generate sub-pixel-shifted copies of every PNG sample.

    For each class sub-directory of `from_dir`, every contained .png is fed
    to the native gg_universal_ocr DLL once per value in `steps`, writing
    the synthesised samples into the matching sub-directory of `to_dir`.
    `orients` is any subset of "up"/"down"/"left"/"right".
    '''
    libgg = windll.LoadLibrary(join(r"D:\project\UniversalOCR\UniversalOCR_alg_dev\build-x64\bin\Debug",
                                    "gg_universal_ocr.dll"))
    # Fold the requested directions into the bit mask the DLL expects.
    bit_of = {"up": 1, "down": 2, "left": 4, "right": 8}
    i_orient = 0
    for o in orients:
        i_orient |= bit_of.get(o, 0)
    prev_cwd = os.getcwd()
    os.chdir(from_dir)
    for class_dir in [d for d in os.listdir('.') if os.path.isdir(d)]:
        samples = [join(class_dir, name) for name in os.listdir(class_dir)
                   if os.path.splitext(name)[-1] == ".png"]
        out_dir = join(to_dir, class_dir)
        if not os.path.isdir(out_dir):
            os.mkdir(out_dir)
        for sample in samples:
            for step in steps:
                # n_in=2 samples requested per call; the n_out pointer is unused.
                libgg.ggGenMoreSamples(os.path.abspath(sample), out_dir,
                                       c_double(step), i_orient, 2, None)
    os.chdir(prev_cwd)
if __name__ == "__main__":
image_data_base = raw_input('Image data base dir: ')
while not os.path.isdir(image_data_base):
print(image_data_base, ' not exist')
image_data_base = raw_input('Please re-input image data base dir: ')
train_set_dir = raw_input('Train set dir: ')
if not os.path.isdir(train_set_dir):
os.mkdir(train_set_dir)
test_set_dir = raw_input('Test set dir:')
if not os.path.isdir(test_set_dir):
os.mkdir(test_set_dir)
print 'Use sub pixel move to generate more images'
train_sub_pixel_step = [0.1, 0.3, 0.5, 0.7, 0.9]
test_sub_pixel_step = [0.2, 0.4, 0.6, 0.8]
print 'image data base is: ', image_data_base
print 'train set dir: ', train_set_dir
print 'train sub-pixel step up and left: ', train_sub_pixel_step
print 'test sub-pixel step up and left: ', test_sub_pixel_step
print 'test set dir: ', test_set_dir
print 'Copying images from image data base to train set directory...'
dir_copytree(image_data_base, train_set_dir)
print 'Done.'
print 'Train set images generating: sub-pixel move upward and left...'
sub_pixel_move(image_data_base, train_set_dir, ["up", "left"], train_sub_pixel_step)
print 'Done.'
print 'Test set images generating: sub-pixel move upward and left...'
sub_pixel_move(image_data_base, test_set_dir, ["up", "left"], test_sub_pixel_step)
print 'Done.'
這里的關鍵代碼
ret_status = libgg.ggGenMoreSamples(os.path.abspath(p), dst_dir, c_double(s), i_orient, 2, None)
我是調用的我自己寫的另外一個C++程序。
/** 通過一個樣本,生成多個樣本
@param[in] in_img 輸入圖像
@param[in] out_dir 輸出目錄
@param[in] subpixel_step 亞像素步長,應當小于1
@param[in] orient 方向
@param[in] n_in 需要多少個樣本
@param[out] n_out 實際生成了多少個樣本
*/
GGAPI(int) ggGenMoreSamples(const char* in_img, const char* out_dir, double subpixel_step, int orient, int n_in, int* n_out);
至此,train_set
中每個樣本文件夾包含了33個對象,test_set
中每個樣本文件夾包含了24個對象。
1. 圖像數(shù)據(jù)轉換為db文件
參考:https://www.cnblogs.com/denny402/p/5082341.html
通過上述步驟,已經獲得了訓練集和測試集,它們是.bmp格式的圖像,而且大小是不一致的。而在caffe中經常使用的數據類型是lmdb或leveldb。那么如何從原始的.bmp格式的文件轉換到db格式文件呢?
打開caffe解決方案,可以看到在tools下,有一個工程,叫convert_imageset。使用這個工具,可以將圖片文件轉換成caffe框架中可以直接使用的db文件。在convert_imageset.cpp中,開頭的注釋是這麼寫的:
// This program converts a set of images to a lmdb/leveldb by storing them
// as Datum proto buffers.
// Usage:
// convert_imageset [FLAGS] ROOTFOLDER/ LISTFILE DB_NAME
//
// where ROOTFOLDER is the root folder that holds all the images, and LISTFILE
// should be a list of files as well as their labels, in the format as
// subfolder1/file1.JPEG 7
// ....
需要四個參數(shù):
-
FLAGS: 參數(shù)
-
-gray
: 是否以灰度圖的方式打開圖片。程序調用opencv庫中的imread()函數來打開圖片,默認為false -
-shuffle
: 是否隨機打亂圖片順序。默認為false -
-backend
: 需要轉換成的db文件格式儒老,可選為leveldb或lmdb,默認為lmdb -
-resize_width/resize_height
: 改變圖片的大小。在運行中,要求所有圖片的尺寸一致,因此需要改變圖片大小。 程序調用opencv庫的resize()函數來對圖片放大縮小,默認為0,不改變 -
-check_size
: 檢查所有的數據是否有相同的尺寸。默認為false,不檢查 -
-encoded
: 是否將原圖片編碼放入最終的數(shù)據(jù)中,默認為false -
-encode_type
: 與前一個參數(shù)對應练湿,將圖片編碼為哪一個格式:‘png','jpg'......
-
- ROOTFOLDER/: 圖片存放的絕對路徑
- LISTFILE: 圖片文件列表清單猴仑,一般為一個txt文件,一行一張圖片
- DB_NAME: 最終生成的db文件存放目錄
現(xiàn)在首先來生成LISTFILE肥哎。注意辽俗,標簽要從0開始!我一開始沒注意篡诽,將標簽設置為從1開始崖飘,結果訓練的時候出現(xiàn)問題
import os
from os.path import join, isdir
def gen_listfile(dir):
    '''Write <dir>/listfile.txt mapping each image path to a numeric label.

    Every immediate sub-directory of `dir` is one class; labels are assigned
    in os.listdir order, starting from 0 (caffe requires 0-based labels).
    Each output line is "<subdir>/<file> <label>".
    '''
    prev_cwd = os.getcwd()
    os.chdir(dir)
    class_dirs = [entry for entry in os.listdir('.') if isdir(entry)]
    with open(join(dir, 'listfile.txt'), 'w') as out:
        for class_id, class_dir in enumerate(class_dirs):
            for fname in os.listdir(class_dir):
                out.write('%s %d\n' % (join(class_dir, fname), class_id))
    os.chdir(prev_cwd)
if __name__ == "__main__":
    root_dir = raw_input('image root dir: ')
    # BUG FIX: the re-prompt result was discarded, so an invalid first
    # answer looped forever; assign it back to root_dir.
    while not isdir(root_dir):
        root_dir = raw_input('not exist, re-input please: ')
    gen_listfile(root_dir)
生成后的列表為:
丁\0left0_20.bmp 0
丁\0left0_40.bmp 0
丁\0left0_60.bmp 0
丁\0left0_80.bmp 0
丁\0up0_20.bmp 0
丁\0up0_40.bmp 0
丁\0up0_60.bmp 0
丁\0up0_80.bmp 0
丁\1left0_20.bmp 0
丁\1left0_40.bmp 0
丁\1left0_60.bmp 0
丁\1left0_80.bmp 0
丁\1up0_20.bmp 0
丁\1up0_40.bmp 0
丁\1up0_60.bmp 0
丁\1up0_80.bmp 0
丁\2left0_20.bmp 0
丁\2left0_40.bmp 0
丁\2left0_60.bmp 0
丁\2left0_80.bmp 0
丁\2up0_20.bmp 0
丁\2up0_40.bmp 0
丁\2up0_60.bmp 0
丁\2up0_80.bmp 0
萬\0left0_20.bmp 1
萬\0left0_40.bmp 1
萬\0left0_60.bmp 1
萬\0left0_80.bmp 1
萬\0up0_20.bmp 1
萬\0up0_40.bmp 1
......
然后用以下的命令來生成數(shù)據(jù)庫
F:\OpenSource\caffe\build\tools\Release\convert_imageset --shuffle ^
--gray --resize_height=40 --resize_width=40 ^
E:\Pictures\ImageDataBase\family_name_db\train_set\ ^
E:\Pictures\ImageDataBase\family_name_db\train_set\listfile.txt ^
E:\Pictures\ImageDataBase\family_name_db\train_set\fn_train_lmdb
F:\OpenSource\caffe\build\tools\Release\convert_imageset --shuffle ^
--gray --resize_height=40 --resize_width=40 ^
E:\Pictures\ImageDataBase\family_name_db\test_set\ ^
test_set\listfile.txt test_set\fn_test_lmdb
此時,即會生成fn_train_lmdb
文件夾和fn_test_lmdb
文件夾
2. 創(chuàng)建模型并測試
我借用了MNIST例子中的lenet模型。並且在caffe\examples目錄下創建了family_name文件夾來存放各種模型的配置文件。並且將fn_train_lmdb
文件夾和fn_test_lmdb
文件夾也都拷貝到這個文件夾下:
F:\OPENSOURCE\CAFFE\EXAMPLES\FAMILY_NAME
│ lenet_solver.prototxt
│ lenet_train_test.prototxt
│ train_lenet.sh
│
├─fn_test_lmdb
│ data.mdb
│ lock.mdb
│
└─fn_train_lmdb
data.mdb
lock.mdb
lenet_solver.prototxt設置了訓練需要的參數(shù):
# The train/test net protocol buffer definition
net: "examples/family_name/lenet_train_test.prototxt"
# test_iter specifies how many forward passes the test should carry out.
# In the case of family_name, we have test batch size 100 and 24 test iterations,
# covering the full 2,400 testing images.
test_iter: 24
# Carry out testing every 100 training iterations.
test_interval: 100
# The base learning rate, momentum and the weight decay of the network.
base_lr: 0.01
momentum: 0.9
weight_decay: 0.0005
# The learning rate policy
lr_policy: "inv"
gamma: 0.0001
power: 0.75
# Display every 100 iterations
display: 100
# The maximum number of iterations
max_iter: 10000
# snapshot intermediate results
snapshot: 5000
snapshot_prefix: "examples/family_name/lenet"
# solver mode: CPU or GPU
solver_mode: GPU
lenet_train_test.prototxt定義了網(wǎng)絡結構:
name: "LeNet"
layer {
name: "family_name"
type: "Data"
top: "data"
top: "label"
include {
phase: TRAIN
}
transform_param {
scale: 0.00390625
}
data_param {
source: "examples/family_name/fn_train_lmdb"
batch_size: 33
backend: LMDB
}
}
layer {
name: "family_name"
type: "Data"
top: "data"
top: "label"
include {
phase: TEST
}
transform_param {
scale: 0.00390625
}
data_param {
source: "examples/family_name/fn_test_lmdb"
batch_size: 100
backend: LMDB
}
}
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 20
kernel_size: 5
stride: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "pool1"
type: "Pooling"
bottom: "conv1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "conv2"
type: "Convolution"
bottom: "pool1"
top: "conv2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 50
kernel_size: 5
stride: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "pool2"
type: "Pooling"
bottom: "conv2"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "ip1"
type: "InnerProduct"
bottom: "pool2"
top: "ip1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 1000
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "relu1"
type: "ReLU"
bottom: "ip1"
top: "ip1"
}
layer {
name: "ip2"
type: "InnerProduct"
bottom: "ip1"
top: "ip2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 100
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "accuracy"
type: "Accuracy"
bottom: "ip2"
bottom: "label"
top: "accuracy"
include {
phase: TEST
}
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "ip2"
bottom: "label"
top: "loss"
}
train_lenet.sh為運行訓練的bash命令腳本
#!/usr/bin/env sh
set -e
./build/tools/Release/caffe train --solver=examples/family_name/lenet_solver.prototxt $@
在caffe的主目錄下用cygwin運行這個腳本。
zhongc@zhongc-PC /cygdrive/f/OpenSource/caffe
$ ./examples/family_name/train_lenet.sh
最終生成的結果為:
I0418 08:11:10.970157 7868 solver.cpp:398] Test net output #0: accuracy = 1
I0418 08:11:10.970157 7868 solver.cpp:398] Test net output #1: loss = 0.000447432 (* 1 = 0.000447432 loss)
I0418 08:11:10.970157 7868 solver.cpp:316] Optimization Done.
I0418 08:11:10.970157 7868 caffe.cpp:260] Optimization Done.
說明測試集的正確率為100%。由於篇幅原因,暫且罷筆。下一篇文章將嘗試使用C++ API識別單個圖片,也就是再次探討如何將自己訓練的結果應用起來。