對于程序員來說，經常會遇到英文手冊。本文就是作者在工作過程中遇到英文的.chm格式手冊（界面如圖1所示）后，為了方便查詢手冊，利用python編程、調用網絡在線翻譯來翻譯文檔的記錄。具體的實現如下:
1.利用windows系統自帶的hh.exe文件還原.chm文件
進入cmd 使用語句 hh -decompile 目標文件夾 源CHM文件名
我使用的實例如圖2所示
還原后的文件如圖3所示
2.還原后的文件包括各種文件類型，而我們需要翻譯的是.htm文件類型。所以我們需要獲取還原后的文件夾中的所有.htm/.html文件，這里就會用到編程語言中的遞歸，具體代碼如下:
# Collect every HTML file below a directory.
def getHtmlFiles(dir):
    """Recursively append the paths of all .htm/.html files under *dir* to ``htmlFiles``.

    Files located directly in *dir* are recorded (and printed) first; the
    sub-directories are then visited in the order ``os.listdir`` returned them.

    Args:
        dir: Path of the directory to scan.  The name shadows the builtin but
            is kept because it is part of the function's signature.
    """
    global htmlFiles
    # os.path.join yields the same "dir\\name" paths on Windows and valid
    # paths elsewhere, unlike a hard-coded backslash.
    entries = [os.path.join(dir, name) for name in os.listdir(dir)]
    for entry in entries:
        # Test the real extension of regular files only: a plain substring
        # test would also accept directories and names such as "x.htm.bak",
        # which cannot be opened/translated later.
        if os.path.isfile(entry) and entry.lower().endswith((".htm", ".html")):
            htmlFiles.append(entry)
            print(entry)
    for entry in entries:
        if os.path.isdir(entry):
            getHtmlFiles(entry)
打印出的文件路徑如圖4所示
3.以上獲取到htm文件路徑后，我們就可以根據路徑讀取htm的文件內容。實際操作過程中，我將翻譯的結果替代htm的英文后，用瀏覽器打開會出現亂碼，因此先要刪除htm文件中關于語言的設置頭文件。具體代碼如下:
# Strip the header lines that make the translated page render as mojibake.
def deleteHtmlHead(index, path):
    """Remove language/charset declarations from the HTML file at *path*, in place.

    Every line containing one of the markers below is dropped as a whole; all
    other lines are written back unchanged.  The file is read and written in
    text mode without an explicit encoding, i.e. with the platform default.

    Args:
        index: Position of the file in the work list; only used for logging.
        path: Path of the HTML file to rewrite.
    """
    print('==1_刪除頭文件中的屬性解決中文亂碼Start==')
    print('****'+str(index)+'****'+str(path))
    markers = (
        '<meta http-equiv="Content-Language" content="en-us">',
        # Also covers the complete Content-Type <meta> tag, which contains
        # this fragment, so no separate test for the full tag is needed.
        'charset=windows-1252',
        'doctype HTML',
    )
    with open(path, "r") as html:
        kept = [line for line in html if not any(m in line for m in markers)]
    with open(path, "w") as html:
        html.writelines(kept)
    print('==1_刪除頭文件中的屬性解決中文亂碼END==')
4.原始的.htm文件打開后如圖5所示。
提取內容的文本可使用BeautifulSoup或者htmlparser，但使用過程中沒有得到理想的效果，因此我使用代碼獲取'>'與'<'之間的值來獲取。其實翻譯的準確性與否就在于能否準確的提取出.htm文件中的英文文本。在實際的編程過程中有許多特殊字符串情況需要處理，而本人做的不夠完美。
5.將獲取到的英文文本調用有道的翻譯接口進行翻譯
6.用翻譯的中文結果替換原文件中的英文完成翻譯
4-6過程的具體代碼如下:
# Fetch the translation of a piece of text from the Youdao web endpoint.
def translator(str):
    """Translate *str* by POSTing it to the Youdao translate URL.

    Args:
        str: Text to translate.  The name shadows the builtin but is kept
            because it is part of the function's signature.

    Returns:
        The ``translateResult`` value of the JSON reply (the callers in this
        file index it as ``result[i][j]['tgt']``), or None when the request
        fails, the HTTP status is not 200, the body is empty, the service
        reports that the IP is rate-limited, or the body is not a JSON object
        carrying ``translateResult``.
    """
    # API endpoint
    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=null'
    # Form fields sent to the server; 'i' carries the text to translate.
    key = {
        'type': "AUTO",
        'i': str,
        "doctype": "json",
        "version": "2.1",
        "keyfrom": "fanyi.web",
        "ue": "UTF-8",
        "action": "FY_BY_CLICKBUTTON",
        "typoResult": "true"
    }
    try:
        # Without a timeout a stalled connection would block the whole run.
        response = requests.post(url, data=key, timeout=30)
    except requests.RequestException:
        print("有道詞典調(diào)用失敗")
        return None
    if response.status_code != 200:
        print("有道詞典調(diào)用失敗")
        return None
    if not response.text:
        print("有道詞典None")
        return None
    # Match a short fragment of the rate-limit notice instead of the whole
    # sentence: the full literal contained stray characters and could never
    # match.
    # NOTE(review): confirm this fragment against a real rate-limit reply
    # (exact wording and character set).
    if '來自您ip的請求異常頻繁' in response.text:
        print("非常抱歉,來自您ip的請求異常頻繁")
        return None
    try:
        result = json.loads(response.text)
    except ValueError:
        # A non-JSON body (e.g. an HTML error page) is a failed call.
        print("有道詞典調(diào)用失敗")
        return None
    if not isinstance(result, dict):
        print("有道詞典調(diào)用失敗")
        return None
    return result.get('translateResult')
def getEnContentAndTrans(index,path):
    """Translate the text between tags of the HTML file at *path*, in place.

    The file is split at the first matching example marker ('Example (C)' or
    'Example</span> (C)'); only the part before the first marker is
    translated, everything from the marker onwards is written back verbatim.
    Inside the translated part every run of text between a '>' and the next
    '<' is sent to ``translator`` unless, after removing whitespace and
    '&nbsp;' entities, it is empty, purely numeric (dots ignored) or
    all-uppercase.

    Args:
        index: Position of the file in the work list; only used for logging.
        path: Path of the HTML file to translate.

    Raises:
        SystemExit: with status 1 when ``translator`` returns None; the file
            is left untouched in that case because it is only written at the
            end.
    """
    print('==2_處理文本分割Start==')
    print('****'+str(index)+'****'+str(path))
    with open(path,"r") as html:
        orgContent = html.read()
    if 'Example (C)' in orgContent:
        marker = 'Example (C)'
    elif 'Example</span> (C)' in orgContent:
        marker = 'Example</span> (C)'
    else:
        marker = None
    parts = orgContent.split(marker) if marker else [orgContent]
    print('==2_處理文本分割 END==')
    print('==3_截取英文字段并翻譯Start==')
    print('****'+str(index)+'****'+str(path))
    dealText = parts[0]
    pieces = []
    copied = 0  # end of the part of dealText already copied into pieces
    # '>' ... next '<' is exactly what the former character-by-character
    # scan extracted; segments never contain '<' but may contain '>'.
    for match in re.finditer(r">([^<]*)<", dealText):
        enText = match.group(1)
        checkText = re.sub(r"\s", "", enText)
        # Skip segments made only of &nbsp; entities.  The code used to
        # replace a plain space here, which is a no-op after the whitespace
        # strip above; the variable name shows the entity was intended.
        checkNbspText = checkText.replace('&nbsp;', '')
        # Skip numbers (dots ignored) and all-uppercase tokens.
        checkNum = checkNbspText.replace('.','')
        if not checkNbspText or checkNum.isdigit() or checkNum.isupper():
            if checkText:
                print('%%%%%%'+enText)
            continue
        transResult = translator(enText)
        if transResult is None:
            print("NONENONENONENONENONENONE")
            # Non-zero status: the run stopped because translation failed.
            sys.exit(1)
        ch = "".join((item.get('tgt') or '') for group in transResult for item in group)
        print(enText+'___'+ch)
        if 'CAN' in enText or 'Can' in enText or 'can' in enText:
            ch = 'CAN'+ch
        elif '手動RC系列30' in ch:
            ch = ch.replace('手動RC系列30','RC30系列手冊',1)
        # Substitute at the scanned position.  Replacing the first occurrence
        # of enText in the whole document could hit a tag, an attribute or an
        # earlier, already translated segment instead.
        pieces.append(dealText[copied:match.start(1)])
        pieces.append(ch)
        copied = match.end(1)
    pieces.append(dealText[copied:])
    chTransText = "".join(pieces)
    print('==3_截取英文字段并翻譯 END==')
    print('==4_將翻譯后的文件保存Start==')
    print('****'+str(index)+'****'+str(path))
    with open(path,"w") as html:
        # Ideographic full stop -> ASCII dot.  The search literal used to
        # carry stray characters in front of the full stop and never matched.
        html.write(chTransText.replace('。','.'))
        # Re-attach the untranslated example sections with their marker.
        for part in parts[1:]:
            html.write(marker)
            html.write(part)
    print('==4_將翻譯后的文件保存 END==')
翻譯后的結果如圖6所示
7.利用.chm生成器將所有翻譯后的文件打包成.chm文件
8.一些問題。有道接口一小時只能調用1000次，我的解決方案是電腦連接手機熱點，當ip被有道封鎖后，插拔sim卡，ip地址就不同了。整體的代碼(包括調試過程中的所有有用或無用的方法)如下:
import json
import requests
import os
from bs4 import BeautifulSoup
import re
import sys
# True until main() has scanned the root directory; main() sets it to False
# after its first call to getHtmlFiles().
isFirst = True
# Paths of the HTML files collected by getHtmlFiles(), in discovery order.
htmlFiles=[]
# Collect every HTML file below a directory.
def getHtmlFiles(dir):
    """Recursively append the paths of all .htm/.html files under *dir* to ``htmlFiles``.

    Files located directly in *dir* are recorded (and printed) first; the
    sub-directories are then visited in the order ``os.listdir`` returned them.

    Args:
        dir: Path of the directory to scan.  The name shadows the builtin but
            is kept because it is part of the function's signature.
    """
    global htmlFiles
    # os.path.join yields the same "dir\\name" paths on Windows and valid
    # paths elsewhere, unlike a hard-coded backslash.
    entries = [os.path.join(dir, name) for name in os.listdir(dir)]
    for entry in entries:
        # Test the real extension of regular files only: a plain substring
        # test would also accept directories and names such as "x.htm.bak",
        # which cannot be opened/translated later.
        if os.path.isfile(entry) and entry.lower().endswith((".htm", ".html")):
            htmlFiles.append(entry)
            print(entry)
    for entry in entries:
        if os.path.isdir(entry):
            getHtmlFiles(entry)
# Strip the header lines that make the translated page render as mojibake.
def deleteHtmlHead(index, path):
    """Remove language/charset declarations from the HTML file at *path*, in place.

    Every line containing one of the markers below is dropped as a whole; all
    other lines are written back unchanged.  The file is read and written in
    text mode without an explicit encoding, i.e. with the platform default.

    Args:
        index: Position of the file in the work list; only used for logging.
        path: Path of the HTML file to rewrite.
    """
    print('==1_刪除頭文件中的屬性解決中文亂碼Start==')
    print('****'+str(index)+'****'+str(path))
    markers = (
        '<meta http-equiv="Content-Language" content="en-us">',
        # Also covers the complete Content-Type <meta> tag, which contains
        # this fragment, so no separate test for the full tag is needed.
        'charset=windows-1252',
        'doctype HTML',
    )
    with open(path, "r") as html:
        kept = [line for line in html if not any(m in line for m in markers)]
    with open(path, "w") as html:
        html.writelines(kept)
    print('==1_刪除頭文件中的屬性解決中文亂碼END==')
# Extract the text of an HTML file.
def handHtmlContent(index,path):
    """Return the text of the HTML file at *path* as produced by BeautifulSoup.

    The file is read in text mode with the platform default encoding, parsed
    with the "lxml" parser, and the parsed document's ``.text`` is returned
    as a ``str``.

    Args:
        index: Position of the file in the work list; only used for logging.
        path: Path of the HTML file to read.
    """
    banner = '========================'
    print(banner)
    print('==2_處理HTML中的文本Start==')
    print(banner)
    print(str(index)+'___'+str(path))
    with open(path,"r") as html:
        parsed = BeautifulSoup(html.read(), "lxml")
        org = str(parsed.text)
    print(banner)
    print('==2_處理HTML中的文本 End==')
    print(banner)
    return org
# Fetch the translation of a piece of text from the Youdao web endpoint.
def translator(str):
    """Translate *str* by POSTing it to the Youdao translate URL.

    Args:
        str: Text to translate.  The name shadows the builtin but is kept
            because it is part of the function's signature.

    Returns:
        The ``translateResult`` value of the JSON reply (the callers in this
        file index it as ``result[i][j]['tgt']``), or None when the request
        fails, the HTTP status is not 200, the body is empty, the service
        reports that the IP is rate-limited, or the body is not a JSON object
        carrying ``translateResult``.
    """
    # API endpoint
    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=null'
    # Form fields sent to the server; 'i' carries the text to translate.
    key = {
        'type': "AUTO",
        'i': str,
        "doctype": "json",
        "version": "2.1",
        "keyfrom": "fanyi.web",
        "ue": "UTF-8",
        "action": "FY_BY_CLICKBUTTON",
        "typoResult": "true"
    }
    try:
        # Without a timeout a stalled connection would block the whole run.
        response = requests.post(url, data=key, timeout=30)
    except requests.RequestException:
        print("有道詞典調(diào)用失敗")
        return None
    if response.status_code != 200:
        print("有道詞典調(diào)用失敗")
        return None
    if not response.text:
        print("有道詞典None")
        return None
    # Match a short fragment of the rate-limit notice instead of the whole
    # sentence: the full literal contained stray characters and could never
    # match.
    # NOTE(review): confirm this fragment against a real rate-limit reply
    # (exact wording and character set).
    if '來自您ip的請求異常頻繁' in response.text:
        print("非常抱歉艾扮,來自您ip的請求異常頻繁")
        return None
    try:
        result = json.loads(response.text)
    except ValueError:
        # A non-JSON body (e.g. an HTML error page) is a failed call.
        print("有道詞典調(diào)用失敗")
        return None
    if not isinstance(result, dict):
        print("有道詞典調(diào)用失敗")
        return None
    return result.get('translateResult')
# Write translated text back into the source file.
def updateHtmlContent(index,path,transResult):
    """Replace source phrases in the file at *path* with their translations.

    For every item of *transResult* the first remaining occurrence of
    ``item['src']`` in the file content is replaced by ``item['tgt']``.
    Items are skipped when the translation is None, the source text is
    empty, or the source text contains a no-break space (``'\\xa0'``).

    Args:
        index: Position of the file in the work list; only used for logging.
        path: Path of the HTML file to rewrite.
        transResult: Sequence of groups, each a sequence of dicts with 'src'
            and 'tgt' keys.  None or an empty sequence leaves the content
            unchanged.
    """
    print('=========================')
    print('==4_將中文替換文本內(nèi)容Start==')
    print('=========================')
    print(str(index) + '___' + str(path))
    with open(path, "r") as html:
        updateContent = html.read()
    for group in transResult or ():
        # Walk every item of the group, not just the first one, so that all
        # translated pieces of a group are applied (getEnContentAndTrans
        # consumes the same structure this way).
        for item in group:
            en = item['src']
            ch = item['tgt']
            if ch is None or not en or '\xa0' in en:
                continue
            updateContent = updateContent.replace(en,ch,1)
    with open(path, "w") as html:
        html.write(updateContent)
    print('=========================')
    print('==4_將中文替換文本內(nèi)容 END==')
    print('=========================')
#def checkOnlyNBSP(text):
def getEnContentAndTrans(index,path):
    """Translate the text between tags of the HTML file at *path*, in place.

    The file is split at the first matching example marker ('Example (C)' or
    'Example</span> (C)'); only the part before the first marker is
    translated, everything from the marker onwards is written back verbatim.
    Inside the translated part every run of text between a '>' and the next
    '<' is sent to ``translator`` unless, after removing whitespace and
    '&nbsp;' entities, it is empty, purely numeric (dots ignored) or
    all-uppercase.

    Args:
        index: Position of the file in the work list; only used for logging.
        path: Path of the HTML file to translate.

    Raises:
        SystemExit: with status 1 when ``translator`` returns None; the file
            is left untouched in that case because it is only written at the
            end.
    """
    print('==2_處理文本分割Start==')
    print('****'+str(index)+'****'+str(path))
    with open(path,"r") as html:
        orgContent = html.read()
    if 'Example (C)' in orgContent:
        marker = 'Example (C)'
    elif 'Example</span> (C)' in orgContent:
        marker = 'Example</span> (C)'
    else:
        marker = None
    parts = orgContent.split(marker) if marker else [orgContent]
    print('==2_處理文本分割 END==')
    print('==3_截取英文字段并翻譯Start==')
    print('****'+str(index)+'****'+str(path))
    dealText = parts[0]
    pieces = []
    copied = 0  # end of the part of dealText already copied into pieces
    # '>' ... next '<' is exactly what the former character-by-character
    # scan extracted; segments never contain '<' but may contain '>'.
    for match in re.finditer(r">([^<]*)<", dealText):
        enText = match.group(1)
        checkText = re.sub(r"\s", "", enText)
        # Skip segments made only of &nbsp; entities.  The code used to
        # replace a plain space here, which is a no-op after the whitespace
        # strip above; the variable name shows the entity was intended.
        checkNbspText = checkText.replace('&nbsp;', '')
        # Skip numbers (dots ignored) and all-uppercase tokens.
        checkNum = checkNbspText.replace('.','')
        if not checkNbspText or checkNum.isdigit() or checkNum.isupper():
            if checkText:
                print('%%%%%%'+enText)
            continue
        transResult = translator(enText)
        if transResult is None:
            print("NONENONENONENONENONENONE")
            # Non-zero status: the run stopped because translation failed.
            sys.exit(1)
        ch = "".join((item.get('tgt') or '') for group in transResult for item in group)
        print(enText+'___'+ch)
        if 'CAN' in enText or 'Can' in enText or 'can' in enText:
            ch = 'CAN'+ch
        elif '手動RC系列30' in ch:
            ch = ch.replace('手動RC系列30','RC30系列手冊',1)
        # Substitute at the scanned position.  Replacing the first occurrence
        # of enText in the whole document could hit a tag, an attribute or an
        # earlier, already translated segment instead.
        pieces.append(dealText[copied:match.start(1)])
        pieces.append(ch)
        copied = match.end(1)
    pieces.append(dealText[copied:])
    chTransText = "".join(pieces)
    print('==3_截取英文字段并翻譯 END==')
    print('==4_將翻譯后的文件保存Start==')
    print('****'+str(index)+'****'+str(path))
    with open(path,"w") as html:
        # Ideographic full stop -> ASCII dot.  The search literal used to
        # carry stray characters in front of the full stop and never matched.
        html.write(chTransText.replace('。','.'))
        # Re-attach the untranslated example sections with their marker.
        for part in parts[1:]:
            html.write(marker)
            html.write(part)
    print('==4_將翻譯后的文件保存 END==')
def main(rootDir=r'D:\用戶手冊\RC30_Manual', startIndex=207):
    """Translate the decompiled manual's HTML files in place.

    On the first call the tree below *rootDir* is scanned with
    ``getHtmlFiles``; afterwards every collected file from position
    *startIndex* onwards has its charset header removed
    (``deleteHtmlHead``) and its text translated (``getEnContentAndTrans``).

    Args:
        rootDir: Root folder of the decompiled .chm.  Written as a raw string
            so the backslashes are not read as (invalid) escape sequences;
            the value is unchanged.
        startIndex: Index in ``htmlFiles`` at which processing starts.  The
            default of 207 is the value that was hard-coded in the loop;
            pass 0 to process every file.
    """
    global isFirst
    print('==0_獲取所有的Html文件Start==')
    if isFirst:
        getHtmlFiles(rootDir)
        isFirst = False
    print('==0_獲取所有的Html文件 END===')
    # An earlier variant (handHtmlContent -> translator -> updateHtmlContent)
    # sent the whole page text in one request; it is not part of this loop.
    for i in range(startIndex, len(htmlFiles)):
        target = str(htmlFiles[i])
        deleteHtmlHead(i, target)
        getEnContentAndTrans(i, target)


if __name__=='__main__':
    main()