上一篇文章,介紹了文本編碼判斷,但這都是基于文本有BOM,如果是遇到沒有BOM的文本,顯然結果是不正確。所以在上一篇文章的基礎上,增加對無BOM文本的判斷。要百分百準確判斷一個文件的編碼是很難的,但是判斷文本是否UTF-8編碼就相對簡單。用正則表達式遍歷數據就可以判斷,網上也有參考代碼。
整理代碼如下:
' Demo entry point: detects the encoding of "UTF8NoBom.txt" located in the
' same folder as this workbook and shows the result in a message box.
Sub Gc()
    Dim samplePath As String

    samplePath = ThisWorkbook.Path & "\UTF8NoBom.txt"
    MsgBox GetCode(samplePath)
End Sub
' Detects the text encoding of a file.
'
' myFileName: full path of the file to inspect.
' Returns one of:
'   "Unicode"            - file starts with the UTF-16 LE BOM (FF FE)
'   "Unicode Big Endian" - file starts with the UTF-16 BE BOM (FE FF)
'   "UTF-8"              - file starts with the UTF-8 BOM (EF BB BF)
'   "UTF8_NOBOM"         - no BOM, but the bytes pass is_valid_utf8
'   "ANSI"               - anything else
' An empty or pure-ASCII file has no BOM and passes the UTF-8 check, so it is
' reported as "UTF8_NOBOM".
' Errors raised while opening or reading the file are propagated to the caller;
' the file handle is closed first if it was opened.
Function GetCode(ByVal myFileName As String)
    Dim fileNo As Integer
    Dim byteCount As Long
    Dim i As Long
    Dim Tmp() As Byte
    Dim tp() As String
    Dim latin1 As String
    Dim errNum As Long
    Dim errSrc As String
    Dim errDesc As String

    ' Ask VBA for an unused file number instead of assuming #1 is free.
    fileNo = FreeFile
    Open myFileName For Binary Access Read As #fileNo

    On Error GoTo ReadFailed
    byteCount = LOF(fileNo)
    If byteCount > 0 Then
        ReDim Tmp(byteCount - 1)
        Get #fileNo, , Tmp
    End If
    On Error GoTo 0
    Close #fileNo

    ' BOM checks compare the raw byte values. VBA's And does not short-circuit,
    ' so the length guards are separate Ifs to keep short files from indexing
    ' past the end of the buffer.
    If byteCount >= 2 Then
        If Tmp(0) = &HFF And Tmp(1) = &HFE Then
            GetCode = "Unicode"
            Exit Function
        End If
        If Tmp(0) = &HFE And Tmp(1) = &HFF Then
            GetCode = "Unicode Big Endian"
            Exit Function
        End If
    End If
    If byteCount >= 3 Then
        If Tmp(0) = &HEF And Tmp(1) = &HBB And Tmp(2) = &HBF Then
            GetCode = "UTF-8"
            Exit Function
        End If
    End If

    ' No BOM: map every byte to the character with the same code (0-255) so
    ' the regular expression in is_valid_utf8 can address bytes as \xNN.
    ' This is only built when actually needed.
    If byteCount > 0 Then
        ReDim tp(byteCount - 1)
        For i = 0 To byteCount - 1
            tp(i) = ChrW$(Tmp(i))
        Next
        latin1 = Join(tp, "")
    End If

    If is_valid_utf8(latin1) Then
        GetCode = "UTF8_NOBOM"
    Else
        GetCode = "ANSI"
    End If
    Exit Function

ReadFailed:
    ' Remember the error, release the handle, then re-raise for the caller.
    errNum = Err.Number
    errSrc = Err.Source
    errDesc = Err.Description
    Close #fileNo
    Err.Raise errNum, errSrc, errDesc
End Function
下面是判斷是否為UTF8
' Reports whether a byte sequence is well-formed UTF-8 (RFC 3629).
'
' str: a string holding ONE BYTE PER CHARACTER, i.e. each character code is
'      the raw byte value 0-255 (the form GetCode builds with ChrW).
' Returns True when no violation is found, False otherwise. An empty or
' pure-ASCII string therefore returns True.
'
' The pattern lists every way a sequence can be malformed; the input is valid
' exactly when none of the alternatives matches anywhere. In the "truncated"
' and "extra continuation" alternatives "." is deliberately loose: if the byte
' it stands for is not a continuation byte, the sequence is malformed anyway.
Function is_valid_utf8(ByRef str) 'ByRef avoids copying a potentially large string
    Dim s, mRegExp
    Set mRegExp = CreateObject("VbScript.regexp")

    ' Bytes that can never appear in UTF-8: C0/C1 (overlong 2-byte leads) and
    ' F5-FF (leads beyond U+10FFFF, the obsolete 5/6-byte forms, FE and FF).
    s = "[\xC0\xC1\xF5-\xFF]"

    ' Lead byte with too few continuation bytes (or input ends too early).
    s = s & "|[\xC2-\xDF]([^\x80-\xBF]|$)"
    s = s & "|[\xE0-\xEF].{0,1}([^\x80-\xBF]|$)"
    s = s & "|[\xF0-\xF4].{0,2}([^\x80-\xBF]|$)"

    ' Second-byte ranges excluded by RFC 3629: overlong 3-byte forms,
    ' UTF-16 surrogates (U+D800-U+DFFF), overlong 4-byte forms, and code
    ' points above U+10FFFF.
    s = s & "|\xE0[\x80-\x9F]"
    s = s & "|\xED[\xA0-\xBF]"
    s = s & "|\xF0[\x80-\x8F]"
    s = s & "|\xF4[\x90-\xBF]"

    ' Continuation byte where none is expected: after ASCII, after a complete
    ' multi-byte sequence, or at the very start of the input.
    s = s & "|[\x00-\x7F][\x80-\xBF]"
    s = s & "|[\xC2-\xDF].[\x80-\xBF]"
    s = s & "|[\xE0-\xEF]..[\x80-\xBF]"
    s = s & "|[\xF0-\xF4]...[\x80-\xBF]"
    s = s & "|^[\x80-\xBF]"

    mRegExp.Pattern = s
    is_valid_utf8 = (Not mRegExp.test(str))
End Function
代碼依然存在小問題:如果文本是純英文數字,ASCII會判斷為UTF8NoBom,不過純英文數字在ASCII范圍內和UTF-8是兼容的,不會出現亂碼,可以忽略。