這篇文章主要是將 Go 語言實現的版本改為 C/C++ 版本實現,主要思路是一樣的,具體思路請看:
GO代碼實現判斷字符編碼格式及編碼格式轉換(utf-8、gbk)
而本文更主要說明 windows 及 linux 平臺下 utf-8 與 gbk 的轉換。
判斷是否是gbk
/*
 * Reports whether the first len bytes of data look like GBK-encoded text.
 *
 * Bytes <= 0x7f are accepted as single-byte (ASCII-compatible) characters.
 * Any other byte must start a two-byte character: lead byte in 0x81..0xfe
 * followed by a trail byte in 0x40..0xfe, excluding 0x7f.
 *
 * This is a byte-range check only; it does not verify that each pair maps
 * to an assigned GBK character.
 *
 * data: buffer to inspect (not modified).
 * len:  number of valid bytes in data.
 * Returns true when every byte belongs to a well-formed single- or
 * double-byte unit (an empty buffer yields true), false otherwise.
 */
bool isGBK(unsigned char* data, int len) {
    int i = 0;
    while (i < len) {
        unsigned char lead = data[i];
        if (lead <= 0x7f) {
            /* Single-byte, ASCII-compatible character. */
            i++;
            continue;
        }
        /* A lead byte needs a trail byte inside the buffer; never read
         * data[len]. */
        if (i + 1 >= len) {
            return false;
        }
        unsigned char trail = data[i + 1];
        if (lead < 0x81 || lead > 0xfe) {
            return false;
        }
        /* 0x7f is the one value inside 0x40..0xfe that is not a valid
         * trail byte. */
        if (trail < 0x40 || trail > 0xfe || trail == 0x7f) {
            return false;
        }
        i += 2;
    }
    return true;
}
判斷是否是utf-8
/*
 * Counts the consecutive 1 bits at the most-significant end of byte,
 * stopping at the first 0 bit.
 *
 * Returns a value in 0..8 (0x00 -> 0, 0xE4 -> 3, 0xFF -> 8).
 */
int preNUm(unsigned char byte) {
    int count = 0;
    unsigned char bit;

    /* Walk a single-bit probe from bit 7 down to bit 0. */
    for (bit = 0x80; bit != 0 && (byte & bit) != 0; bit >>= 1) {
        count++;
    }
    return count;
}
/*
 * Reports whether the first len bytes of data look like UTF-8 text.
 *
 * Accepted units:
 *   0XXX_XXXX                                  (single byte)
 *   1110_XXXX 10XX_XXXX 10XX_XXXX
 *   1111_0XXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
 *   1111_10XX followed by four 10XX_XXXX bytes
 *   1111_110X followed by five 10XX_XXXX bytes
 *
 * preNUm() returns the number of leading 1 bits of the first byte, which is
 * also the number of bytes the character occupies.
 *
 * NOTE(review): two-byte sequences (110X_XXXX 10XX_XXXX) are rejected, as in
 * the original `> 2` test. This may be deliberate, to avoid mistaking
 * two-byte GBK characters for UTF-8 -- confirm before relaxing it.
 *
 * Only the byte structure is checked; overlong forms and code-point ranges
 * are not validated.
 *
 * data: buffer to inspect (not modified).
 * len:  number of valid bytes in data.
 * Returns true when every byte belongs to an accepted unit (an empty buffer
 * yields true), false otherwise.
 */
bool isUtf8(unsigned char* data, int len) {
    int i = 0;
    while (i < len) {
        if ((data[i] & 0x80) == 0x00) {
            /* 0XXX_XXXX */
            i++;
            continue;
        }

        int num = preNUm(data[i]);
        /* 1 = stray continuation byte, 2 = two-byte form (see NOTE above),
         * 7 and 8 = 0xFE / 0xFF, which never start a UTF-8 sequence. */
        if (num <= 2 || num > 6) {
            return false;
        }
        /* The whole sequence must fit in the buffer; a truncated tail must
         * not cause reads beyond len. */
        if (num > len - i) {
            return false;
        }

        i++;
        for (int j = 0; j < num - 1; j++) {
            /* Each of the following num - 1 bytes must start with bits 10. */
            if ((data[i] & 0xc0) != 0x80) {
                return false;
            }
            i++;
        }
    }
    return true;
}
utf-8 與 gbk 互轉(zhuǎn)
1. windows
windows 下 utf-8 與 gbk 的轉換借助 Unicode 編碼來實現,這篇博客[http://www.reibang.com/p/42d275097336]講得很詳細。主要過程即:
utf-8 >> unicode >> gbk
gbk >> unicode >> utf-8
通過 windows 提供的 API(包含在 windows.h 中):MultiByteToWideChar() 和 WideCharToMultiByte() 兩個函數實現的。官方明確提醒使用這兩個函數要十分小心可能發生的內存溢出問題,例如一個本來為 2 字節的字符串轉換後變成 4 字節了,如果只分配原來字符串的空間大小,必然會發生內存溢出。
解決辦法:先調用一次求出需要的字符串空間長度,分配好空間後再調用一次完成轉換。具體請參考下面的用例。
2. linux
linux下通過 iconv.h 提供的API來進行轉(zhuǎn)換
主要有三個API:
//指定轉換類型,返回一個轉換描述符,iconv_open("GBK", "UTF-8") 將 utf-8 字符串轉為 gbk,
//可以轉換的類型可以通過指令 iconv --list 查看。
iconv_t iconv_open(const char *tocode, const char *fromcode);
//inbuf、outbuf 分別傳入指向 buff 的指針的地址,outbuf 調用後指向 buff 中已寫入數據之後的位置(iconv 本身不會寫入結束符,若 buff 事先清零,該位置即結束符);
//inbytesleft 是一個輸入輸出參數,傳入一個保存有需轉換字符串長度的變量的地址,成功調用後該變量為 0,表示字符串所有字節均已被轉換;
//outbytesleft 是一個輸入輸出參數,傳入一個保存 outbuf 總長度的變量的地址,調用結束後該變量保存 outbuf 剩下的可用長度。
size_t iconv(iconv_t cd,
char **inbuf, size_t *inbytesleft,
char **outbuf, size_t *outbytesleft);
//關閉打開的轉換描述符
iconv_close(cd);
windows/linux 跨平臺用例
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef WIN32
#include <windows.h>
#else
#include <iconv.h>
#endif
/*
 * Returns how many 1 bits appear at the top of byte before the first 0 bit.
 *
 * The result is in 0..8; for example 0x41 -> 0, 0xE4 -> 3, 0xFF -> 8.
 */
int preNUm(unsigned char byte) {
    int ones = 0;

    /* 0x80 >> ones selects bit 7, then bit 6, ... down to bit 0. */
    while (ones < 8 && (byte & (0x80 >> ones)) != 0) {
        ones++;
    }
    return ones;
}
/*
 * Reports whether the first len bytes of data look like UTF-8 text.
 *
 * Accepted units:
 *   0XXX_XXXX                                  (single byte)
 *   1110_XXXX 10XX_XXXX 10XX_XXXX
 *   1111_0XXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
 *   1111_10XX followed by four 10XX_XXXX bytes
 *   1111_110X followed by five 10XX_XXXX bytes
 *
 * preNUm() returns the number of leading 1 bits of the first byte, which is
 * also the number of bytes the character occupies.
 *
 * NOTE(review): two-byte sequences (110X_XXXX 10XX_XXXX) are rejected, as in
 * the original `> 2` test. This may be deliberate, to avoid mistaking
 * two-byte GBK characters for UTF-8 -- confirm before relaxing it.
 *
 * Only the byte structure is checked; overlong forms and code-point ranges
 * are not validated.
 *
 * data: buffer to inspect (not modified).
 * len:  number of valid bytes in data.
 * Returns true when every byte belongs to an accepted unit (an empty buffer
 * yields true), false otherwise.
 */
bool isUtf8(unsigned char* data, int len) {
    int i = 0;
    while (i < len) {
        if ((data[i] & 0x80) == 0x00) {
            /* 0XXX_XXXX */
            i++;
            continue;
        }

        int num = preNUm(data[i]);
        /* 1 = stray continuation byte, 2 = two-byte form (see NOTE above),
         * 7 and 8 = 0xFE / 0xFF, which never start a UTF-8 sequence. */
        if (num <= 2 || num > 6) {
            return false;
        }
        /* The whole sequence must fit in the buffer; a truncated tail must
         * not cause reads beyond len. */
        if (num > len - i) {
            return false;
        }

        i++;
        for (int j = 0; j < num - 1; j++) {
            /* Each of the following num - 1 bytes must start with bits 10. */
            if ((data[i] & 0xc0) != 0x80) {
                return false;
            }
            i++;
        }
    }
    return true;
}
/*
 * Reports whether the first len bytes of data look like GBK-encoded text.
 *
 * Bytes <= 0x7f are accepted as single-byte (ASCII-compatible) characters.
 * Any other byte must start a two-byte character: lead byte in 0x81..0xfe
 * followed by a trail byte in 0x40..0xfe, excluding 0x7f.
 *
 * This is a byte-range check only; it does not verify that each pair maps
 * to an assigned GBK character.
 *
 * data: buffer to inspect (not modified).
 * len:  number of valid bytes in data.
 * Returns true when every byte belongs to a well-formed single- or
 * double-byte unit (an empty buffer yields true), false otherwise.
 */
bool isGBK(unsigned char* data, int len) {
    int i = 0;
    while (i < len) {
        unsigned char lead = data[i];
        if (lead <= 0x7f) {
            /* Single-byte, ASCII-compatible character. */
            i++;
            continue;
        }
        /* A lead byte needs a trail byte inside the buffer; never read
         * data[len]. */
        if (i + 1 >= len) {
            return false;
        }
        unsigned char trail = data[i + 1];
        if (lead < 0x81 || lead > 0xfe) {
            return false;
        }
        /* 0x7f is the one value inside 0x40..0xfe that is not a valid
         * trail byte. */
        if (trail < 0x40 || trail > 0xfe || trail == 0x7f) {
            return false;
        }
        i += 2;
    }
    return true;
}
/*
 * Encoding classification returned by GetCoding().
 * The numeric values are spelled out because main() prints them with %d.
 */
typedef enum {
    GBK    = 0,  /* passes the GBK byte-range check */
    UTF8   = 1,  /* passes the UTF-8 structure check */
    UNKOWN = 2   /* neither check passes */
} CODING;
/*
 * Classifies the first len bytes of data as UTF8, GBK or UNKOWN.
 *
 * isGBK() only checks that byte pairs fall inside the GBK byte ranges, and
 * the bytes of UTF-8 encoded text fall inside those ranges too. Calling
 * isGBK() is therefore only meaningful after isUtf8() has ruled out UTF-8,
 * which is why the UTF-8 check runs first.
 */
CODING GetCoding(unsigned char* data, int len) {
    if (isUtf8(data, len)) {
        return UTF8;
    }
    if (isGBK(data, len)) {
        return GBK;
    }
    return UNKOWN;
}
/*
 * Demo: converts a literal from UTF-8 to GBK and back again, printing the
 * encoding detected by GetCoding() before and after each conversion.
 *
 * Windows goes through UTF-16 with MultiByteToWideChar / WideCharToMultiByte;
 * other platforms use iconv. The source literal is presumably stored as
 * UTF-8 (this depends on how the file is saved and compiled).
 *
 * Returns 0 on success, 1 when a conversion step fails.
 */
int main() {
    char src[512] = "你好";
    int len = (int)strlen(src);
    char dstgbk[512] = {0};
    char dstutf8[512] = {0};

    printf("coding:%d\n", GetCoding((unsigned char*)src, len)); /* expect UTF8 */

    /* ---- UTF-8 -> GBK ---- */
#ifdef WIN32
    wchar_t* pUnicodeBuff = NULL;
    int rlen = 0;

    /* First call with a zero-sized output only reports the required length
     * (in wide chars, terminator included because the input length is -1). */
    rlen = MultiByteToWideChar(CP_UTF8, 0, src, -1, NULL, 0);
    if (rlen <= 0) {
        printf("MultiByteToWideChar error\n");
        return 1;
    }
    pUnicodeBuff = new WCHAR[rlen + 1]; /* buffer for the UTF-16 string */
    MultiByteToWideChar(CP_UTF8, 0, src, -1, pUnicodeBuff, rlen);

    /* 936 is the Windows code page for GBK/GB2312. */
    rlen = WideCharToMultiByte(936, 0, pUnicodeBuff, -1, NULL, 0, NULL, NULL);
    if (rlen <= 0 || rlen > (int)sizeof(dstgbk)) {
        /* Conversion failed or the result would not fit in dstgbk. */
        printf("WideCharToMultiByte error\n");
        delete[] pUnicodeBuff;
        return 1;
    }
    WideCharToMultiByte(936, 0, pUnicodeBuff, -1, dstgbk, rlen, NULL, NULL);
    delete[] pUnicodeBuff;
#else
    iconv_t cd;
    char* pSrc = src;
    char* pGBKOUT = dstgbk;
    char* pUTFOUT = dstutf8;
    size_t srcLen = (size_t)len;
    /* Keep one byte back: iconv writes no terminator and the strlen() calls
     * below rely on the zero-initialised tail of the buffers. */
    size_t outLenGBK = sizeof(dstgbk) - 1;
    size_t outLenUTF = sizeof(dstutf8) - 1;
    size_t ret;

    cd = iconv_open("GBK", "UTF-8");
    if (cd == (iconv_t)-1) {
        /* Do not hand an invalid descriptor to iconv()/iconv_close(). */
        printf("iconv_open error\n");
        return 1;
    }
    ret = iconv(cd, &pSrc, &srcLen, &pGBKOUT, &outLenGBK);
    iconv_close(cd);
    if (ret == (size_t)-1) {
        printf("iconv error\n");
        return 1;
    }
#endif

    printf("coding:%d\n", GetCoding((unsigned char*)dstgbk, (int)strlen(dstgbk))); /* expect GBK */

    /* ---- GBK -> UTF-8 ---- */
#ifdef WIN32
    rlen = MultiByteToWideChar(936, 0, dstgbk, -1, NULL, 0);
    if (rlen <= 0) {
        printf("MultiByteToWideChar error\n");
        return 1;
    }
    pUnicodeBuff = new WCHAR[rlen + 1]; /* buffer for the UTF-16 string */
    MultiByteToWideChar(936, 0, dstgbk, -1, pUnicodeBuff, rlen);

    rlen = WideCharToMultiByte(CP_UTF8, 0, pUnicodeBuff, -1, NULL, 0, NULL, NULL);
    if (rlen <= 0 || rlen > (int)sizeof(dstutf8)) {
        printf("WideCharToMultiByte error\n");
        delete[] pUnicodeBuff;
        return 1;
    }
    WideCharToMultiByte(CP_UTF8, 0, pUnicodeBuff, -1, dstutf8, rlen, NULL, NULL);
    delete[] pUnicodeBuff;
#else
    cd = iconv_open("UTF-8", "GBK");
    if (cd == (iconv_t)-1) {
        printf("iconv_open error\n");
        return 1;
    }
    /* `pSrc = pGBKOUT` would be wrong: after the first iconv() call pGBKOUT
     * points at the next free position in dstgbk (just past the converted
     * GBK bytes), not at the start of the string. */
    pSrc = dstgbk;
    srcLen = strlen(dstgbk);
    ret = iconv(cd, &pSrc, &srcLen, &pUTFOUT, &outLenUTF);
    iconv_close(cd);
    if (ret == (size_t)-1) {
        printf("iconv error\n");
        return 1;
    }
#endif

    printf("coding:%d\n", GetCoding((unsigned char*)dstutf8, (int)strlen(dstutf8))); /* expect UTF8 */

    getchar(); /* keep the console window open until a key is pressed */
    return 0;
}
windows結(jié)果.png
linux結(jié)果.png
打印出枚舉類型 1,0,1,代表 utf-8,gbk,utf-8