從網(wǎng)絡(luò)上下載到的網(wǎng)頁(yè)經(jīng)常是Unicode格式的梁剔,這個(gè)工具類可以將Unicode格式轉(zhuǎn)換為utf-8格式瓣颅,也就是講\u2422\u3243之類的編碼轉(zhuǎn)換為漢字,非常好用,雙引號(hào)什么的沒有影響。
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
?* @author tian
?*
?*/
public class CharSetUtil {
/**
?* 解碼 Unicode \\uXXXX
?* @param str
?* @return
?*/
public static String decodeUnicode(String str) {
Charset set = Charset.forName("UTF-16");
Pattern p = Pattern.compile("\\\\u([0-9a-fA-F]{4})");
Matcher m = p.matcher( str );
int start = 0 ;
int start2 = 0 ;
StringBuffer sb = new StringBuffer();
while( m.find( start ) ) {
start2 = m.start() ;
if( start2 > start ){
String seg = str.substring(start, start2) ;
sb.append( seg );
}
String code = m.group( 1 );
int i = Integer.valueOf( code , 16 );
byte[] bb = new byte[ 4 ] ;
bb[ 0 ] = (byte) ((i >> 8) & 0xFF );
bb[ 1 ] = (byte) ( i & 0xFF ) ;
ByteBuffer b = ByteBuffer.wrap(bb);
sb.append( String.valueOf( set.decode(b) ).trim() );
start = m.end() ;
}
start2 = str.length() ;
if( start2 > start ){
String seg = str.substring(start, start2) ;
sb.append( seg );
}
return sb.toString() ;
}
public static void main(String[] args) {
System.out.println( decodeUnicode("\\u5907abbbbbs\\u5907"));
}
}