問題描述
Java程序通過new java.io.FileReader(file)讀取文件，文件內容有中文，最終讀取到的中文產生亂碼。
閱讀代碼
package java.io;
/**
 * Excerpt of {@code java.io.FileReader}: reads a file as characters by
 * layering an {@code InputStreamReader} over a {@code FileInputStream}.
 *
 * <p>The constructor shown here passes no charset to the superclass, so the
 * charset used for decoding is whatever the single-argument
 * {@code InputStreamReader(InputStream)} constructor selects.
 */
public class FileReader extends InputStreamReader {
/**
 * Creates a reader over the given file.
 *
 * @param file the file to read
 * @throws FileNotFoundException declared by this constructor; presumably
 *         raised by {@code FileInputStream} when the file cannot be opened
 */
public FileReader(File file) throws FileNotFoundException {
// Only the byte stream is handed to the superclass; no charset is named.
super(new FileInputStream(file));
}
}
package java.io;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import sun.nio.cs.StreamDecoder;
/**
 * Excerpt of {@code java.io.InputStreamReader}: a bridge from a byte stream
 * to characters. Each constructor obtains a {@code StreamDecoder} from
 * {@code StreamDecoder.forInputStreamReader} and stores it in {@link #sd}.
 */
public class InputStreamReader extends Reader {
// Decoder obtained from StreamDecoder.forInputStreamReader by the constructor that ran.
private final StreamDecoder sd;
/**
 * Creates a reader without naming a charset.
 *
 * <p>A {@code null} charset name is passed to the decoder factory, which
 * therefore decides which charset is used.
 *
 * @param in the byte stream to decode
 */
public InputStreamReader(InputStream in) {
super(in);
try {
// The (String) cast on null presumably selects the charset-name overload
// over the Charset overload used by the constructor further below.
sd = StreamDecoder.forInputStreamReader(in, this, (String)null); // ## check lock object
} catch (UnsupportedEncodingException e) {
// The default encoding should always be available
throw new Error(e);
}
}
/**
 * Creates a reader that decodes using the named charset.
 *
 * @param in the byte stream to decode
 * @param charsetName name of the charset to use; must not be {@code null}
 * @throws NullPointerException if {@code charsetName} is {@code null}
 * @throws UnsupportedEncodingException propagated from the decoder factory
 */
public InputStreamReader(InputStream in, String charsetName)
throws UnsupportedEncodingException
{
super(in);
// Reject null explicitly: the factory is also called with a null name by
// the single-argument constructor, so null must not slip through here.
if (charsetName == null)
throw new NullPointerException("charsetName");
sd = StreamDecoder.forInputStreamReader(in, this, charsetName);
}
/**
 * Creates a reader that decodes using the given charset.
 *
 * @param in the byte stream to decode
 * @param cs the charset to use; must not be {@code null}
 * @throws NullPointerException if {@code cs} is {@code null}
 */
public InputStreamReader(InputStream in, Charset cs) {
super(in);
if (cs == null)
throw new NullPointerException("charset");
sd = StreamDecoder.forInputStreamReader(in, this, cs);
}
}
package sun.nio.cs;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.ReadableByteChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.IllegalCharsetNameException;
/**
 * Excerpt of {@code sun.nio.cs.StreamDecoder}; only the state fields and the
 * charset-name factory are shown.
 */
public class StreamDecoder extends Reader {
    // Buffer size constants.
    private static final int MIN_BYTE_BUFFER_SIZE = 32;
    private static final int DEFAULT_BYTE_BUFFER_SIZE = 8192;

    // Reader state.
    private volatile boolean isOpen;
    private boolean haveLeftoverChar;
    private char leftoverChar;
    private static volatile boolean channelsAvailable = true;

    // Decoding machinery and byte source.
    private Charset cs;
    private CharsetDecoder decoder;
    private ByteBuffer bb;
    private InputStream in;
    private ReadableByteChannel ch;

    /**
     * Builds a decoder for the given stream using a charset identified by name.
     *
     * <p>A {@code null} name is replaced by the name of
     * {@link Charset#defaultCharset()}.
     *
     * @param stream the byte stream to decode
     * @param lock object forwarded unchanged to the {@code StreamDecoder} constructor
     * @param charsetName charset name, or {@code null} to use the default charset
     * @return a decoder bound to the resolved charset
     * @throws UnsupportedEncodingException if the resolved name is not a
     *         supported charset or is not a legal charset name
     */
    public static StreamDecoder forInputStreamReader(InputStream stream, Object lock, String charsetName) throws UnsupportedEncodingException {
        final String resolvedName =
                (charsetName != null) ? charsetName : Charset.defaultCharset().name();

        try {
            if (Charset.isSupported(resolvedName)) {
                Charset charset = Charset.forName(resolvedName);
                return new StreamDecoder(stream, lock, charset);
            }
        } catch (IllegalCharsetNameException ignored) {
            // An illegal name is reported the same way as an unsupported one, below.
        }

        throw new UnsupportedEncodingException(resolvedName);
    }
}
package java.nio.charset;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.spi.CharsetProvider;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.ServiceLoader;
import java.util.ServiceConfigurationError;
import java.util.SortedMap;
import java.util.TreeMap;
import sun.misc.ASCIICaseInsensitiveComparator;
import sun.nio.cs.StandardCharsets;
import sun.nio.cs.ThreadLocalCoders;
import sun.security.action.GetPropertyAction;
/**
 * Excerpt of {@code java.nio.charset.Charset}; only the default-charset
 * lookup and the {@code name()} accessor are shown.
 */
public abstract class Charset
implements Comparable<Charset>
{
// Cached result of defaultCharset(); null until the first call assigns it.
private static volatile Charset defaultCharset;
/**
* Returns the default charset of this Java virtual machine.
*
* <p> The default charset is determined during virtual-machine startup and
* typically depends upon the locale and charset of the underlying
* operating system.
*
* @return A charset object for the default charset
*
* @since 1.5
*/
public static Charset defaultCharset() {
// Resolve only while the cache is still null; afterwards return the cached value.
if (defaultCharset == null) {
// There is no second null check inside the lock, so threads that both saw
// null will each run the lookup in turn and assign the field.
synchronized (Charset.class) {
// The charset name comes from the "file.encoding" system property.
String csn = AccessController.doPrivileged(
new GetPropertyAction("file.encoding"));
// lookup(...) is not part of this excerpt; presumably it returns null
// when the name cannot be resolved to a charset.
Charset cs = lookup(csn);
if (cs != null)
defaultCharset = cs;
else
// Fall back to UTF-8 when the property does not resolve to a charset.
defaultCharset = forName("UTF-8");
}
}
return defaultCharset;
}
/**
 * Returns the {@code name} field (its declaration is not part of this excerpt).
 *
 * @return the value of {@code name}
 */
public final String name() {
return name;
}
}
從方法注釋可以看出，默認字符集是在虛擬機啟動時確定的，并且通常取決于底層操作系統的區域和字符集。
package sun.security.action;
import java.security.AccessController;
import java.security.PrivilegedAction;
/**
 * Privileged action that reads a single system property, optionally falling
 * back to a caller-supplied value when the property is not set.
 */
public class GetPropertyAction implements PrivilegedAction<String> {
    private String theProp;
    private String defaultVal;

    /**
     * Creates an action with no fallback value.
     *
     * @param propertyName name of the system property to read
     */
    public GetPropertyAction(String propertyName) {
        this.theProp = propertyName;
    }

    /**
     * Creates an action with a fallback value.
     *
     * @param propertyName name of the system property to read
     * @param fallback value returned by {@link #run()} when the property is unset
     */
    public GetPropertyAction(String propertyName, String fallback) {
        this(propertyName);
        this.defaultVal = fallback;
    }

    /**
     * Reads the property.
     *
     * @return the property value, or the fallback ({@code null} if none was
     *         given) when the property is unset
     */
    @Override
    public String run() {
        String value = System.getProperty(this.theProp);
        if (value != null) {
            return value;
        }
        return this.defaultVal;
    }

    /**
     * Reads a system property, going through {@code AccessController} only
     * when a security manager is installed.
     *
     * @param propertyName name of the system property to read
     * @return the property value, or {@code null} when it is unset
     */
    public static String privilegedGetProperty(String propertyName) {
        if (System.getSecurityManager() == null) {
            return System.getProperty(propertyName);
        }
        return AccessController.doPrivileged(new GetPropertyAction(propertyName));
    }

    /**
     * Reads a system property with a fallback, going through
     * {@code AccessController} only when a security manager is installed.
     *
     * @param propertyName name of the system property to read
     * @param fallback value returned when the property is unset
     * @return the property value, or {@code fallback} when it is unset
     */
    public static String privilegedGetProperty(String propertyName, String fallback) {
        if (System.getSecurityManager() == null) {
            return System.getProperty(propertyName, fallback);
        }
        return AccessController.doPrivileged(new GetPropertyAction(propertyName, fallback));
    }
}
可以看出通過System.getProperty(this.theProp)獲取系統屬性。
如果你需要更改JVM的默認字符集，可以通過在啟動JVM時指定-Dfile.encoding屬性來實現。例如，你可以使用-Dfile.encoding=UTF-8來將默認字符集設置為UTF-8。
需要注意的是，操作系統字符集的設置也有相應的優先級。通常，LC_ALL設置具有最高的優先級，它可以強制設置字符集。如果沒有設置LC_ALL，那么會考慮LC_*設置，最后是LANG設置。LANG是LC_*的默認值，而LC_ALL比LC_*的優先級別高，設置完LC_ALL之后，會強制重置LC_*各個值，如果不將LC_ALL重新設置為空，則再無法設置LC_*的單個值。
排查
- Java程序沒有設置file.encoding
- Linux中使用locale查看系統編碼，發現使用的是GBK，到此亂碼原因找到。
解決方法
- Java程序啟動時指定-Dfile.encoding=UTF-8。
示例：java -Dfile.encoding=UTF-8 -jar demo.jar
- 修改操作系統默認字符集編碼。
/etc/profile是系統級的配置文件，它應用于所有用戶。
~/.bash_profile是用戶級的配置文件，它只適用于當前登錄用戶。
系統會首先加載/etc/profile，然后加載~/.bash_profile。
這意味著用戶級的配置文件會覆蓋系統級的配置文件，因為用戶級的配置會在系統級的配置之后加載。
1、sudo vim ~/.bash_profile
2、export LC_ALL=en_US.UTF-8
3、source ~/.bash_profile
4、重啟Java程序