錯誤筆記:
一開始使用的方法,正常運行沒多久,出現一個情況:GBK文件會導致csv文件的第一行第一列亂碼,百思不得其解。最終發現是因為在源頭早已使用byte數組通過inputStream.read讀取了幾個字節,所以導致後面的亂碼。
錯誤示例如下:
文件:12345.csv,格式為GBK
主鍵,密碼,創建時間,創建人,修改時間,修改人,是否刪除
中文,123456,null,null,null,null,false
錯誤部分代碼:
/**
* Encoding name passed on when the stream starts with the UTF-8 byte-order mark.
*/
private static final String ENCODE = "UTF-8";
/**
* Encoding name passed on for every stream that does not start with the UTF-8 byte-order mark.
* */
private static final String ENCODE_GBK = "GBK";
/**
* FLAWED EXAMPLE - kept as-is to illustrate the bug described in the surrounding text.
*
* Reads the first three bytes of the stream to look for a UTF-8 byte-order mark, then hands
* the SAME stream to the two-argument getLines overload. The bytes consumed by the sniffing
* read are never put back, so the decoder starts after them; for a file without a BOM this
* loses the beginning of the first line (garbled first column of the first row).
*
* @param fileName the CSV input stream (despite the name, not a path); closed before returning
* @return the lines produced by the two-argument overload, or null if an exception was caught
*/
public static List<String> getLines(InputStream fileName) {
List<String> stringList=null;
try {
// Sniff the file encoding from the first three bytes.
byte[] bytes=new byte[3];
// BUG: this read consumes bytes from the stream; its return value is also ignored.
fileName.read(bytes);
// -17, -69, -65 are the signed-byte values of 0xEF 0xBB 0xBF (the UTF-8 byte-order mark).
if(bytes[0]==-17&&bytes[1]==-69&&bytes[2]==-65){
stringList=getLines(fileName, ENCODE);
}else{
// No BOM: assume GBK. The stream is already past its first bytes at this point.
stringList= getLines(fileName, ENCODE_GBK);
}
}catch (Exception e){
// Only the message is logged; the method then falls through and returns null.
log.error("解析編碼格式異常:"+e.getMessage());
}finally {
try {
if (fileName != null) {
fileName.close();
}
}catch (IOException e) {
log.error("解析編碼格式異常Close stream failure :{}", e);
}
}
return stringList;
}
錯誤源頭就在於 byte[] bytes=new byte[3]; 原本是想這樣去判斷bytes是什麼編碼格式,但是 read 已經把流開頭的幾個字節讀走了,後面再用同一個流解析時就會丟失這些字節,造成亂碼。後改為如下正確完整代碼。
import lombok.extern.slf4j.Slf4j;
import org.apache.any23.encoding.TikaEncodingDetector;
import java.io.*;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@Slf4j
public class CSVFileUtil {
/**
* CSV文件編碼
*/
private static final String ENCODE = "UTF-8";
/**
* GBK編碼
* */
private static final String ENCODE_GBK = "GBK";
/**
* 讀取CSV文件得到List鳖昌,默認使用UTF-8編碼
* @param fileName 文件路徑
* @return
* 針對編碼格式做處理备畦,謹記關(guān)閉流,所以在此處初始化變量list许昨,在獲取行數(shù)方法執(zhí)行完畢后懂盐,finally關(guān)閉流。
* 謹記對于流處理糕档,如在調(diào)用getLines方法前莉恼,采用byte或者其他方法read之后,傳參數(shù)fileName速那,其實已經(jīng)把流讀取完畢俐银,為空引起報錯。
*/
public static List<String> getLines(InputStream fileName) {
List<String> stringList=null;
ByteArrayOutputStream arraystream=ToolExcelUtils.cloneInputStream(fileName);
InputStream inputStream=null;
InputStream stream=new ByteArrayInputStream(arraystream.toByteArray());
Charset charset= guessCharset(stream);
try {
if(charset!=null){
if(charset.name().equals(ENCODE)){
inputStream=new ByteArrayInputStream(arraystream.toByteArray());
stringList=getLines(inputStream, ENCODE);
}else{
inputStream=new ByteArrayInputStream(arraystream.toByteArray());
stringList= getLines(inputStream, ENCODE_GBK);
}
}
}catch (Exception e){
log.error("解析編碼格式異常:"+e.getMessage());
}finally {
try {
if (fileName != null) {
fileName.close();
}
if(inputStream!=null){
inputStream.close();
}
if(stream!=null){
stream.close();
}
}catch (IOException e) {
log.error("解析編碼格式異常Close stream failure :{}", e);
}
}
return stringList;
}
/**
* 讀取CSV文件得到List
* @param fileName 文件路徑
* @param encode 編碼
* @return
*/
public static List<String> getLines(InputStream fileName, String encode) {
List<String> lines = new ArrayList<String>();
BufferedReader br = null;
InputStreamReader isr = null;
try {
isr = new InputStreamReader(fileName, encode);
br = new BufferedReader(isr);
String line;
while ((line = br.readLine()) != null) {
StringBuilder sb = new StringBuilder();
sb.append(line);
boolean readNext = countChar(sb.toString(), '"', 0) % 2 == 1;
// 如果雙引號是奇數(shù)的時候繼續(xù)讀取端仰〈废В考慮有換行的是情況
while (readNext) {
line = br.readLine();
if (line == null) {
return null;
}
sb.append(line);
readNext = countChar(sb.toString(), '"', 0) % 2 == 1;
}
lines.add(sb.toString());
System.out.println(sb.toString());
}
} catch (Exception e) {
log.error("Read CSV file failure :{}", e);
} finally {
try {
if (br != null) {
br.close();
}
if (isr != null) {
isr.close();
}
} catch (IOException e) {
log.error("Close stream failure :{}", e);
}
}
return lines;
}
public static String[] fromCSVLine(String source) {
return fromCSVLine(source, 0);
}
/**
* 把CSV文件的一行轉(zhuǎn)換成字符串數(shù)組。指定數(shù)組長度荔烧,不夠長度的部分設置為null
* @param source
* @param size
* @return
*/
public static String[] fromCSVLine(String source, int size) {
List list = fromCSVLineToArray(source);
if (size < list.size()) {
size = list.size();
}
String[] arr = new String[size];
list.toArray(arr);
return arr;
}
public static List fromCSVLineToArray(String source) {
if (source == null || source.length() == 0) {
return new ArrayList();
}
int currentPosition = 0;
int maxPosition = source.length();
int nextComa = 0;
List list = new ArrayList();
while (currentPosition < maxPosition) {
nextComa = nextComma(source, currentPosition);
list.add(nextToken(source, currentPosition, nextComa));
currentPosition = nextComa + 1;
if (currentPosition == maxPosition) {
list.add("");
}
}
return list;
}
/**
* 把字符串類型的數(shù)組轉(zhuǎn)換成一個CSV行吱七。(輸出CSV文件的時候用)
*
* @param arr
* @return
*/
public static String toCSVLine(String[] arr) {
if (arr == null) {
return "";
}
StringBuilder sb = new StringBuilder();
for (int i = 0; i < arr.length; i++) {
String item = addQuote(arr[i]);
sb.append(item);
if (arr.length - 1 != i) {
sb.append(",");
}
}
return sb.toString();
}
/**
* 將list的第一行作為Map的key,下面的列作為Map的value
* @param list
* @return
*/
public static List<Map<String, Object>> parseList(List<String> list) {
List<Map<String, Object>> resultList = new ArrayList<Map<String, Object>>();
String firstLine = list.get(0);
String[] fields = firstLine.split(",");
for (int i = 1; i < list.size(); i++) {
String valueLine = list.get(i);
String[] valueItems = CSVFileUtil.fromCSVLine(valueLine);
Map<String, Object> map = new HashMap<String, Object>();
for (int j = 0; j < fields.length; j++) {
map.put(fields[j], valueItems[j]);
}
resultList.add(map);
}
return resultList;
}
/**
* 字符串類型的List轉(zhuǎn)換成一個CSV行鹤竭。(輸出CSV文件的時候用)
*
* @param strArrList
* @return
*/
public static String toCSVLine(ArrayList strArrList) {
if (strArrList == null) {
return "";
}
String[] strArray = new String[strArrList.size()];
for (int idx = 0; idx < strArrList.size(); idx++) {
strArray[idx] = (String) strArrList.get(idx);
}
return toCSVLine(strArray);
}
/**
* 計算指定字符的個數(shù)
*
* @param str 文字列
* @param c 字符
* @param start 開始位置
* @return 個數(shù)
*/
private static int countChar(String str, char c, int start) {
int index = str.indexOf(c, start);
return index == -1 ? 0 : countChar(str, c, index + 1) + 1;
}
/**
* 查詢下一個逗號的位置踊餐。
*
* @param source 文字列
* @param st 檢索開始位置
* @return 下一個逗號的位置。
*/
private static int nextComma(String source, int st) {
int maxPosition = source.length();
boolean inquote = false;
while (st < maxPosition) {
char ch = source.charAt(st);
if (!inquote && ch == ',') {
break;
} else if ('"' == ch) {
inquote = !inquote;
}
st++;
}
return st;
}
/**
* 取得下一個字符串
*
* @param source
* @param st
* @param nextComma
* @return
*/
private static String nextToken(String source, int st, int nextComma) {
StringBuilder strb = new StringBuilder();
int next = st;
while (next < nextComma) {
char ch = source.charAt(next++);
if (ch == '"') {
if ((st + 1 < next && next < nextComma) && (source.charAt(next) == '"')) {
strb.append(ch);
next++;
}
} else {
strb.append(ch);
}
}
return strb.toString();
}
/**
* 在字符串的外側(cè)加雙引號臀稚。如果該字符串的內(nèi)部有雙引號的話吝岭,把"轉(zhuǎn)換成""。
*
* @param item 字符串
* @return 處理過的字符串
*/
private static String addQuote(String item) {
if (item == null || item.length() == 0) {
return "\"\"";
}
StringBuilder sb = new StringBuilder();
sb.append('"');
for (int idx = 0; idx < item.length(); idx++) {
char ch = item.charAt(idx);
if ('"' == ch) {
sb.append("\"\"");
} else {
sb.append(ch);
}
}
sb.append('"');
return sb.toString();
}
public static Charset guessCharset(InputStream is) {
try {
return Charset.forName(new TikaEncodingDetector().guessEncoding(is));
}catch (Exception e){
log.error("獲取流格式異常:"+e.getMessage());
}
return null;
}
}
測試例子:
// Usage example: read a CSV file, time the parsing and print the number of data rows.
File xlsxfile = new File("D:\\測試上傳文件\\csv解析日期失敗文件\\專用文件_20210125163437.csv");
InputStream xlsxinputStream = new FileInputStream(xlsxfile);
StopWatch watch = new StopWatch();
watch.start();
// getLines(InputStream) returns the raw CSV records (List<String>) and closes the stream itself.
List<String> lines = CSVFileUtil.getLines(xlsxinputStream);
// parseList turns the records into header-keyed maps; skip it when nothing was read, because
// getLines returns null if the encoding could not be detected.
List<Map<String, Object>> dataList = (lines == null || lines.isEmpty())
        ? new ArrayList<Map<String, Object>>()
        : CSVFileUtil.parseList(lines);
watch.stop();
System.out.println("執(zhí)行完畢,共耗時:"+watch.getTotalTimeSeconds()+"秒,數(shù)量"+dataList.size());
如上代碼,引入了一個工具包來獲取流的編碼格式:
<dependency>
<groupId>org.apache.any23</groupId>
<artifactId>apache-any23-encoding</artifactId>
<version>2.4</version>
</dependency>
/**
 * Asks {@code TikaEncodingDetector} (from the apache-any23-encoding dependency) for the
 * encoding of the given content.
 *
 * <p>The stream is handed to the detector, so callers should pass a throwaway copy of the
 * content rather than the stream they intend to parse afterwards.
 *
 * @param is stream positioned at the start of the content
 * @return the detected charset, or {@code null} when detection fails for any reason
 */
public static Charset guessCharset(InputStream is) {
    try {
        return Charset.forName(new TikaEncodingDetector().guessEncoding(is));
    } catch (Exception e) {
        // Pass the exception itself so the stack trace is not lost.
        log.error("獲取流格式異常:" + e.getMessage(), e);
    }
    return null;
}