CSV檔案準確讀取兩種思路
阿新 • • 發佈:2019-02-19
通過查詢網上資料,發現有兩種解析思路:a.通過pattern分割各欄位,b.逐字元讀取並判斷,當然還有通過第三方Jar包來解析的方法。
1.通過Pattern準確分割欄位(Reference:csv檔案讀取)
package xufei;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/*
* 檔案規則
* Microsoft的格式是最簡單的。以逗號分隔的值要麼是“純粹的”(僅僅包含在括號之前),
* 要麼是在雙引號之間(這時資料中的雙引號以一對雙引號表示)。
* Ten Thousand,10000, 2710 ,,"10,000","It's ""10 Grand"", baby",10K
* 這一行包含七個欄位(fields):
* Ten Thousand
* 10000
* 2710
* 空欄位
* 10,000
* It's "10 Grand", baby
* 10K
* 每條記錄佔一行
* 以逗號為分隔符
* 逗號前後的空格會被忽略
* 欄位中包含有逗號,該欄位必須用雙引號括起來。如果是全形的沒有問題。
* 欄位中包含有換行符,該欄位必須用雙引號括起來
* 欄位前後包含有空格,該欄位必須用雙引號括起來
* 欄位中的雙引號用兩個雙引號表示
* 欄位中如果有雙引號,該欄位必須用雙引號括起來
* 第一條記錄,可以是欄位名
*/
/**
*
タイトル: xufei.CSVAnalysis.java
*
説明:
*
著作権: Copyright (c) 2006
*
會社名: technodia
* @author 徐飛
* @version 1.0
* createDate Aug 11, 2008
* 修正履歴
* 修正日 修正者 修正理由
*/
public class CSVAnalysis {
private InputStreamReader fr = null;
private BufferedReader br = null;
public CSVAnalysis(String f) throws IOException {
fr = new InputStreamReader(new FileInputStream(f));
}
/**
* 解析csv檔案 到一個list中
* 每個單元個為一個String型別記錄,每一行為一個list。
* 再將所有的行放到一個總list中
* @return
* @throws IOException
*/
public List> readCSVFile() throws IOException {
br = new BufferedReader(fr);
String rec = null;//一行
String str;//一個單元格
List> listFile = new ArrayList>();
try {
//讀取一行
while ((rec = br.readLine()) != null) {
Pattern pCells = Pattern
.compile("(\"[^\"]*(\"{2})*[^\"]*\")*[^,]*,");
Matcher mCells = pCells.matcher(rec);
List cells = new ArrayList();//每行記錄一個list
//讀取每個單元格
while (mCells.find()) {
str = mCells.group();
str = str.replaceAll(
"(?sm)\"?([^\"]*(\"{2})*[^\"]*)\"?.*,", "$1");
str = str.replaceAll("(?sm)(\"(\"))", "$2");
cells.add(str);
}
listFile.add(cells);
}
} catch (Exception e) {
e.printStackTrace();
} finally {
if (fr != null) {
fr.close();
}
if (br != null) {
br.close();
}
}
return listFile;
}
public static void main(String[] args) throws Throwable {
CSVAnalysis parser = new CSVAnalysis("c:/test2.csv");
parser.readCSVFile();
}
}
2.逐字元讀取字串,並對字元判斷(Reference:csv檔案讀取
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
/**
* A very simple CSV reader released under a commercial-friendly license.
*
* @author Glen Smith
*
*/
public class CSVReader implements Closeable {
private BufferedReader br;
private boolean hasNext = true;
private final char separator;
private final char quotechar;
private final char escape;
private int skipLines;
private boolean linesSkiped;
/** The default separator to use if none is supplied to the constructor. */
public static final char DEFAULT_SEPARATOR = ',';
public static final int INITIAL_READ_SIZE = 64;
/**
* The default quote character to use if none is supplied to the
* constructor.
*/
public static final char DEFAULT_QUOTE_CHARACTER = '"';
/**
* The default escape character to use if none is supplied to the
* constructor.
*/
public static final char DEFAULT_ESCAPE_CHARACTER = '\\';
/**
* The default line to start reading.
*/
public static final int DEFAULT_SKIP_LINES = 0;
/**
* Constructs CSVReader using a comma for the separator.
*
* @param reader
* the reader to an underlying CSV source.
*/
public CSVReader(Reader reader) {
this(reader, DEFAULT_SEPARATOR);
}
/**
* Constructs CSVReader with supplied separator.
*
* @param reader
* the reader to an underlying CSV source.
* @param separator
* the delimiter to use for separating entries.
*/
public CSVReader(Reader reader, char separator) {
this(reader, separator, DEFAULT_QUOTE_CHARACTER, DEFAULT_ESCAPE_CHARACTER);
}
/**
* Constructs CSVReader with supplied separator and quote char.
*
* @param reader
* the reader to an underlying CSV source.
* @param separator
* the delimiter to use for separating entries
* @param quotechar
* the character to use for quoted elements
*/
public CSVReader(Reader reader, char separator, char quotechar) {
this(reader, separator, quotechar, DEFAULT_ESCAPE_CHARACTER, DEFAULT_SKIP_LINES);
}
public CSVReader(Reader reader, char separator,
char quotechar, char escape) {
this(reader, separator, quotechar, escape, DEFAULT_SKIP_LINES);
}
/**
* Constructs CSVReader with supplied separator and quote char.
*
* @param reader
* the reader to an underlying CSV source.
* @param separator
* the delimiter to use for separating entries
* @param quotechar
* the character to use for quoted elements
* @param line
* the line number to skip for start reading
*/
public CSVReader(Reader reader, char separator, char quotechar, int line) {
this(reader, separator, quotechar, DEFAULT_ESCAPE_CHARACTER, line);
}
/**
* Constructs CSVReader with supplied separator and quote char.
*
* @param reader
* the reader to an underlying CSV source.
* @param separator
* the delimiter to use for separating entries
* @param quotechar
* the character to use for quoted elements
* @param escape
* the character to use for escaping a separator or quote
* @param line
* the line number to skip for start reading
*/
public CSVReader(Reader reader, char separator, char quotechar, char escape, int line) {
this.br = new BufferedReader(reader);
this.separator = separator;
this.quotechar = quotechar;
this.escape = escape;
this.skipLines = line;
}
/**
* Reads the entire file into a List with each element being a String[] of
* tokens.
*
* @return a List of String[], with each String[] representing a line of the
* file.
*
* @throws IOException
* if bad things happen during the read
*/
public List<String[]> readAll() throws IOException {
List<String[]> allElements = new ArrayList<String[]>();
while (hasNext) {
String[] nextLineAsTokens = readNext();
if (nextLineAsTokens != null)
allElements.add(nextLineAsTokens);
}
return allElements;
}
/**
* Reads the next line from the buffer and converts to a string array.
*
* @return a string array with each comma-separated element as a separate
* entry.
*
* @throws IOException
* if bad things happen during the read
*/
public String[] readNext() throws IOException {
String nextLine = getNextLine();
return hasNext ? parseLine(nextLine) : null;
}
/**
* Reads the next line from the file.
*
* @return the next line from the file without trailing newline
* @throws IOException
* if bad things happen during the read
*/
private String getNextLine() throws IOException {
if (!this.linesSkiped) {
for (int i = 0; i < skipLines; i++) {
br.readLine();
}
this.linesSkiped = true;
}
String nextLine = br.readLine();
if (nextLine == null) {
hasNext = false;
}
return hasNext ? nextLine : null;
}
/**
* Parses an incoming String and returns an array of elements.
*
* @param nextLine
* the string to parse
* @return the comma-tokenized list of elements, or null if nextLine is null
* @throws IOException if bad things happen during the read
*/
private String[] parseLine(String nextLine) throws IOException {
if (nextLine == null) {
return null;
}
List<String>tokensOnThisLine = new ArrayList<String>();
StringBuilder sb = new StringBuilder(INITIAL_READ_SIZE);
boolean inQuotes = false;
do {
if (inQuotes) {
// continuing a quoted section, reappend newline
sb.append("\n");
nextLine = getNextLine();
if (nextLine == null)
break;
}
for (int i = 0; i < nextLine.length(); i++) {
char c = nextLine.charAt(i);
if (c == this.escape) {
if( isEscapable(nextLine, inQuotes, i) ){
sb.append(nextLine.charAt(i+1));
i++;
} else {
i++; // ignore the escape
}
} else if (c == quotechar) {
if( isEscapedQuote(nextLine, inQuotes, i) ){
sb.append(nextLine.charAt(i+1));
i++;
}else{
inQuotes = !inQuotes;
// the tricky case of an embedded quote in the middle: a,bc"d"ef,g
if(i>2 //not on the beginning of the line
&& nextLine.charAt(i-1) != this.separator //not at the beginning of an escape sequence
&& nextLine.length()>(i+1) &&
nextLine.charAt(i+1) != this.separator //not at the end of an escape sequence
){
sb.append(c);
}
}
} else if (c == separator && !inQuotes) {
tokensOnThisLine.add(sb.toString());
sb = new StringBuilder(INITIAL_READ_SIZE); // start work on next token
} else {
sb.append(c);
}
}
} while (inQuotes);
tokensOnThisLine.add(sb.toString());
return tokensOnThisLine.toArray(new String[0]);
}
/**
* precondition: the current character is a quote or an escape
* @param nextLine the current line
* @param inQuotes true if the current context is quoted
* @param i current index in line
* @return true if the following character is a quote
*/
private boolean isEscapedQuote(String nextLine, boolean inQuotes, int i) {
return inQuotes // we are in quotes, therefore there can be escaped quotes in here.
&& nextLine.length() > (i+1) // there is indeed another character to check.
&& nextLine.charAt(i+1) == quotechar;
}
/**
* precondition: the current character is an escape
* @param nextLine the current line
* @param inQuotes true if the current context is quoted
* @param i current index in line
* @return true if the following character is a quote
*/
private boolean isEscapable(String nextLine, boolean inQuotes, int i) {
return inQuotes // we are in quotes, therefore there can be escaped quotes in here.
&& nextLine.length() > (i+1) // there is indeed another character to check.
&& ( nextLine.charAt(i+1) == quotechar || nextLine.charAt(i+1) == this.escape);
}
/**
* Closes the underlying reader.
*
* @throws IOException if the close fails
*/
public void close() throws IOException{
br.close();
}
}