POI讀寫海量Excel
阿新 • • 發佈:2019-01-07
目前處理Excel的開源javaAPI主要有兩種,一是Jxl(Java Excel API),Jxl只支援Excel2003以下的版本。另外一種是Apache的Jakarta POI,相比於Jxl,POI對微軟辦公文件的支援更加強大,但是它使用複雜,上手慢。POI可支援更高的Excel版本2007。對Excel的讀取,POI有兩種模式,一是使用者模式,這種方式同Jxl的使用很類似,使用簡單,都是將檔案一次性讀到記憶體,檔案小的時候,沒有什麼問題,當檔案大的時候,就會出現OutOfMemory的記憶體溢位問題。第二種是事件驅動模式,拿Excel2007來說,其內容採用XML的格式來儲存,所以處理excel就是解析XML,而目前使用事件驅動模式解析XML的API是SAX(Simple API for XML),這種模型在讀取XML文件時,並沒有將整個文件讀入記憶體,而是按順序將整個文件解析完,在解析過程中,會主動產生事件交給程式中相應的處理函式來處理當前內容。因此這種方式對系統資源要求不高,可以處理海量資料。筆者曾經做過測試,這種方法處理一千萬條,每條五列的資料花費大約11分鐘。可見處理海量資料的檔案時,事件驅動是一個很好的方式。而本文中用到的Excel2003Reader、Excel2007Reader對Excel的讀取都是採用這種POI的事件驅動模式。至於Excel的寫操作,對較高版本的Excel2007,POI提供了很好的支援,主要流程是第一步構建工作薄和電子表格物件,第二步在一個流中構建文字檔案,第三步使用流中產生的資料替換模板中的電子表格。這種方式也可以處理海量資料檔案。AbstractExcel2007Writer就是使用這種方式進行寫操作。對於寫入較低版本的Excel2003,POI使用了使用者模式來處理,就是將整個文件載入進記憶體,如果資料量大的話就會出現記憶體溢位的問題,Excel2003Writer就是使用這種方式。據筆者的測試,如果資料量大於3萬條,每條8列的話,就會報OutOfMemory的錯誤。Excel2003中每個電子表格的記錄數必須在65536以下,否則就會發生異常。目前還沒有好的解決方案,建議對於海量資料寫入操作,儘量使用Excel2007。
- /**
- * 抽象Excel2003讀取器,通過實現HSSFListener監聽器,採用事件驅動模式解析excel2003
- * 中的內容,遇到特定事件才會觸發,大大減少了記憶體的使用。
- *
- */
- publicclass Excel2003Reader implements HSSFListener{
- privateint minColumns = -1;
- private POIFSFileSystem fs;
- privateint lastRowNumber;
- privateint lastColumnNumber;
- /** Should we output the formula, or the value it has? */
- privateboolean outputFormulaValues = true;
- /** For parsing Formulas */
- private SheetRecordCollectingListener workbookBuildingListener;
- //excel2003工作薄
- private HSSFWorkbook stubWorkbook;
- // Records we pick up as we process
- private SSTRecord sstRecord;
- private FormatTrackingHSSFListener formatListener;
- //表索引
- privateint sheetIndex = -1;
- private BoundSheetRecord[] orderedBSRs;
- @SuppressWarnings("unchecked")
- private ArrayList boundSheetRecords = new ArrayList();
- // For handling formulas with string results
- privateint nextRow;
- privateint nextColumn;
- privateboolean outputNextStringRecord;
- //當前行
- privateint curRow = 0;
- //儲存行記錄的容器
- private List<String> rowlist = new ArrayList<String>();;
- @SuppressWarnings( "unused")
- private String sheetName;
- private IRowReader rowReader;
- publicvoid setRowReader(IRowReader rowReader){
- this.rowReader = rowReader;
- }
- /**
- * 遍歷excel下所有的sheet
- * @throws IOException
- */
- publicvoid process(String fileName) throws IOException {
- this.fs = new POIFSFileSystem(new FileInputStream(fileName));
- MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(
- this);
- formatListener = new FormatTrackingHSSFListener(listener);
- HSSFEventFactory factory = new HSSFEventFactory();
- HSSFRequest request = new HSSFRequest();
- if (outputFormulaValues) {
- request.addListenerForAllRecords(formatListener);
- } else {
- workbookBuildingListener = new SheetRecordCollectingListener(
- formatListener);
- request.addListenerForAllRecords(workbookBuildingListener);
- }
- factory.processWorkbookEvents(request, fs);
- }
- /**
- * HSSFListener 監聽方法,處理 Record
- */
- @SuppressWarnings("unchecked")
- publicvoid processRecord(Record record) {
- int thisRow = -1;
- int thisColumn = -1;
- String thisStr = null;
- String value = null;
- switch (record.getSid()) {
- case BoundSheetRecord.sid:
- boundSheetRecords.add(record);
- break;
- case BOFRecord.sid:
- BOFRecord br = (BOFRecord) record;
- if (br.getType() == BOFRecord.TYPE_WORKSHEET) {
- // 如果有需要,則建立子工作薄
- if (workbookBuildingListener != null && stubWorkbook == null) {
- stubWorkbook = workbookBuildingListener
- .getStubHSSFWorkbook();
- }
- sheetIndex++;
- if (orderedBSRs == null) {
- orderedBSRs = BoundSheetRecord
- .orderByBofPosition(boundSheetRecords);
- }
- sheetName = orderedBSRs[sheetIndex].getSheetname();
- }
- break;
- case SSTRecord.sid:
- sstRecord = (SSTRecord) record;
- break;
- case BlankRecord.sid:
- BlankRecord brec = (BlankRecord) record;
- thisRow = brec.getRow();
- thisColumn = brec.getColumn();
- thisStr = "";
- rowlist.add(thisColumn, thisStr);
- break;
- case BoolErrRecord.sid: //單元格為布林型別
- BoolErrRecord berec = (BoolErrRecord) record;
- thisRow = berec.getRow();
- thisColumn = berec.getColumn();
- thisStr = berec.getBooleanValue()+"";
- rowlist.add(thisColumn, thisStr);
- break;
- case FormulaRecord.sid: //單元格為公式型別
- FormulaRecord frec = (FormulaRecord) record;
- thisRow = frec.getRow();
- thisColumn = frec.getColumn();
- if (outputFormulaValues) {
- if (Double.isNaN(frec.getValue())) {
- // Formula result is a string
- // This is stored in the next record
- outputNextStringRecord = true;
- nextRow = frec.getRow();
- nextColumn = frec.getColumn();
- } else {
- thisStr = formatListener.formatNumberDateCell(frec);
- }
- } else {
- thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook,
- frec.getParsedExpression()) + '"';
- }
- rowlist.add(thisColumn,thisStr);
- break;
- case StringRecord.sid://單元格中公式的字串
- if (outputNextStringRecord) {
- // String for formula
- StringRecord srec = (StringRecord) record;
- thisStr = srec.getString();
- thisRow = nextRow;
- thisColumn = nextColumn;
- outputNextStringRecord = false;
- }
- break;
- case LabelRecord.sid:
- LabelRecord lrec = (LabelRecord) record;
- curRow = thisRow = lrec.getRow();
- thisColumn = lrec.getColumn();
- value = lrec.getValue().trim();
- value = value.equals("")?" ":value;
- this.rowlist.add(thisColumn, value);
- break;
- case LabelSSTRecord.sid: //單元格為字串型別
- LabelSSTRecord lsrec = (LabelSSTRecord) record;
- curRow = thisRow = lsrec.getRow();
- thisColumn = lsrec.getColumn();
- if (sstRecord == null) {
- rowlist.add(thisColumn, " ");
- } else {
- value = sstRecord
- .getString(lsrec.getSSTIndex()).toString().trim();
- value = value.equals("")?" ":value;
- rowlist.add(thisColumn,value);
- }
- break;
- case NumberRecord.sid: //單元格為數字型別
- NumberRecord numrec = (NumberRecord) record;
- curRow = thisRow = numrec.getRow();
- thisColumn = numrec.getColumn();
- value = formatListener.formatNumberDateCell(numrec).trim();
- value = value.equals("")?" ":value;
- // 向容器加入列值
- rowlist.add(thisColumn, value);
- break;
- default:
- break;
- }
- // 遇到新行的操作
- if (thisRow != -1 && thisRow != lastRowNumber) {
- lastColumnNumber = -1;
- }
- // 空值的操作
- if (record instanceof MissingCellDummyRecord) {
- MissingCellDummyRecord mc = (MissingCellDummyRecord) record;
- curRow = thisRow = mc.getRow();
- thisColumn = mc.getColumn();
- rowlist.add(thisColumn," ");
- }
- // 更新行和列的值
- if (thisRow > -1)
- lastRowNumber = thisRow;
- if (thisColumn > -1)
- lastColumnNumber = thisColumn;
- // 行結束時的操作
- if (record instanceof LastCellOfRowDummyRecord) {
- if (minColumns > 0) {
- // 列值重新置空
- if (lastColumnNumber == -1) {
- lastColumnNumber = 0;
- }
- }
- lastColumnNumber = -1;
- // 每行結束時, 呼叫getRows() 方法
- rowReader.getRows(sheetIndex,curRow, rowlist);
- // 清空容器
- rowlist.clear();
- }
- }
- }
- /**
- * 抽象Excel2007讀取器,excel2007的底層資料結構是xml檔案,採用SAX的事件驅動的方法解析
- * xml,需要繼承DefaultHandler,在遇到檔案內容時,事件會觸發,這種做法可以大大降低
- * 記憶體的耗費,特別使用於大資料量的檔案。
- *
- */
- publicclass Excel2007Reader extends DefaultHandler {
- //共享字串表
- private SharedStringsTable sst;
- //上一次的內容
- private String lastContents;
- privateboolean nextIsString;
- privateint sheetIndex = -1;
- private List<String> rowlist = new ArrayList<String>();
- //當前行
- privateint curRow = 0;
- //當前列
- privateint curCol = 0;
- //日期標誌
- privateboolean dateFlag;
- //數字標誌
- privateboolean numberFlag;
- privateboolean isTElement;
- private IRowReader rowReader;
- publicvoid setRowReader(IRowReader rowReader){
- this.rowReader = rowReader;
- }
- /**只遍歷一個電子表格,其中sheetId為要遍歷的sheet索引,從1開始,1-3
- * @param filename
- * @param sheetId
- * @throws Exception
- */
- publicvoid processOneSheet(String filename,int sheetId) throws Exception {
- OPCPackage pkg = OPCPackage.open(filename);
- XSSFReader r = new XSSFReader(pkg);
- SharedStringsTable sst = r.getSharedStringsTable();
- XMLReader parser = fetchSheetParser(sst);
- // 根據 rId# 或 rSheet# 查詢sheet
- InputStream sheet2 = r.getSheet("rId"+sheetId);
- sheetIndex++;
- InputSource sheetSource = new InputSource(sheet2);
- parser.parse(sheetSource);
- sheet2.close();
- }
- /**
- * 遍歷工作簿中所有的電子表格
- * @param filename
- * @throws Exception
- */
- publicvoid process(String filename) throws Exception {
- OPCPackage pkg = OPCPackage.open(filename);
- XSSFReader r = new XSSFReader(pkg);
- SharedStringsTable sst = r.getSharedStringsTable();
- XMLReader parser = fetchSheetParser(sst);
- Iterator<InputStream> sheets = r.getSheetsData();
- while (sheets.hasNext()) {
- curRow = 0;
- sheetIndex++;
- InputStream sheet = sheets.next();
- InputSource sheetSource = new InputSource(sheet);
- parser.parse(sheetSource);
- sheet.close();
- }
- }
- public XMLReader fetchSheetParser(SharedStringsTable sst)
- throws SAXException {
- XMLReader parser = XMLReaderFactory
- .createXMLReader("org.apache.xerces.parsers.SAXParser");
- this.sst = sst;
- parser.setContentHandler(this);
- return parser;
- }
- publicvoid startElement(String uri, String localName, String name,
- Attributes attributes) throws SAXException {
- // c => 單元格
- if ("c".equals(name)) {
- // 如果下一個元素是 SST 的索引,則將nextIsString標記為true
- String cellType = attributes.getValue("t");
- if ("s".equals(cellType)) {
- nextIsString = true;
- } else {
- nextIsString = false;
- }
- //日期格式
- String cellDateType = attributes.getValue("s");
- if ("1".equals(cellDateType)){
- dateFlag = true;
- } else {
- dateFlag = false;
- }
- String cellNumberType = attributes.getValue("s");
- if("2".equals(cellNumberType)){
- numberFlag = true;
- } else {
- numberFlag = false;
- }
- }
- //當元素為t時
- if("t".equals(name)){
- isTElement = true;
- } else {
- isTElement = false;
- }
- // 置空
- lastContents = "";
- }
- publicvoid endElement(String uri, String localName, String name)
- throws SAXException {
- // 根據SST的索引值的到單元格的真正要儲存的字串
- // 這時characters()方法可能會被呼叫多次
- if (nextIsString) {
- try {
- int idx = Integer.parseInt(lastContents);
- lastContents = new XSSFRichTextString(sst.getEntryAt(idx))
- .toString();
- } catch (Exception e) {
- }
- }
- //t元素也包含字串
- if(isTElement){
- String value = lastContents.trim();
- rowlist.add(curCol, value);
- curCol++;
- isTElement = false;
- // v => 單元格的值,如果單元格是字串則v標籤的值為該字串在SST中的索引
- // 將單元格內容加入rowlist中,在這之前先去掉字串前後的空白符
- } elseif ("v".equals(name)) {
- String value = lastContents.trim();
- value = value.equals("")?" ":value;
- //日期格式處理
- if(dateFlag){
- Date date = HSSFDateUtil.getJavaDate(Double.valueOf(value));
- SimpleDateFormat dateFormat = new SimpleDateFormat(
- "dd/MM/yyyy");
- value = dateFormat.format(date);
- }
- //數字型別處理
- if(numberFlag){
- BigDecimal bd = new BigDecimal(value);
- value = bd.setScale(3,BigDecimal.ROUND_UP).toString();
- }
- rowlist.add(curCol, value);
- curCol++;
- }else {
- //如果標籤名稱為 row ,這說明已到行尾,呼叫 optRows() 方法
- if (name.equals("row")) {
- rowReader.getRows(sheetIndex,curRow,rowlist);
- rowlist.clear();
- curRow++;
- curCol = 0;
- }
- }
- }
- publicvoid characters(char[] ch, int start, int length)
- throws SAXException {
- //得到單元格內容的值
- lastContents += new String(ch, start, length);
- }
- }
- publicclass ExcelReaderUtil {
- //excel2003副檔名
- publicstaticfinal String EXCEL03_EXTENSION = ".xls";
- //excel2007副檔名
- publicstaticfinal String EXCEL07_EXTENSION = ".xlsx";
- /**
- * 讀取Excel檔案,可能是03也可能是07版本
- * @param excel03
- * @param excel07
- * @param fileName
- * @throws Exception
- */
- publicstaticvoid readExcel(IRowReader reader,String fileName) throws Exception{
- // 處理excel2003檔案
- if (fileName.endsWith(EXCEL03_EXTENSION)){
- Excel2003Reader excel03 = new Excel2003Reader();
- excel03.setRowReader(reader);
- excel03.process(fileName);
- // 處理excel2007檔案
- } elseif (fileName.endsWith(EXCEL07_EXTENSION)){
- Excel2007Reader excel07 = new Excel2007Reader();
- excel07.setRowReader(reader);
- excel07.process(fileName);
- } else {
- thrownew Exception("檔案格式錯誤,fileName的副檔名只能是xls或xlsx。");
- }
- }
- }
/**
 * Callback through which the Excel readers deliver one row at a time.
 */
public interface IRowReader {
    /**
     * Handles a single row; the business logic goes here.
     *
     * <p>The readers in this file clear the list right after this method
     * returns, so copy it if the values must be kept.
     *
     * @param sheetIndex sheet counter maintained by the reader
     * @param curRow row index reported by the reader
     * @param rowlist cell values of the row, in column order
     */
    void getRows(int sheetIndex, int curRow, List<String> rowlist);
}
- publicclass RowReader implements IRowReader{
- /* 業務邏輯實現方法
- * @see com.eprosun.util.excel.IRowReader#getRows(int, int, java.util.List)
- */
- publicvoid getRows(int sheetIndex, int curRow, List<String> rowlist) {
- // TODO Auto-generated method stub
- System.out.print(curRow+" ");
- for (int i = 0; i < rowlist.size(); i++) {
- System.out.print(rowlist.get(i) + " ");
- }
- System.out.println();
- }
- }
- publicclass Main {
- publicstaticvoid main(String[] args) throws Exception {
- IRowReader reader = new RowReader();
- //ExcelReaderUtil.readExcel(reader, "F://te03.xls");
- ExcelReaderUtil.readExcel(reader, "F://test07.xlsx");
- }
- }
- publicclass Excel2003Writer {
- /**
- * @param args
- */
- publicstaticvoid main(String[] args) {
- try{
- System.out.println("開始寫入excel2003....");
- writeExcel("tes2003.xls");
- System.out.println("寫完xcel2003");
- } catch (IOException e) {
- }
- }
- /**
- * 寫入excel並填充內容,一個sheet只能寫65536行以下,超出會報異常,寫入時建議使用AbstractExcel2007Writer
- * @param fileName
- * @throws IOException
- */
- publicstaticvoid writeExcel(String fileName) throws IOException{
- // 建立excel2003物件
- Workbook wb = new HSSFWorkbook();
- // 設定檔案放置路徑和檔名
- FileOutputStream fileOut = new FileOutputStream(fileName);
- // 建立新的表單
- Sheet sheet = wb.createSheet("newsheet");
- // 建立新行
- for(int i=0;i<20000;i++){
- Row row = sheet.createRow(i);
- // 建立單元格
- Cell cell = row.createCell(0);
- // 設定單元格值
- cell.setCellValue(1);
- row.createCell(1).setCellValue(1+i);
- row.createCell(2).setCellValue(true);
- row.createCell(3).setCellValue(0.43d);
- row.createCell(4).setCellValue('d');
- row.createCell(5).setCellValue("");
- row.createCell(6).setCellValue("第七列"+i);
- row.createCell(7).setCellValue("第八列"+i);
- }
- wb.write(fileOut);
- fileOut.close();
- }
- }
- /**
- * 抽象excel2007寫入器,先構建一張.xlsx模板,改寫模板中的sheet.xml,使用這種方法
- * 寫入.xlsx檔案,不需要太大的記憶體
- *
- */
- publicabstractclass AbstractExcel2007Writer {
- private SpreadsheetWriter sw;
- /**
- * 寫入電子表格的主要流程
- * @param fileName
- * @throws Exception
- */
- publicvoid process(String fileName) throws Exception{
- // 建立工作簿和電子表格物件
- XSSFWorkbook wb = new XSSFWorkbook();
- XSSFSheet sheet = wb.createSheet("sheet1");
- // 持有電子表格資料的xml檔名 例如 /xl/worksheets/sheet1.xml
- String sheetRef = sheet.getPackagePart().getPartName().getName();
- // 儲存模板
- FileOutputStream os = new FileOutputStream("template.xlsx");
- wb.write(os);
- os.close();
- // 生成xml檔案
- File tmp = File.createTempFile("sheet", ".xml");
- Writer fw = new FileWriter(tmp);
- sw = new SpreadsheetWriter(fw);
- generate();
- fw.close();
- // 使用產生的資料替換模板
- File templateFile = new File("template.xlsx");
- FileOutputStream out = new FileOutputStream(fileName);
- substitute(templateFile, tmp, sheetRef.substring(1), out);
- out.close();
- //刪除檔案之前呼叫一下垃圾回收器,否則無法刪除模板檔案
- System.gc();
- // 刪除臨時模板檔案
- if (templateFile.isFile()&&templateFile.exists()){
- templateFile.delete();
- }
- }
- /**
- * 類使用者應該使用此方法進行寫操作
- * @throws Exception
- */
- publicabstractvoid generate() throws Exception;
- publicvoid beginSheet() throws IOException {
- sw.beginSheet();
- }
- publicvoid insertRow(int rowNum) throws IOException {
- sw.insertRow(rowNum);
- }
- publicvoid createCell(int columnIndex, String value) throws IOException {
- sw.createCell(columnIndex, value, -1);
- }
- publicvoid createCell(int columnIndex, double value) throws IOException {
- sw.createCell(columnIndex, value, -1);
- }
- publicvoid endRow() throws IOException {
- sw.endRow();
- }
- publicvoid endSheet() throws IOException {
- sw.endSheet();
- }
- /**
- *
- * @param zipfile the template file
- * @param tmpfile the XML file with the sheet data
- * @param entry the name of the sheet entry to substitute, e.g. xl/worksheets/sheet1.xml
- * @param out the stream to write the result to
- */
- privatestaticvoid substitute(File zipfile, File tmpfile, String entry,
- OutputStream out) throws IOException {
- ZipFile zip = new ZipFile(zipfile);
- ZipOutputStream zos = new ZipOutputStream(out);
- @SuppressWarnings("unchecked")
- Enumeration<ZipEntry> en = (Enumeration<ZipEntry>) zip.entries();
- while