Excel大檔案時讀取記憶體溢位的解決方案,測試50M的Excel通過。
阿新 • • 發佈:2019-02-18
1.大檔案xlsx的檔案解析,本案例僅僅只是讀取Excel檔案的內容為例。具體需求可按自己的需求改。
package com.sundy.parse.util; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.PackageAccess; import org.apache.poi.xssf.eventusermodel.XSSFReader; import org.apache.poi.xssf.model.SharedStringsTable; import org.apache.poi.xssf.usermodel.XSSFRichTextString; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.XMLReaderFactory; public class ParseXlsxExcel extends DefaultHandler { private OPCPackage opcPackage = null; private XSSFReader xssfReader = null; private boolean nextIsStr; private String cellContent; //單元格的內容 private ArrayList<String> sheetList; // sheet的所有內容 private ArrayList<String> excelList; // Excel的所有內容 private SharedStringsTable sst; private int currentColumn = 0; private XMLReader reader; /** * @param path .xlxs字尾的Excel檔案 * @param sheetId * @throws IOException * @throws OpenXML4JException * @throws SAXException */ public ParseXlsxExcel(String path,int sheetId) throws IOException, OpenXML4JException, SAXException{ init(new File(path)); parseSheet(sheetId); } public ParseXlsxExcel(File file,int sheetId) throws IOException, OpenXML4JException, SAXException{ init(file); parseSheet(sheetId); } /** * @param file .xlxs 字尾的Excel檔案 * @param flag true 表示查詢所有 ,false 表示只獲取sheetId為1的內容 * @throws IOException * @throws OpenXML4JException * @throws SAXException */ public ParseXlsxExcel(File file,boolean flag) throws IOException, OpenXML4JException, SAXException{ initAll(file, flag); } public ParseXlsxExcel(String path,boolean flag) throws IOException, OpenXML4JException, SAXException{ initAll(new File(path), flag); } private void initAll(File file, boolean flag)throws InvalidFormatException, IOException, OpenXML4JException,SAXException { init(file); if(flag){ parseAllSheet(); }else { parseSheet(1); } } private void parseAllSheet() { int x = 0; int y = 0; while(x==y){ try { parseSheet(++x); excelList.addAll(sheetList); y=x; currentColumn=0; } catch (Exception e) { // e.printStackTrace(); } } } private void init(File file) throws IOException, OpenXML4JException, SAXException{ opcPackage = OPCPackage.open((file), PackageAccess.READ); sheetList = new ArrayList<String>(); excelList = new ArrayList<String>(); xssfReader = new XSSFReader(opcPackage); sst = xssfReader.getSharedStringsTable(); reader = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser"); reader.setContentHandler(this); } public void close() throws IOException{ if(opcPackage!=null) opcPackage.close(); } private void parseSheet(int sheetId) throws IOException, OpenXML4JException, SAXException{ sheetList.clear(); InputStream inStream = xssfReader.getSheet("rId"+sheetId); InputSource inputSource = new InputSource(inStream); reader.parse(inputSource); } @Override public void startElement(String uri, String localName, String qName,Attributes attributes) throws SAXException { if(qName.equals("c")){ // c為單元格,v為值,row為行。 String type = attributes.getValue("t"); if(type!=null && type.equals("s")){ nextIsStr = true; }else { nextIsStr = false; } } } @Override public void characters(char[] ch, int start, int length) throws SAXException { cellContent = new String(ch, start, length); } @Override public void endElement(String uri, String localName, String qName) throws SAXException{ if(nextIsStr&&cellContent!=""&&cellContent!=null){ try { int idx = Integer.parseInt(cellContent); cellContent = new XSSFRichTextString(sst.getEntryAt(idx)).toString(); } catch (Exception e) { // e.printStackTrace(); } } if(qName.equals("v")){ sheetList.add(currentColumn++, cellContent); cellContent=""; } } /** sheet的所有內容 */ public ArrayList<String> getSheetList() { return sheetList; } /** Excel的所有內容 */ public ArrayList<String> getExcelList() { return excelList; } public static void main(String[] args) throws Exception{ long start = System.currentTimeMillis(); ParseXlsxExcel excel = new ParseXlsxExcel("C:/Users/admin/Desktop/大師傅家的感覺.xlsx",true); excel.close(); ArrayList<String> list = excel.getExcelList(); long end = System.currentTimeMillis(); System.out.println(list.size());//19677984 System.out.println(end-start); //16389ms } }
2.xls大檔案的解析,參考如下(http://gaosheng08.iteye.com/blog/624604);
package com.sundy.parse.util; import java.io.FileInputStream; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; import org.apache.poi.hssf.eventusermodel.EventWorkbookBuilder.SheetRecordCollectingListener; import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener; import org.apache.poi.hssf.eventusermodel.HSSFEventFactory; import org.apache.poi.hssf.eventusermodel.HSSFListener; import org.apache.poi.hssf.eventusermodel.HSSFRequest; import org.apache.poi.hssf.eventusermodel.MissingRecordAwareHSSFListener; import org.apache.poi.hssf.eventusermodel.dummyrecord.LastCellOfRowDummyRecord; import org.apache.poi.hssf.eventusermodel.dummyrecord.MissingCellDummyRecord; import org.apache.poi.hssf.model.HSSFFormulaParser; import org.apache.poi.hssf.record.BOFRecord; import org.apache.poi.hssf.record.BlankRecord; import org.apache.poi.hssf.record.BoolErrRecord; import org.apache.poi.hssf.record.BoundSheetRecord; import org.apache.poi.hssf.record.FormulaRecord; import org.apache.poi.hssf.record.LabelRecord; import org.apache.poi.hssf.record.LabelSSTRecord; import org.apache.poi.hssf.record.NoteRecord; import org.apache.poi.hssf.record.NumberRecord; import org.apache.poi.hssf.record.RKRecord; import org.apache.poi.hssf.record.Record; import org.apache.poi.hssf.record.SSTRecord; import org.apache.poi.hssf.record.StringRecord; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.poifs.filesystem.POIFSFileSystem; /** * 用於解析大於5M的字尾為.xls的Excel檔案,防止記憶體溢位 * @author sundy * 參考:http://gaosheng08.iteye.com/blog/624604 */ public class ParseXlsExcel implements HSSFListener{ private int minColumns; private POIFSFileSystem fs; private int lastRowNumber; private int lastColumnNumber; /** Should we output the formula, or the value it has? */ private boolean outputFormulaValues = true; /** For parsing Formulas */ private SheetRecordCollectingListener workbookBuildingListener; private HSSFWorkbook stubWorkbook; // Records we pick up as we process private SSTRecord sstRecord; private FormatTrackingHSSFListener formatListener; /** So we known which sheet we're on */ private int sheetIndex = -1; private BoundSheetRecord[] orderedBSRs; @SuppressWarnings("unchecked") private ArrayList boundSheetRecords = new ArrayList(); // For handling formulas with string results private int nextRow; private int nextColumn; private boolean outputNextStringRecord; private int curRow; private List<String> rowlist; @SuppressWarnings( "unused") private String sheetName; private List<String> contents = new ArrayList<String>(); public ParseXlsExcel(POIFSFileSystem fs) throws Exception { this.fs = fs; this.minColumns = -1; this.curRow = 0; this.rowlist = new ArrayList<String>(); } public ParseXlsExcel(String filename) throws Exception { this(new POIFSFileSystem(new FileInputStream(filename))); } //excel記錄行操作方法,以sheet索引,行索引和行元素列表為引數,對sheet的一行元素進行操作,元素為String型別 public void optRows(int sheetIndex,int curRow, List<String> rowlist){ StringBuffer sb = new StringBuffer(); for (int i = 0 ;i< rowlist.size();i++){ String str = rowlist.get(i); if(null!=str&&!"".equals(str)) sb.append(str.trim()+" "); } contents.add(sb.toString()); } /**讀取到的Excel文件,返回文件的content*/ public List<String> getList() { return contents; } /** * 遍歷 excel 檔案 */ public void process() throws IOException { MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this); formatListener = new FormatTrackingHSSFListener(listener); HSSFEventFactory factory = new HSSFEventFactory(); HSSFRequest request = new HSSFRequest(); if (outputFormulaValues) { request.addListenerForAllRecords(formatListener); } else { workbookBuildingListener = new SheetRecordCollectingListener(formatListener); request.addListenerForAllRecords(workbookBuildingListener); } factory.processWorkbookEvents(request, fs); } /** * HSSFListener 監聽方法,處理 Record */ @SuppressWarnings("unchecked") public void processRecord(Record record) { int thisRow = -1; int thisColumn = -1; String thisStr = null; String value = null; switch (record.getSid()) { case BoundSheetRecord.sid: boundSheetRecords.add(record); break; case BOFRecord.sid: BOFRecord br = (BOFRecord) record; if (br.getType() == BOFRecord.TYPE_WORKSHEET) { // Create sub workbook if required if (workbookBuildingListener != null && stubWorkbook == null) { stubWorkbook = workbookBuildingListener.getStubHSSFWorkbook(); } sheetIndex++; if (orderedBSRs == null) { orderedBSRs = BoundSheetRecord.orderByBofPosition(boundSheetRecords); } sheetName = orderedBSRs[sheetIndex].getSheetname(); } break; case SSTRecord.sid: sstRecord = (SSTRecord) record; break; case BlankRecord.sid: BlankRecord brec = (BlankRecord) record; thisRow = brec.getRow(); thisColumn = brec.getColumn(); thisStr = ""; break; case BoolErrRecord.sid: BoolErrRecord berec = (BoolErrRecord) record; thisRow = berec.getRow(); thisColumn = berec.getColumn(); thisStr = ""; break; case FormulaRecord.sid: FormulaRecord frec = (FormulaRecord) record; thisRow = frec.getRow(); thisColumn = frec.getColumn(); if (outputFormulaValues) { if (Double.isNaN(frec.getValue())) { outputNextStringRecord = true; nextRow = frec.getRow(); nextColumn = frec.getColumn(); } else { thisStr = formatListener.formatNumberDateCell(frec); } } else { thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"'; } break; case StringRecord.sid: if (outputNextStringRecord) { // String for formula StringRecord srec = (StringRecord) record; thisStr = srec.getString(); thisRow = nextRow; thisColumn = nextColumn; outputNextStringRecord = false; } break; case LabelRecord.sid: LabelRecord lrec = (LabelRecord) record; curRow = thisRow = lrec.getRow(); thisColumn = lrec.getColumn(); value = lrec.getValue().trim(); value = value.equals("")?"":value; // this.rowlist.add(thisColumn, value); rowlist.add(value); break; case LabelSSTRecord.sid: LabelSSTRecord lsrec = (LabelSSTRecord) record; curRow = thisRow = lsrec.getRow(); thisColumn = lsrec.getColumn(); if (sstRecord != null){ value = sstRecord .getString(lsrec.getSSTIndex()).toString().trim(); value = value.equals("")?"":value; rowlist.add(value); } break; case NoteRecord.sid: NoteRecord nrec = (NoteRecord) record; thisRow = nrec.getRow(); thisColumn = nrec.getColumn(); thisStr = '"' + "(TODO)" + '"'; break; case NumberRecord.sid: NumberRecord numrec = (NumberRecord) record; curRow = thisRow = numrec.getRow(); thisColumn = numrec.getColumn(); value = formatListener.formatNumberDateCell(numrec).trim(); value = value.equals("")?"":value; rowlist.add(value); break; case RKRecord.sid: RKRecord rkrec = (RKRecord) record; thisRow = rkrec.getRow(); thisColumn = rkrec.getColumn(); thisStr = '"' + "(TODO)" + '"'; break; default: break; } // 遇到新行的操作 if (thisRow != -1 && thisRow != lastRowNumber) { lastColumnNumber = -1; } // 空值的操作 if (record instanceof MissingCellDummyRecord) { MissingCellDummyRecord mc = (MissingCellDummyRecord) record; curRow = thisRow = mc.getRow(); thisColumn = mc.getColumn(); // rowlist.add(thisColumn," "); } // 更新行和列的值 if (thisRow > -1) lastRowNumber = thisRow; if (thisColumn > -1) lastColumnNumber = thisColumn; // 行結束時的操作 if (record instanceof LastCellOfRowDummyRecord) { if (minColumns > 0) { // 列值重新置空 if (lastColumnNumber == -1) { lastColumnNumber = 0; } } // 行結束時, 呼叫 optRows() 方法 lastColumnNumber = -1; try { optRows(sheetIndex,curRow, rowlist); } catch (Exception e) { e.printStackTrace(); } rowlist.clear(); } } public static void main(String[] args) throws Exception { ParseXlsExcel excel = new ParseXlsExcel("F:\\srv\\tiptop\\crawl\\www.anxiang.gov.cn\\Excel\\3fbc7e659d1f42311ee89c6d47fac414.xls"); excel.process(); int size = excel.getList().size(); List<String> list2 = excel.getList(); String reg = "楚源收儲點"; Pattern pattern = Pattern.compile(reg); int x = 0; for (int i = 0; i < size; i++) { String replace = list2.get(i).trim().replace("",""); if(null!=replace&&!"".equals(replace)){ if(pattern.matcher(replace).find()) x+=1; System.out.println(replace); } } System.err.println(" 存在關鍵字【"+reg+"】的個數: "+x); } }