doc,docx,pdf,ppt等檔案型別讀取方法
阿新 • • 發佈:2019-02-11
//讀取ppt檔案
/**
 * Extracts the text of a legacy PowerPoint (.ppt) file.
 *
 * <p>Any exception raised while opening or parsing the file is reported on standard
 * output and an empty string is returned instead.
 *
 * @param file path of the .ppt file to read
 * @return the text produced by the extractor, or {@code ""} if reading failed
 * @throws IOException if closing the underlying file stream fails
 */
public String readPPT(String file) throws IOException {
    String re = "";
    InputStream is = null;
    try {
        is = new FileInputStream(new File(file));
        PowerPointExtractor extractor = new PowerPointExtractor(is);
        re = extractor.getText();
    } catch (Exception e) {
        System.out.println("讀取ppt出錯"+e.toString());
    } finally {
        // The stream is still null when the file could not be opened; closing it
        // unconditionally would turn the already-reported error into a
        // NullPointerException thrown out of this method.
        if (is != null) {
            is.close();
        }
    }
    return re;
}
//讀取pptx檔案
/**
 * Extracts the text of an OOXML PowerPoint (.pptx) file.
 *
 * <p>Any exception raised while opening or parsing the package is reported on standard
 * output and an empty string is returned instead.
 *
 * @param file path of the .pptx file to read
 * @return the text produced by the extractor, or {@code ""} if reading failed
 * @throws IOException if closing the package fails
 */
public String readPPT2007(String file) throws IOException {
    OPCPackage opc = null;
    String re = "";
    try {
        opc = POIXMLDocument.openPackage(file);
        re = new XSLFPowerPointExtractor(opc).getText();
    } catch (Exception e) {
        System.out.println("讀取pptx出錯"+e.toString());
    } finally {
        // opc stays null when openPackage itself failed; guard so the reported
        // error is not replaced by a NullPointerException.
        // NOTE(review): close() on a package opened read-write may write it back to
        // the source file; revert() may be the better call for read-only use — confirm.
        if (opc != null) {
            opc.close();
        }
    }
    return re;
}
// 讀取pdf檔案
/**
 * Extracts the text of a PDF file.
 *
 * <p>Any exception raised while opening, parsing or stripping the document has its
 * stack trace printed, and the method then returns whatever was extracted so far.
 *
 * @param file path of the PDF file to read
 * @return the extracted text, or {@code null} if extraction failed
 * @throws IOException if closing the file stream or the document fails
 */
public String readPDF(String file) throws IOException {
    String result = null;
    FileInputStream is = null;
    PDDocument document = null;
    try {
        is = new FileInputStream(file);
        PDFParser parser = new PDFParser(is);
        parser.parse();
        document = parser.getPDDocument();
        PDFTextStripper stripper = new PDFTextStripper();
        result = stripper.getText(document);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Nested finally: the document must still be closed when closing the
        // stream throws, otherwise the document is leaked.
        try {
            if (is != null) {
                is.close();
            }
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }
    return result;
}
// 讀取doc檔案
/**
 * Extracts the text of a legacy Word (.doc) file.
 *
 * <p>A missing file or an I/O failure has its stack trace printed and an empty string
 * is returned; this method declares no checked exceptions.
 *
 * @param file path of the .doc file to read
 * @return the text produced by the extractor, or {@code ""} if reading failed
 */
public String readWord(String file){
    String result = "";
    FileInputStream in = null;
    try {
        in = new FileInputStream(file);
        WordExtractor wordExtractor = new WordExtractor(in);
        result = wordExtractor.getText();
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Keep a reference to the stream so it can be released here; it was
        // previously created inline and never closed.
        if (in != null) {
            try {
                in.close();
            } catch (IOException e) {
                // The signature has no throws clause, so report the failure the
                // same way as the other I/O errors in this method.
                e.printStackTrace();
            }
        }
    }
    return result;
}
// 讀取docx檔案
/**
 * Extracts the text of an OOXML Word (.docx) file.
 *
 * <p>Any exception raised while opening or parsing the package has its stack trace
 * printed and an empty string is returned instead.
 *
 * @param file path of the .docx file to read
 * @return the text produced by the extractor, or {@code ""} if reading failed
 * @throws IOException if closing the package fails
 */
public String readDocx(String file) throws IOException {
    String result = "";
    OPCPackage opc = null;
    try {
        opc = POIXMLDocument.openPackage(file);
        result = new XWPFWordExtractor(opc).getText();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // opc stays null when openPackage itself failed; guard so the reported
        // error is not replaced by a NullPointerException.
        // NOTE(review): close() on a package opened read-write may write it back to
        // the source file; revert() may be the better call for read-only use — confirm.
        if (opc != null) {
            opc.close();
        }
    }
    return result;
}
// 讀取xls檔案
/**
 * Extracts the cell text of a legacy Excel (.xls) workbook.
 *
 * <p>Sheets are visited in order. For every row that exists, the converted text of its
 * cells is concatenated without a separator and the row is terminated with
 * {@code "\n"}. Rows that are absent from the sheet are skipped. Any exception has its
 * stack trace printed and the text collected up to that point is returned.
 *
 * @param file path of the .xls file to read
 * @return the concatenated cell text; empty if nothing could be read
 */
public String readEXCEL(String file){
    StringBuilder content = new StringBuilder();
    FileInputStream in = null;
    try {
        in = new FileInputStream(file);
        HSSFWorkbook workbook = new HSSFWorkbook(in);
        for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
            HSSFSheet sheet = workbook.getSheetAt(sheetIndex);
            if (sheet == null) {
                continue;
            }
            for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                HSSFRow row = sheet.getRow(rowIndex);
                // getRow returns null for rows never written; dereferencing it
                // aborted the whole extraction with a NullPointerException.
                if (row == null) {
                    continue;
                }
                // getLastCellNum() is the last cell index PLUS ONE (-1 for a row
                // without cells), so the bound is exclusive.
                for (int cellIndex = 0; cellIndex < row.getLastCellNum(); cellIndex++) {
                    // The converter yields "" for null/empty cells, so appending
                    // its result directly needs no length check.
                    content.append(this.convertCellHSSFCell(row.getCell(cellIndex)));
                }
                content.append("\n");
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Release the file handle; the stream was previously created inline and
        // never closed.
        if (in != null) {
            try {
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return content.toString();
}
/**
 * Renders one .xls cell as trimmed text.
 *
 * <p>Numeric cells are formatted with the default {@link NumberFormat} instance with
 * grouping disabled; string and blank cells use their string value; boolean cells
 * become {@code "true"}/{@code "false"}; error cells become the decimal error code.
 * Every other cell type yields an empty string.
 *
 * @param cell the cell to convert, may be {@code null}
 * @return the trimmed textual value, or {@code ""} for a {@code null} cell
 */
private String convertCellHSSFCell(HSSFCell cell){
    if (cell == null) {
        return "";
    }

    String text;
    switch (cell.getCellType()) {
        case HSSFCell.CELL_TYPE_NUMERIC: {
            // Formatter is only needed for numeric cells.
            NumberFormat plainNumbers = NumberFormat.getInstance();
            plainNumbers.setGroupingUsed(false);
            text = plainNumbers.format(cell.getNumericCellValue());
            break;
        }
        case HSSFCell.CELL_TYPE_STRING:
        case HSSFCell.CELL_TYPE_BLANK:
            text = cell.getStringCellValue();
            break;
        case HSSFCell.CELL_TYPE_BOOLEAN:
            text = String.valueOf(cell.getBooleanCellValue());
            break;
        case HSSFCell.CELL_TYPE_ERROR:
            text = String.valueOf(cell.getErrorCellValue());
            break;
        default:
            text = "";
            break;
    }
    return text.trim();
}
// 讀取xlsx檔案
/**
 * Extracts the cell text of an OOXML Excel (.xlsx) workbook.
 *
 * <p>Sheets are visited in order. For every row that exists, the converted text of its
 * cells is concatenated without a separator and the row is terminated with
 * {@code "\n"}, matching the output layout of {@code readEXCEL}. Rows that are absent
 * from the sheet are skipped. Any exception is reported on standard output and the
 * text collected up to that point is returned.
 *
 * @param file path of the .xlsx file to read
 * @return the concatenated cell text; empty if nothing could be read
 * @throws IOException declared for callers; failures inside the method are caught
 */
public String readEXCEL2007(String file) throws IOException {
    XSSFWorkbook workbook = null;
    StringBuilder content = new StringBuilder();
    try {
        workbook = new XSSFWorkbook(file);
        for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
            XSSFSheet sheet = workbook.getSheetAt(sheetIndex);
            if (sheet == null) {
                continue;
            }
            for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                XSSFRow row = sheet.getRow(rowIndex);
                if (row == null) {
                    continue;
                }
                // getLastCellNum() is the last cell index PLUS ONE (-1 for a row
                // without cells), so the bound is exclusive.
                for (int cellIndex = 0; cellIndex < row.getLastCellNum(); cellIndex++) {
                    // The converter yields "" for null/empty cells, so appending
                    // its result directly needs no null or length check.
                    content.append(this.convertCellXHSSFCell(row.getCell(cellIndex)));
                }
                // One line break per row: it used to sit inside the cell loop and
                // was emitted after every cell.
                content.append("\n");
            }
        }
    } catch (Exception ex) {
        System.out.println("讀取excel出錯"+ex.toString());
    } finally {
        // Release the underlying package so the file handle is not held until
        // garbage collection. revert() closes WITHOUT saving, so the source file
        // is never rewritten by a read-only extraction.
        if (workbook != null && workbook.getPackage() != null) {
            workbook.getPackage().revert();
        }
    }
    return content.toString();
}
/**
 * Renders one .xlsx cell as trimmed text.
 *
 * <p>Numeric cells are formatted with the default {@link NumberFormat} instance with
 * grouping disabled; string and blank cells use their string value; boolean cells
 * become {@code "true"}/{@code "false"}; error cells become the decimal error code.
 * Every other cell type yields an empty string.
 *
 * @param aCell the cell to convert, may be {@code null}
 * @return the trimmed textual value, or {@code ""} for a {@code null} cell
 */
private String convertCellXHSSFCell(XSSFCell aCell) {
    if (aCell == null) {
        return "";
    }

    final int type = aCell.getCellType();
    String raw;
    if (type == HSSFCell.CELL_TYPE_NUMERIC) {
        // Formatter is only needed for numeric cells.
        NumberFormat ungrouped = NumberFormat.getInstance();
        ungrouped.setGroupingUsed(false);
        raw = ungrouped.format(aCell.getNumericCellValue());
    } else if (type == HSSFCell.CELL_TYPE_STRING || type == HSSFCell.CELL_TYPE_BLANK) {
        raw = aCell.getStringCellValue();
    } else if (type == HSSFCell.CELL_TYPE_BOOLEAN) {
        raw = String.valueOf(aCell.getBooleanCellValue());
    } else if (type == HSSFCell.CELL_TYPE_ERROR) {
        raw = String.valueOf(aCell.getErrorCellValue());
    } else {
        raw = "";
    }
    return raw.trim();
}
所需jar包的maven座標 :
<!-- Apache POI core and OOXML support, both at 3.9. -->
<!-- NOTE(review): WordExtractor and PowerPointExtractor are packaged in poi-scratchpad
     in POI 3.x, which is not declared in this list; confirm it arrives transitively
     or add it explicitly. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.9</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.9</version>
</dependency>
<!-- xdocreport XWPF converters, version 1.0.6. -->
<!-- NOTE(review): the same two coordinates (converter.pdf, converter.core) are
     declared again further down with version 1.0.5; keep one version of each. -->
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>org.apache.poi.xwpf.converter.pdf</artifactId>
<version>1.0.6</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>org.apache.poi.xwpf.converter.core</artifactId>
<version>1.0.6</version>
</dependency>
<dependency>
<groupId>org.apache.directory.studio</groupId>
<artifactId>org.apache.commons.collections</artifactId>
<version>3.2.1</version>
</dependency>
<!-- xdocreport XWPF converters, version 1.0.5 (core and pdf duplicate the 1.0.6
     declarations above; xhtml is only declared here). -->
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>org.apache.poi.xwpf.converter.core</artifactId>
<version>1.0.5</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>org.apache.poi.xwpf.converter.pdf</artifactId>
<version>1.0.5</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
<version>1.0.5</version>
</dependency>
<!-- Apache PDFBox. -->
<!-- NOTE(review): pdfbox is pinned to 1.8.13 while pdfbox-tools is 2.0.8, a different
     major line; presumably only one line is intended — confirm which one the PDF
     reading code is written against. -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>1.8.13</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox-tools</artifactId>
<version>2.0.8</version>
</dependency>