java利用poi解析docx生成html
阿新 • • 發佈:2018-12-20
公司業務需要把Word文件中編輯好的新聞(文字+圖片)錄入到CMS管理後臺,生成一篇新聞釋出。因為不能把圖片直接複製貼上到UEditor編輯器上,還要一個一個上傳太麻煩。所以這裡做了一個上傳docx檔案解析後,直接返回html正文放到前端編輯器繼續編輯。 功能要求: 1.圖片要下載到伺服器指定位置,並把前端請求圖片地址拼接到img標籤的src上。 2.圖片文字要按照順序排列。 3.過濾掉超連結、其他圖形等一般新聞不用的元素。 實現:
1.maven最小依賴,3.17版本支援jdk1.6及以上。4版本需要jdk1.8及以上。
<dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>3.17</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>3.17</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml-schemas</artifactId> <version>3.17</version> </dependency>
2.程式碼實現
import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.InputStream; import java.util.List; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFParagraph; import org.apache.poi.xwpf.usermodel.XWPFPictureData; import org.apache.poi.xwpf.usermodel.XWPFRun; import org.apache.xmlbeans.XmlCursor; import org.apache.xmlbeans.XmlObject; import org.openxmlformats.schemas.drawingml.x2006.main.CTGraphicalObject; import org.openxmlformats.schemas.drawingml.x2006.picture.CTPicture; import org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTInline; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDrawing; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText; public class AnalyzeDocx { public static void main(String[] args) throws Exception { String content = analyzeDocx("e://abc.docx"); System.out.println(content); } public static String analyzeDocx(String path) throws Exception { StringBuilder sb = new StringBuilder(); try (InputStream in = new FileInputStream(path); XWPFDocument xwpfDocument = new XWPFDocument(in);) { List<XWPFParagraph> paragraphs = xwpfDocument.getParagraphs(); for (XWPFParagraph xwpfParagraph : paragraphs) { List<XWPFRun> runs = xwpfParagraph.getRuns(); for (XWPFRun xwpfRun : runs) { CTR ctr = xwpfRun.getCTR(); XmlCursor newCursor = ctr.newCursor(); newCursor.selectPath("./*"); while (newCursor.toNextSelection()) { XmlObject object = newCursor.getObject(); if (object instanceof CTText) {// 文字 CTText ctText = (CTText) object; if (ctText.isSetSpace()) { continue;// 先不支援超連結 } String text = ctText.getStringValue(); if (text != null && text.length() > 0) { sb.append(text); } } else if (object instanceof CTDrawing) {// 圖片1 CTDrawing drawing = (CTDrawing) object; CTInline[] inlineArray = drawing.getInlineArray(); for (CTInline ctInline : inlineArray) { CTGraphicalObject graphic = 
ctInline.getGraphic(); XmlCursor newCursor2 = graphic.getGraphicData().newCursor(); newCursor2.selectPath("./*"); while (newCursor2.toNextSelection()) { XmlObject object2 = newCursor2.getObject(); if (object2 instanceof CTPicture) { CTPicture picture = (org.openxmlformats.schemas.drawingml.x2006.picture.CTPicture) object2; sb.append("<br>").append( imgHtml(xwpfDocument, picture.getBlipFill().getBlip().getEmbed())) .append("<br>"); } } } } } } sb.append("<br>");// 分段 } } catch (Exception e) { e.printStackTrace(); } return sb.toString(); } private static String imgHtml(XWPFDocument xwpfDocument, String blipID) { XWPFPictureData pictureData = xwpfDocument.getPictureDataByID(blipID); String imageName = pictureData.getFileName(); String newfilename = System.currentTimeMillis() + imageName; byte[] bytev = pictureData.getData(); try (FileOutputStream fos = new FileOutputStream("E:/" + newfilename);) { fos.write(bytev);// 此處儲存圖片後,變成可訪問的http然後用<img>標籤包裹 } catch (Exception e) { e.printStackTrace(); } return "<img src='/rongmeitiapi/api/picture/find/image/20181107/d66ce5ffc18365a3dab1e46c484dfabb.jpeg'>"; } }
imgHtml方法需要把圖片重新命名後,變成前端可訪問的連結,再去拼接img標籤。我這邊因為是測試,所以寫死了img標籤。 注意:這個只是處理正常的可檢視的圖片,對於emf型別的圖片不處理,因為新聞也用不到。 如果需要捕獲所有的,請參考https://www.jb51.net/article/132091.htm