Java 將Word2003(doc)/Word2007(docx)轉Html格式檔案
阿新 • • 發佈:2019-01-28
程式碼實現:
這裡將Word轉為html格式的字串返回給前臺,可以通過我下面提供的方法使其轉換成txt和html檔案import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.converter.PicturesManager; import org.apache.poi.hwpf.converter.WordToHtmlConverter; import org.apache.poi.hwpf.usermodel.PictureType; import org.apache.poi.xwpf.converter.core.BasicURIResolver; import org.apache.poi.xwpf.converter.core.FileImageExtractor; import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.w3c.dom.Document; import javax.xml.parsers.DocumentBuilderFactory; import java.io.*; import java.nio.file.Path; import java.nio.file.Paths; /** * @ClassName:WordToString * @Description: * @author: * @data:2017/10/24 */ public class WordToString { public static void main(String[] args) throws Throwable { //final String path = "D:\\Test\\xxx.doc"; final String filePath = "D:\\Test\\xxx.docx"; readWordToString(filePath); } public static String readWordToString(String filePath) throws Exception{ String str = ""; if (FileNameUtil.isWord2003(filePath)) { // docToHtml(filePath, "D:\\Test\\Word2003(doc).html"); str = docToHtml(filePath, "D:\\Test\\Word2003(doc).html"); // System.out.print(doc); // FileNameUtil.StringToFile(doc, "D:\\Test\\xxx.txt"); // FileNameUtil.txtToHtml("D:\\Test\\xxx.txt", "D:\\Test\\xxx.html"); } if (FileNameUtil.isWord2007(filePath)) { str = docxToHtml(filePath, "D:\\Test\\Word2007(docx).html"); // System.out.print(docx); // FileNameUtil.StringToFile(docx, "D:\\Test\\xxx.txt"); // FileNameUtil.txtToHtml("D:\\Test\\xxx.txt", "D:\\Test\\xxx.html"); } return str; } /* * doc轉換為html * docFilename:源word檔案路徑 * htmlFilename:生成的html檔案路徑 */ public static String docToHtml(String docFilename, String targetFileName) throws Exception { final Path imagePath = Paths.get(targetFileName).getParent().resolve("image"); HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(docFilename)); Document document = 
DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument(); WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document); // 儲存圖片,並返回圖片的相對路徑 wordToHtmlConverter.setPicturesManager(new PicturesManager() { @Override public String savePicture(byte[] content, PictureType pictureType, String name, float width, float height) { try (FileOutputStream out = new FileOutputStream(imagePath.resolve(name).toString())) { out.write(content); } catch (Exception e) { e.printStackTrace(); } return "../tmp/image/" + name; } }); wordToHtmlConverter.processDocument(wordDocument); try { String str = ""; FileInputStream in = new FileInputStream(targetFileName); // size 為字串的長度 ,這裡一次性讀完 int size = in.available(); byte[] buffer = new byte[size]; in.read(buffer); in.close(); str = new String(buffer, "UTF-8"); return str; } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); return null; } } /* * docx轉換為html * sourceFilePath:源word檔案路徑 * targetFileName:生成的html檔案路徑 */ public static String docxToHtml(String sourceFilePath, String targetFileName) throws Exception { String imagePathStr = Paths.get(targetFileName).getParent().resolve("../tmp/image/word/media").toString(); OutputStreamWriter outputStreamWriter = null; try { XWPFDocument document = new XWPFDocument(new FileInputStream(sourceFilePath)); XHTMLOptions options = XHTMLOptions.create(); // 存放圖片的資料夾 options.setExtractor(new FileImageExtractor(new File(imagePathStr))); // html中圖片的路徑 options.URIResolver(new BasicURIResolver("../tmp/image/word/media")); String str = ""; FileInputStream in = new FileInputStream(targetFileName); // size 為字串的長度 ,這裡一次性讀完 int size = in.available(); byte[] buffer = new byte[size]; in.read(buffer); in.close(); str = new String(buffer, "UTF-8"); return str; } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); return null; } } }
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.regex.Pattern;

/**
 * File-name checks and small text-file helpers used by the Word-to-HTML example.
 *
 * @data 2017/10/24
 */
public class FileNameUtil {

    // Compiled once; the regex text is unchanged from the original String.matches calls.
    private static final Pattern WORD_2003 = Pattern.compile("^.+\\.(?i)(doc)$");
    private static final Pattern WORD_2007 = Pattern.compile("^.+\\.(?i)(docx)$");
    private static final Pattern EXCEL_2003 = Pattern.compile("^.+\\.(?i)(xls)$");
    private static final Pattern EXCEL_2007 = Pattern.compile("^.+\\.(?i)(xlsx)$");
    private static final Pattern PDF = Pattern.compile("^.+\\.(?i)(pdf)$");

    /** Returns {@code true} when {@code filePath} ends in {@code .doc} (case-insensitive); {@code false} for null. */
    public static boolean isWord2003(String filePath) {
        return matches(WORD_2003, filePath);
    }

    /** Returns {@code true} when {@code filePath} ends in {@code .docx} (case-insensitive); {@code false} for null. */
    public static boolean isWord2007(String filePath) {
        return matches(WORD_2007, filePath);
    }

    /** Returns {@code true} when {@code filePath} ends in {@code .xls} (case-insensitive); {@code false} for null. */
    public static boolean isExcel2003(String filePath) {
        return matches(EXCEL_2003, filePath);
    }

    /** Returns {@code true} when {@code filePath} ends in {@code .xlsx} (case-insensitive); {@code false} for null. */
    public static boolean isExcel2007(String filePath) {
        return matches(EXCEL_2007, filePath);
    }

    /** Returns {@code true} when {@code filePath} ends in {@code .pdf} (case-insensitive); {@code false} for null. */
    public static boolean isPDF(String filePath) {
        return matches(PDF, filePath);
    }

    /** Null-safe full match of {@code filePath} against {@code pattern}. */
    private static boolean matches(Pattern pattern, String filePath) {
        return filePath != null && pattern.matcher(filePath).matches();
    }

    /**
     * Saves a string to a text file, replacing any existing content.
     *
     * <p>The text is written as UTF-8 so that {@link #txtToHtml(String, String)}, which
     * reads UTF-8, gets back the same characters regardless of the platform charset.
     * An I/O failure is reported on {@code System.err} and not rethrown.
     *
     * @param str      text to write
     * @param filename path of the file to create or overwrite
     */
    public static void StringToFile(String str, String filename) {
        try (Writer writer = new OutputStreamWriter(new FileOutputStream(new File(filename)),
                StandardCharsets.UTF_8)) {
            writer.write(str);
        } catch (IOException e) {
            // Keep the original non-throwing contract, but do not hide the failure.
            System.err.println("StringToFile failed for " + filename + ": " + e);
        }
    }

    /**
     * Copies a UTF-8 text file to an HTML file, appending {@code </br>} after every line.
     *
     * <p>Prints a message and does nothing else when {@code filePath} is not an existing
     * regular file. Any failure while copying is printed, not rethrown.
     *
     * @param filePath     path of the source text file
     * @param htmlPosition path of the HTML file to create or overwrite
     */
    public static void txtToHtml(String filePath, String htmlPosition) {
        try {
            File file = new File(filePath);
            if (!file.isFile()) {
                System.out.println("找不到指定的檔案");
                return;
            }
            // try-with-resources closes both ends even when the copy fails midway.
            try (BufferedReader reader = new BufferedReader(
                         new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8));
                 BufferedWriter writer = new BufferedWriter(
                         new OutputStreamWriter(new FileOutputStream(new File(htmlPosition)),
                                 StandardCharsets.UTF_8))) {
                String lineTxt;
                while ((lineTxt = reader.readLine()) != null) {
                    writer.write(lineTxt + "</br>");
                }
            }
        } catch (Exception e) {
            // Broad on purpose: this helper has always reported every failure instead of throwing.
            System.out.println("讀取檔案內容出錯");
            e.printStackTrace();
        }
    }
}
需要說明的一點:我匯入 poi 包後,在轉換 docx 檔案時會報錯,最後通過額外匯入 ooxml-schemas-1.1.jar 得以解決。