POI读取第三方下载的Word文档

in 编程
关注公众号【好便宜】( ID:haopianyi222 ),领红包啦~
阿里云,国内最大的云服务商,注册就送数千元优惠券:https://t.cn/AiQe5A0g
腾讯云,良心云,价格优惠: https://t.cn/AieHwwKl
搬瓦工,CN2 GIA 优质线路,搭梯子、海外建站推荐: https://t.cn/AieHwfX9

从第三方获取的Word文档可能是由其他格式(例如:HTML)转换而来的,此时直接用POI读取可能会失败。下面以HTML为例。

依赖
 <!-- parse word -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>4.0.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>4.0.1</version>
        </dependency>

        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>
代码片段
/**
     * Opens the Word file at {@code path} with POI and, when POI reports that the
     * file is actually HTML, falls back to parsing it with jsoup and printing the
     * non-blank {@code span} texts found inside table cells.
     *
     * <p>Reading the content of a genuine {@code .doc}/{@code .docx} document is left
     * as a placeholder, so this method currently always returns {@code null} when it
     * completes normally.
     *
     * @param path path of the file to parse; must not be null or blank
     * @return always {@code null} in this snippet
     * @throws ParseWordException if {@code path} is empty, the file cannot be opened,
     *                            the extension is neither DOC nor DOCX, or parsing or
     *                            the HTML fallback fails
     */
    private String parseWord(String path) throws ParseWordException {
        // inspect
        if (isEmpty(path)) {
            throw new ParseWordException(Code.PARAM_EMPTY.getCode(), Code.PARAM_EMPTY.getMessage());
        }

        // reader
        File file = new File(path);
        final FileInputStream fis;
        try {
            fis = new FileInputStream(file);
        } catch (FileNotFoundException e) {
            throw new ParseWordException(Code.READER_FILE_FAILURE.getCode(), Code.READER_FILE_FAILURE.getMessage());
        }

        // parse
        // Locale.ROOT keeps the extension comparison independent of the default locale.
        String upperPath = path.toUpperCase(java.util.Locale.ROOT);

        // try-with-resources closes the stream (and the POI documents) on every path;
        // the stream is already closed by the time a catch block below runs, so the
        // HTML fallback can safely reopen the same file.
        try (FileInputStream in = fis) {
            if (upperPath.endsWith(FileType.DOC.toString())) {
                try (HWPFDocument wordDoc = new HWPFDocument(in)) {
                    // placeholder: read the .doc content here
                }
            } else if (upperPath.endsWith(FileType.DOCX.toString())) {
                try (XWPFDocument wordDocx = new XWPFDocument(in)) {
                    // placeholder: read the .docx content here
                }
            } else {
                // unsupported file type
                throw new ParseWordException(Code.FILE_TYPE_ILLEGAL.getCode(), Code.FILE_TYPE_ILLEGAL.getMessage());
            }
        } catch (IllegalArgumentException ie) {
            String message = ie.getMessage();
            System.out.println(message);
            if (isEmpty(message)) {
                throw new ParseWordException(Code.PARAM_EMPTY.getCode(), Code.PARAM_EMPTY.getMessage());
            }
            if (!message.contains("The document is really a HTML file")) {
                // Any other IllegalArgumentException used to be swallowed, making a failed
                // parse indistinguishable from a successful one; report it instead.
                throw new ParseWordException(Code.PARSE_FAILURE.getCode(), Code.PARSE_FAILURE.getMessage());
            }

            // format conversion: the "Word" file is really HTML, so parse it as HTML
            try {
                String htmlPath = parseHtml(file);
                Document doc = Jsoup.parse(new File(htmlPath), "GBK"); // charset: adjust to the real encoding
                // selects every tbody inside a table; adjust the selector to the actual markup
                Elements elements = doc.select("table").select("tbody");
                elements.forEach(e -> {
                    // prints every non-blank span text inside td cells; cells may also
                    // contain images, which are not handled here
                    e.select("td").select("span").eachText().stream()
                            .filter(d -> d != null && d.trim().length() > 0)
                            .forEach(System.out::println);
                });
            } catch (IOException e) {
                throw new ParseWordException(Code.FILE_CONVERT_FAILURE.getCode(), Code.FILE_CONVERT_FAILURE.getMessage());
            }
        } catch (IOException e) {
            throw new ParseWordException(Code.PARSE_FAILURE.getCode(), Code.PARSE_FAILURE.getMessage());
        }
        return null;
    }

    /**
     * Copies the bytes of {@code readerFile} unchanged into a file with an
     * {@code .html} name and returns that file's path, so the content can be
     * parsed as HTML.
     *
     * <p>The destination is a fixed placeholder path; any file already present
     * there is removed first.
     *
     * @param readerFile the source file to copy
     * @return the path of the written HTML copy
     * @throws IOException if the source cannot be read or the copy cannot be written
     */
    private String parseHtml(File readerFile) throws IOException {
        String tempPath = "d:\\1.html"; // placeholder temp-file location; change to suit your environment

        File outFile = new File(tempPath);
        if (outFile.exists()) {
            outFile.delete(); // remove the copy left over from a previous run
        }

        // try-with-resources closes both streams even when the copy fails; without it
        // the handles leak and the file can stay locked for the next delete().
        try (FileInputStream fis = new FileInputStream(readerFile);
             FileOutputStream fileOutputStream = new FileOutputStream(outFile)) {
            byte[] buffer = new byte[1024];
            int len;
            while ((len = fis.read(buffer)) != -1) {
                fileOutputStream.write(buffer, 0, len);
            }
        }

        return tempPath;
    }

    /**
     * Demo entry point: runs {@code parseWord} on a sample document path.
     *
     * @param args command-line arguments (unused)
     * @throws IOException        declared by the original signature
     * @throws ParseWordException if parsing the sample document fails
     */
    public static void main(String[] args) throws IOException, ParseWordException {
        final String samplePath = "D:\\aaa.doc";
        new ParseWorld().parseWord(samplePath);
    }

    /**
     * Tells whether a string carries no content.
     *
     * @param str the string to check; may be {@code null}
     * @return {@code true} if {@code str} is {@code null} or is empty after
     *         {@link String#trim()}, {@code false} otherwise
     */
    private boolean isEmpty(String str) {
        if (str == null) {
            return true;
        }
        return str.trim().isEmpty();
    }
关注公众号【好便宜】( ID:haopianyi222 ),领红包啦~
阿里云,国内最大的云服务商,注册就送数千元优惠券:https://t.cn/AiQe5A0g
腾讯云,良心云,价格优惠: https://t.cn/AieHwwKl
搬瓦工,CN2 GIA 优质线路,搭梯子、海外建站推荐: https://t.cn/AieHwfX9
扫一扫关注公众号添加购物返利助手,领红包
Comments are closed.

推荐使用阿里云服务器

超多优惠券

服务器最低一折,一年不到100!

朕已阅去看看