apache pdfbox

mil · 发表于 2015-8-6 08:26:05

　　转 http://www.blogjava.net/sxyx2008/archive/2010/07/23/326890.html
　　轻松使用apache pdfbox将pdf文件生成图片
　　近期在项目中做了大量的报表开发，需要将html页面中的表格内容导出为pdf、word、excel和图片，前三者都比较好实现，唯独最后一项——使用ImageIO生成的图片有点惨不忍睹。经过大量google后发现，pdfbox这个组件不错，可以轻松地将pdf文件生成图片。本以为问题就此解决了，但实际使用中并非如此，受到了很多致命性的打击：pdfbox在处理中文pdf时表现得比较脆弱，但对英文版的pdf导出图片，那是杠杠的。尽管这样，还是记录一下，毕竟这方面的资料很少，我几乎搜遍了整个google、baidu才搜集到那么一点点资料，这里跟大家分享下。所依赖的JAR：commons-logging-1.1.1.jar、fontbox-1.2.1.jar、pdfbox-1.2.1.jar。示例代码：

/*

* Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.

*/

package com.future.pdfbox.image; import java.awt.image.BufferedImage; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.util.Iterator; import java.util.List; import javax.imageio.IIOImage; import javax.imageio.ImageIO; import javax.imageio.ImageWriter; import javax.imageio.stream.ImageOutputStream; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; public class ExtractImages

{

public static void main(String[] args) throws IOException {          PDDocument doc = PDDocument.load("F:\\1.pdf");       int pageCount = doc.getPageCount();          System.out.println(pageCount);          List pages = doc.getDocumentCatalog().getAllPages();          for(int i=0;iConfig Build Path->Add Jars”命令，把工程lib目录下面的包都加入工程的Build Path。
4、使用PDFBox解析PDF内容
　　在刚刚创建的Eclipse工程中，创建一个ch7.pdfbox包，并创建一个PdfboxTest类。该类包含一个getText方法，用于从一个PDF中获取文本信息，其代码如下。
　　import java.io.BufferedWriter;          import java.io.FileInputStream;          import java.io.FileWriter;
         import org.pdfbox.pdfparser.PDFParser;          import org.pdfbox.util.PDFTextStripper;
         public class PdfParser {
            /**             * @param args             */             // TODO 自动生成方法存根
   public static void main(String[] args) throws Exception{                         FileInputStream fis = new FileInputStream("F:\\task\\lerman-atem2001.pdf");                         BufferedWriter writer = new BufferedWriter(new FileWriter("F:\\task\\pdf_change.txt"));                      PDFParser p = new PDFParser(fis);                         p.parse();                               PDFTextStripper ts = new PDFTextStripper();                               String s = ts.getText(p.getPDDocument());                         writer.write(s);                      System.out.println(s);                         fis.close();                         writer.close();                                     }          }
　　下面是自己按照书上的例子写的代码。

1package TestPDF.pdfbox; 2 3import java.io.File; 4import java.io.FileOutputStream; 5import java.io.IOException; 6import java.io.OutputStreamWriter; 7import java.io.Writer; 8import java.net.URL; 9  10import org.apache.lucene.analysis.standard.StandardAnalyzer;  11import org.apache.lucene.document.Document;  12import org.apache.lucene.index.IndexWriter;  13import org.apache.lucene.index.Term;  14import org.apache.lucene.search.IndexSearcher;  15import org.apache.lucene.search.PhraseQuery;  16import org.apache.lucene.search.Query;  17import org.apache.lucene.search.ScoreDoc;  18import org.apache.lucene.search.TermQuery;  19import org.apache.lucene.search.TopDocCollector;  20import org.apache.lucene.search.TopDocs;  21import org.pdfbox.pdmodel.PDDocument;  22import org.pdfbox.searchengine.lucene.LucenePDFDocument;  23import org.pdfbox.util.PDFTextStripper;  24  25public class Test {  26  27 public void getText(String file) throws Exception{  28       //是否排序  29       boolean sort = false;  30       //pdf文件名  31       String pdfFile = file;  32       //输入文本文件名称  33       String textFile = null;  34       //编码方式  35       String encoding = "UTF-8";  36       //开始提取页数  37       int startPage = 1;  38       //结束提取页数  39       int endPage = Integer.MAX_VALUE;  40       //文件输入流，输入文本文件  41       Writer output = null; 42       //内存中存储的PDF Document  43       PDDocument document = null;  44       45       try{  46          try{  47             //首先当作一个URL来加载文件，如果得到异常再从本地系统装载文件  48             URL url = new URL(pdfFile);  49             document = PDDocument.load(url);  50             String fileName = url.getFile();  51             52             if(fileName.length() > 4){  53                   //以原来pdf名称来命名新产生的txt文件  54                   File outputFile = new File(fileName.substring(0, fileName.length()-4) + ".txt");  55                   textFile = outputFile.getName();  56             }             57          }catch(Exception e){  58             //如果作为URL装载得到异常则从文件系统装载  59   
          document = PDDocument.load(pdfFile);  60             if(pdfFile.length() > 4){  61                   textFile = pdfFile.substring(0, pdfFile.length() - 4) + ".txt";  62             }  63          }  64          //文件输出流，写入文件到textFile  65          output = new OutputStreamWriter(new FileOutputStream(textFile),encoding);  66          //PDFTextStripper来提取文本  67          PDFTextStripper stripper = new PDFTextStripper();  68          //设置是否排序  69          stripper.setSortByPosition(sort);  70          //设置起始页  71          stripper.setStartPage(startPage);  72          //设置结束页  73          stripper.setEndPage(endPage);  74          //调用PDFTextStripper的writeText提取并输出文本  75          stripper.writeText(document, output);  76       }finally{  77          if(output != null){  78             output.close();                79          }  80          if(document != null){  81             document.close();  82          }  83       }       84 }  85    86 /** *//**  87    * test Lucene with pdfbox  88    * @throws IOException  89    */  90 public void LuceneTest() throws IOException{  91       92       String path = "D:\\index";  93       String pdfpath = "D:\\index\\Lucene.Net基本用法.pdf";  94       95       IndexWriter writer = new IndexWriter(path, new StandardAnalyzer(),true);  96       //writer.setMaxFieldLength(10240);  97       //LucenePDFDocument返回由PDF产生的Lucene Document  98       Document d = LucenePDFDocument.getDocument(new File(pdfpath));  99       //System.out.println(d); 100       //写入索引 101       writer.addDocument(d); 102       writer.close(); 103       104       //读取d:\index下的索引文件，建立IndexSearcher 105       IndexSearcher searcher = new IndexSearcher(path); 106       //对索引的contents Field进行关键字Query的查找 107       Term t = new Term("contents","优"); 108       Term m = new Term("contents","化"); 109       PhraseQuery q = new PhraseQuery(); 110       q.add(t); 111       q.add(m); 112       //Query q = new TermQuery(t); 113       TopDocCollector co = new 
TopDocCollector(10); 114       searcher.search(q,co); 115       116       Document document; 117       TopDocs docs = co.topDocs(); 118       ScoreDoc[] doc = docs.scoreDocs; 119       //System.out.println(doc.length); 120       121       for(int i=0;i

账号		自动登录	找回密码
密码			立即注册

c++ size_t 和 int 的区别

HERE 使用 AWS EF 和 JFrog Artifactory 打

C++ 指针大全：从基础到进阶，一篇快速上手

wirelessnetview好用的无线分析工具

Red Hat RHCE 8 (EX294) Cert Guide

亿图图示专家(EDraw Max) V7.9 中文破解版

zabbix3.4.1安装部署+微信推送信息+大屏显

[经验分享] apache pdfbox

浏览过的版块

扫码加入运维网微信交流群