In this tutorial, we will show how to extract text from a PDF file in Java. We will use the Tika PDFParser to implement it.
1. Import packages
import java.io.IOException; import java.io.InputStream; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.pdf.PDFParser; import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.SAXException;
2. Create a content handler, a PDF parser, a metadata object and a parse context object
// The parser that reads the PDF.
PDFParser parser = new PDFParser();

// Receives the parsed output; its toString() is printed later as the document content.
BodyContentHandler handler = new BodyContentHandler();

// Filled in by parse(); its names and values are printed later as the document metadata.
Metadata metadata = new Metadata();

// Passed to parse() as the parsing context.
ParseContext pcontext = new ParseContext();
3. Start to extract text and metadata from a pdf file
// `stream` is the InputStream of the PDF file; the complete class at the end of
// this tutorial obtains it with getResourceAsStream("java.pdf").
parser.parse(stream, handler, metadata, pcontext);

// Print the content collected by the handler.
String content = handler.toString();
System.out.println("Document Content:" + content);

// Print every metadata entry as "name: value".
System.out.println("Document Metadata:");
for (String name : metadata.names()) {
    String value = metadata.get(name);
    System.out.println(name + ": " + value);
}
Finally, we put all of the code above into a Java class.
import java.io.IOException; import java.io.InputStream; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.pdf.PDFParser; import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.SAXException; public class PdfParserExample { public static void main(String[] args) throws IOException, SAXException, TikaException { BodyContentHandler handler = new BodyContentHandler(); PDFParser parser = new PDFParser(); Metadata metadata = new Metadata(); ParseContext pcontext = new ParseContext(); try ( InputStream stream = AutoDetectParseExample.class.getResourceAsStream("java.pdf"){ parser.parse(stream, handler, metadata, pcontext); System.out.println("Document Content:" + handler.toString()); System.out.println("Document Metadata:"); String[] metadatas = metadata.names(); for(String data : metadatas) { System.out.println(data + ": " + metadata.get(data)); } }catch(Exception e) {System.out.println("Exception message: "+ e.getMessage());} } }