001/** 002 * 003 * Copyright (c) 2014, the Railo Company Ltd. All rights reserved. 004 * 005 * This library is free software; you can redistribute it and/or 006 * modify it under the terms of the GNU Lesser General Public 007 * License as published by the Free Software Foundation; either 008 * version 2.1 of the License, or (at your option) any later version. 009 * 010 * This library is distributed in the hope that it will be useful, 011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 013 * Lesser General Public License for more details. 014 * 015 * You should have received a copy of the GNU Lesser General Public 016 * License along with this library. If not, see <http://www.gnu.org/licenses/>. 017 * 018 **/ 019package lucee.runtime.search.lucene2.docs; 020 021import java.io.IOException; 022import java.io.InputStream; 023import java.io.StringWriter; 024import java.util.Date; 025 026import lucee.commons.io.IOUtil; 027import lucee.commons.io.res.Resource; 028import lucee.commons.lang.ExceptionUtil; 029import lucee.commons.lang.StringUtil; 030 031import org.apache.lucene.document.DateField; 032import org.apache.lucene.document.Document; 033import org.pdfbox.pdmodel.PDDocument; 034import org.pdfbox.pdmodel.PDDocumentInformation; 035import org.pdfbox.util.PDFTextStripper; 036 037/** 038 * This class is used to create a document for the lucene search engine. 039 * This should easily plug into the IndexHTML or IndexFiles that comes with 040 * the lucene project. This class will populate the following fields. 041 * <table> 042 * <tr> 043 * <td>Lucene Field Name</td> 044 * <td>Description</td> 045 * </tr> 046 * <tr> 047 * <td>path</td> 048 * <td>File system path if loaded from a file</td> 049 * </tr> 050 * <tr> 051 * <td>url</td> 052 * <td>URL to PDF document</td> 053 * </tr> 054 * <tr> 055 * <td>contents</td> 056 * <td>Entire contents of PDF document, indexed but not stored</td> 057 * </tr> 058 * <tr> 059 * <td>summary</td> 060 * <td>First 500 characters of content</td> 061 * </tr> 062 * <tr> 063 * <td>modified</td> 064 * <td>The modified date/time according to the url or path</td> 065 * </tr> 066 * <tr> 067 * <td>uid</td> 068 * <td>A unique identifier for the Lucene document.</td> 069 * </tr> 070 * <tr> 071 * <td>CreationDate</td> 072 * <td>From PDF meta-data if available</td> 073 * </tr> 074 * <tr> 075 * <td>Creator</td> 076 * <td>From PDF meta-data if available</td> 077 * </tr> 078 * <tr> 079 * <td>Keywords</td> 080 * <td>From PDF meta-data if available</td> 081 * </tr> 082 * <tr> 083 * <td>ModificationDate</td> 084 * <td>From PDF meta-data if available</td> 085 * </tr> 086 * <tr> 087 * <td>Producer</td> 088 * <td>From PDF meta-data if available</td> 089 * </tr> 090 * <tr> 091 * <td>Subject</td> 092 * <td>From PDF meta-data if available</td> 093 * </tr> 094 * <tr> 095 * <td>Trapped</td> 096 * <td>From PDF meta-data if available</td> 097 * </tr> 098 * </table> 099 * 100 */ 101public final class PDFDocument 102{ 103 private static final char FILE_SEPARATOR = System.getProperty("file.separator").charAt(0); 104 private static final int SUMMERY_SIZE=200; 105 106 /** 107 * private constructor because there are only static methods. 108 */ 109 private PDFDocument() 110 { 111 //utility class should not be instantiated 112 } 113 114 /** 115 * This will get a lucene document from a PDF file. 116 * @param is The stream to read the PDF from. 117 * @return The lucene document. 118 * @throws IOException If there is an error parsing or indexing the document. 119 */ 120 public static Document getDocument(StringBuffer content, InputStream is ) { 121 Document document = new Document(); 122 addContent(content, document, is); 123 return document; 124 } 125 126 /** 127 * This will get a lucene document from a PDF file. 128 * @param res The file to get the document for. 129 * @return The lucene document. 130 * @throws IOException If there is an error parsing or indexing the document. 131 */ 132 public static Document getDocument( Resource res ) { 133 Document document = new Document(); 134 FieldUtil.setMimeType(document, "application/pdf"); 135 //document.add(FieldUtil.UnIndexed("mime-type", "application/pdf")); 136 document.add( FieldUtil.UnIndexed("path", res.getPath() ) ); 137 138 String uid = res.getPath().replace(FILE_SEPARATOR, '\u0000') + "\u0000" + 139 DateField.timeToString(res.lastModified() ); 140 document.add(FieldUtil.Text("uid", uid, false)); 141 142 // Add the uid as a field, so that index can be incrementally maintained. 143 // This field is not stored with document, it is indexed, but it is not 144 // tokenized prior to indexing. 145 //document.add(new Field("uid", uid, Field.Store.NO,Field.Index.UN_TOKENIZED)); 146 //document.add(new Field("uid", uid, false, true, false)); 147 148 InputStream is = null; 149 try 150 { 151 is = IOUtil.toBufferedInputStream(res.getInputStream()); 152 addContent(null, document, is); 153 } 154 catch(IOException ioe) { 155 156 } 157 finally { 158 IOUtil.closeEL(is); 159 } 160 161 162 // return the document 163 164 return document; 165 } 166 167 /** 168 * This will add the contents to the lucene document. 169 * @param content 170 * 171 * @param document The document to add the contents to. 172 * @param is The stream to get the contents from. 173 * @param documentLocation The location of the document, used just for debug messages. 174 * 175 * @throws IOException If there is an error parsing the document. 176 */ 177 private static void addContent( StringBuffer content, Document document, InputStream is) { 178 179 PDDocument pdfDocument=null; 180 try { 181 pdfDocument = PDDocument.load( is ); 182 183 if( pdfDocument.isEncrypted() ) 184 { 185 //Just try using the default password and move on 186 pdfDocument.decrypt( "" ); 187 } 188 189 //create a writer where to append the text content. 190 StringWriter writer = new StringWriter(); 191 PDFTextStripper stripper = new PDFTextStripper(); 192 stripper.writeText( pdfDocument, writer ); 193 194 // Note: the buffer to string operation is costless; 195 // the char array value of the writer buffer and the content string 196 // is shared as long as the buffer content is not modified, which will 197 // not occur here. 198 String contents = writer.getBuffer().toString(); 199 if(content!=null)content.append(contents); 200 201 FieldUtil.setRaw(document,contents); 202 FieldUtil.setContent(document, contents); 203 FieldUtil.setSummary(document, StringUtil.max(contents,SUMMERY_SIZE),false); 204 205 206 PDDocumentInformation info = pdfDocument.getDocumentInformation(); 207 if( info.getAuthor() != null) { 208 FieldUtil.setAuthor(document, info.getAuthor()); 209 } 210 if( info.getCreationDate() != null ) 211 { 212 Date date = info.getCreationDate().getTime(); 213 if( date.getTime() >= 0 ) { 214 document.add(FieldUtil.Text("CreationDate", DateField.dateToString( date ) ) ); 215 } 216 } 217 if( info.getCreator() != null ){ 218 document.add( FieldUtil.Text( "Creator", info.getCreator() ) ); 219 } 220 if( info.getKeywords() != null ){ 221 FieldUtil.setKeywords(document, info.getKeywords()); 222 } 223 if( info.getModificationDate() != null) { 224 Date date = info.getModificationDate().getTime(); 225 if( date.getTime() >= 0 ){ 226 document.add(FieldUtil.Text("ModificationDate", DateField.dateToString( date ) ) ); 227 } 228 } 229 if( info.getProducer() != null ){ 230 document.add( FieldUtil.Text( "Producer", info.getProducer() ) ); 231 } 232 if( info.getSubject() != null ){ 233 document.add( FieldUtil.Text( "Subject", info.getSubject() ) ); 234 } 235 if( info.getTitle() != null ){ 236 FieldUtil.setTitle(document, info.getTitle()); 237 } 238 if( info.getTrapped() != null ) { 239 document.add( FieldUtil.Text( "Trapped", info.getTrapped() ) ); 240 } 241 } 242 catch(Throwable t) { 243 ExceptionUtil.rethrowIfNecessary(t); 244 } 245 finally { 246 if( pdfDocument != null ) { 247 try { 248 pdfDocument.close(); 249 } 250 catch (IOException e) {e.printStackTrace();} 251 } 252 } 253 } 254}