package railo.runtime.search.lucene2.docs;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.Date;

import org.apache.lucene.document.DateField;
import org.apache.lucene.document.Document;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.util.PDFTextStripper;

import railo.commons.io.IOUtil;
import railo.commons.io.res.Resource;
import railo.commons.lang.StringUtil;

/**
 * Builds Lucene {@link Document}s from PDF files for the search engine.
 * <p>
 * Parsing is best-effort: a PDF that cannot be opened or read yields a document that simply
 * lacks the content and meta-data fields; no exception is reported to the caller.
 * <p>
 * Fields this class adds under an explicit name:
 * <table>
 *   <tr>
 *     <td>Lucene Field Name</td>
 *     <td>Description</td>
 *   </tr>
 *   <tr>
 *     <td>path</td>
 *     <td>Path of the resource (only when built from a {@link Resource})</td>
 *   </tr>
 *   <tr>
 *     <td>uid</td>
 *     <td>Identifier derived from path and last-modified time (only when built from a
 *         {@link Resource})</td>
 *   </tr>
 *   <tr>
 *     <td>CreationDate</td>
 *     <td>From PDF meta-data if available</td>
 *   </tr>
 *   <tr>
 *     <td>Creator</td>
 *     <td>From PDF meta-data if available</td>
 *   </tr>
 *   <tr>
 *     <td>ModificationDate</td>
 *     <td>From PDF meta-data if available</td>
 *   </tr>
 *   <tr>
 *     <td>Producer</td>
 *     <td>From PDF meta-data if available</td>
 *   </tr>
 *   <tr>
 *     <td>Subject</td>
 *     <td>From PDF meta-data if available</td>
 *   </tr>
 *   <tr>
 *     <td>Trapped</td>
 *     <td>From PDF meta-data if available</td>
 *   </tr>
 * </table>
 * The mime type, raw text, content, summary (the text passed through
 * {@code StringUtil.max} with a limit of {@value #SUMMERY_SIZE}), author, keywords and title
 * are stored through the corresponding {@code FieldUtil} setters, which choose the actual
 * field names. No {@code url} or {@code modified} field is added by this class.
 */
public final class PDFDocument {

    /** Platform file separator; replaced by a NUL character when building the uid. */
    private static final char FILE_SEPARATOR = System.getProperty("file.separator").charAt(0);

    /** Limit handed to {@code StringUtil.max} when producing the summary field. */
    private static final int SUMMERY_SIZE = 200;

    /**
     * Private constructor because there are only static methods.
     */
    private PDFDocument() {
        // utility class should not be instantiated
    }

    /**
     * Creates a Lucene document from a PDF stream.
     * <p>
     * Parse failures are not reported; the returned document is then empty or partially filled.
     * This method does not close the stream itself.
     *
     * @param content optional buffer that receives the extracted text; may be {@code null}
     * @param is the stream to read the PDF from
     * @return the Lucene document, never {@code null}
     */
    public static Document getDocument(StringBuffer content, InputStream is) {
        Document document = new Document();
        addContent(content, document, is);
        return document;
    }

    /**
     * Creates a Lucene document from a PDF resource.
     * <p>
     * The mime type, {@code path} and {@code uid} fields are always set. Content and meta-data
     * are added only if the resource can be opened and parsed.
     *
     * @param res the resource to get the document for
     * @return the Lucene document, never {@code null}
     */
    public static Document getDocument(Resource res) {
        Document document = new Document();
        FieldUtil.setMimeType(document, "application/pdf");

        String path = res.getPath();
        document.add(FieldUtil.UnIndexed("path", path));

        // Add the uid as a field, so that the index can be incrementally maintained: the
        // last-modified time is part of it, so a changed file produces a different uid.
        String uid = path.replace(FILE_SEPARATOR, '\u0000') + "\u0000"
                + DateField.timeToString(res.lastModified());
        // NOTE(review): the meaning of the trailing 'false' flag is defined by FieldUtil.Text.
        document.add(FieldUtil.Text("uid", uid, false));

        InputStream is = null;
        try {
            is = IOUtil.toBufferedInputStream(res.getInputStream());
            addContent(null, document, is);
        }
        catch (IOException ignored) {
            // Best-effort: a resource that cannot be opened is still returned with the
            // mime type, path and uid fields set above, just without content.
        }
        finally {
            IOUtil.closeEL(is);
        }

        return document;
    }

    /**
     * Extracts text and meta-data from the PDF and adds them to the Lucene document.
     * <p>
     * Best-effort: any failure while loading, decrypting or reading the PDF stops the
     * extraction silently and leaves whatever fields were added up to that point. Only
     * {@link ThreadDeath} is propagated.
     *
     * @param content optional buffer the extracted text is appended to; may be {@code null}
     * @param document the document to add the contents to
     * @param is the stream to get the contents from
     */
    private static void addContent(StringBuffer content, Document document, InputStream is) {
        PDDocument pdfDocument = null;
        try {
            pdfDocument = PDDocument.load(is);

            if (pdfDocument.isEncrypted()) {
                // Just try using the default (empty) password and move on.
                pdfDocument.decrypt("");
            }

            // Collect the text content of the whole document.
            StringWriter writer = new StringWriter();
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.writeText(pdfDocument, writer);

            String contents = writer.toString();
            if (content != null) {
                content.append(contents);
            }

            FieldUtil.setRaw(document, contents);
            FieldUtil.setContent(document, contents);
            FieldUtil.setSummary(document, StringUtil.max(contents, SUMMERY_SIZE), false);

            // Meta-data; the order in which fields are added is kept stable on purpose.
            PDDocumentInformation info = pdfDocument.getDocumentInformation();
            if (info.getAuthor() != null) {
                FieldUtil.setAuthor(document, info.getAuthor());
            }
            if (info.getCreationDate() != null) {
                addDate(document, "CreationDate", info.getCreationDate().getTime());
            }
            addText(document, "Creator", info.getCreator());
            if (info.getKeywords() != null) {
                FieldUtil.setKeywords(document, info.getKeywords());
            }
            if (info.getModificationDate() != null) {
                addDate(document, "ModificationDate", info.getModificationDate().getTime());
            }
            addText(document, "Producer", info.getProducer());
            addText(document, "Subject", info.getSubject());
            if (info.getTitle() != null) {
                FieldUtil.setTitle(document, info.getTitle());
            }
            addText(document, "Trapped", info.getTrapped());
        }
        catch (Throwable t) {
            // A thread that is being stopped must actually die, so never swallow ThreadDeath.
            if (t instanceof ThreadDeath) {
                throw (ThreadDeath) t;
            }
            // Everything else is deliberately ignored: a broken or unsupported PDF must not
            // abort indexing, the document just ends up with fewer fields.
        }
        finally {
            if (pdfDocument != null) {
                try {
                    pdfDocument.close();
                }
                catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /** Adds a text field under the given name unless the value is {@code null}. */
    private static void addText(Document document, String name, String value) {
        if (value != null) {
            document.add(FieldUtil.Text(name, value));
        }
    }

    /**
     * Adds a date field under the given name; dates before the epoch are skipped.
     * NOTE(review): presumably DateField cannot encode negative times - confirm.
     */
    private static void addDate(Document document, String name, Date date) {
        if (date.getTime() >= 0) {
            document.add(FieldUtil.Text(name, DateField.dateToString(date)));
        }
    }
}