001    package railo.runtime.search.lucene2.docs;
002    
003    import java.io.IOException;
004    import java.io.InputStream;
005    import java.io.StringWriter;
006    import java.util.Date;
007    
008    import org.apache.lucene.document.DateField;
009    import org.apache.lucene.document.Document;
010    import org.pdfbox.pdmodel.PDDocument;
011    import org.pdfbox.pdmodel.PDDocumentInformation;
012    import org.pdfbox.util.PDFTextStripper;
013    
014    import railo.commons.io.IOUtil;
015    import railo.commons.io.res.Resource;
016    import railo.commons.lang.StringUtil;
017    
018    /**
019     * This class is used to create a document for the lucene search engine.
020     * This should easily plug into the IndexHTML or IndexFiles that comes with
021     * the lucene project.  This class will populate the following fields.
022     * <table>
023     *      <tr>
024     *          <td>Lucene Field Name</td>
025     *          <td>Description</td>
026     *      </tr>
027     *      <tr>
028     *          <td>path</td>
029     *          <td>File system path if loaded from a file</td>
030     *      </tr>
031     *      <tr>
032     *          <td>url</td>
033     *          <td>URL to PDF document</td>
034     *      </tr>
035     *      <tr>
036     *          <td>contents</td>
037     *          <td>Entire contents of PDF document, indexed but not stored</td>
038     *      </tr>
039     *      <tr>
040     *          <td>summary</td>
041     *          <td>First 500 characters of content</td>
042     *      </tr>
043     *      <tr>
044     *          <td>modified</td>
045     *          <td>The modified date/time according to the url or path</td>
046     *      </tr>
047     *      <tr>
048     *          <td>uid</td>
049     *          <td>A unique identifier for the Lucene document.</td>
050     *      </tr>
051     *      <tr>
052     *          <td>CreationDate</td>
053     *          <td>From PDF meta-data if available</td>
054     *      </tr>
055     *      <tr>
056     *          <td>Creator</td>
057     *          <td>From PDF meta-data if available</td>
058     *      </tr>
059     *      <tr>
060     *          <td>Keywords</td>
061     *          <td>From PDF meta-data if available</td>
062     *      </tr>
063     *      <tr>
064     *          <td>ModificationDate</td>
065     *          <td>From PDF meta-data if available</td>
066     *      </tr>
067     *      <tr>
068     *          <td>Producer</td>
069     *          <td>From PDF meta-data if available</td>
070     *      </tr>
071     *      <tr>
072     *          <td>Subject</td>
073     *          <td>From PDF meta-data if available</td>
074     *      </tr>
075     *      <tr>
076     *          <td>Trapped</td>
077     *          <td>From PDF meta-data if available</td>
078     *      </tr>
079     * </table>
080     *
081     */
082    public final class PDFDocument
083    {
084        private static final char FILE_SEPARATOR = System.getProperty("file.separator").charAt(0);
085        private static final int SUMMERY_SIZE=200;
086    
087        /**
088         * private constructor because there are only static methods.
089         */
090        private PDFDocument()
091        {
092            //utility class should not be instantiated
093        }
094        
095        /**
096         * This will get a lucene document from a PDF file.
097         * @param is The stream to read the PDF from.
098         * @return The lucene document.
099         * @throws IOException If there is an error parsing or indexing the document.
100         */
101        public static Document getDocument(StringBuffer content, InputStream is ) {
102            Document document = new Document();
103            addContent(content, document, is);
104            return document;
105        }
106    
107        /**
108         * This will get a lucene document from a PDF file.
109         * @param res The file to get the document for.
110         * @return The lucene document.
111         * @throws IOException If there is an error parsing or indexing the document.
112         */
113        public static Document getDocument( Resource res ) {
114            Document document = new Document();
115            FieldUtil.setMimeType(document, "application/pdf");
116            //document.add(FieldUtil.UnIndexed("mime-type", "application/pdf"));
117            document.add( FieldUtil.UnIndexed("path", res.getPath() ) );
118            
119            String uid = res.getPath().replace(FILE_SEPARATOR, '\u0000') + "\u0000" +
120                   DateField.timeToString(res.lastModified() );
121            document.add(FieldUtil.Text("uid", uid, false));
122            
123            // Add the uid as a field, so that index can be incrementally maintained.
124            // This field is not stored with document, it is indexed, but it is not
125            // tokenized prior to indexing.
126            //document.add(new Field("uid", uid, Field.Store.NO,Field.Index.UN_TOKENIZED));
127            //document.add(new Field("uid", uid, false, true, false));
128    
129            InputStream is = null;
130            try
131            {
132                is = IOUtil.toBufferedInputStream(res.getInputStream());
133                addContent(null, document, is);
134            }
135            catch(IOException ioe) {
136                
137            }
138            finally {
139                IOUtil.closeEL(is);
140            }
141    
142    
143            // return the document
144    
145            return document;
146        }
147    
148        /**
149         * This will add the contents to the lucene document.
150         * @param content 
151         *
152         * @param document The document to add the contents to.
153         * @param is The stream to get the contents from.
154         * @param documentLocation The location of the document, used just for debug messages.
155         *
156         * @throws IOException If there is an error parsing the document.
157         */
158        private static void addContent( StringBuffer content, Document document, InputStream is) {
159            
160            PDDocument pdfDocument=null;
161            try {
162                pdfDocument = PDDocument.load( is );
163    
164                if( pdfDocument.isEncrypted() )
165                {
166                    //Just try using the default password and move on
167                    pdfDocument.decrypt( "" );
168                }
169                
170                //create a writer where to append the text content.
171                StringWriter writer = new StringWriter();
172                PDFTextStripper stripper = new PDFTextStripper();
173                stripper.writeText( pdfDocument, writer );
174    
175                // Note: the buffer to string operation is costless;
176                // the char array value of the writer buffer and the content string
177                // is shared as long as the buffer content is not modified, which will
178                // not occur here.
179                String contents = writer.getBuffer().toString();
180                if(content!=null)content.append(contents);
181                
182                FieldUtil.setRaw(document,contents);
183                FieldUtil.setContent(document, contents);
184                FieldUtil.setSummary(document, StringUtil.max(contents,SUMMERY_SIZE),false);
185                
186                
187                PDDocumentInformation info = pdfDocument.getDocumentInformation();
188                if( info.getAuthor() != null)       {
189                    FieldUtil.setAuthor(document, info.getAuthor());
190                }
191                if( info.getCreationDate() != null )
192                {
193                    Date date = info.getCreationDate().getTime();
194                    if( date.getTime() >= 0 )    {
195                        document.add(FieldUtil.Text("CreationDate", DateField.dateToString( date ) ) );
196                    }
197                }
198                if( info.getCreator() != null ){
199                    document.add( FieldUtil.Text( "Creator", info.getCreator() ) );
200                }
201                if( info.getKeywords() != null ){
202                    FieldUtil.setKeywords(document, info.getKeywords());
203                }
204                if( info.getModificationDate() != null)     {
205                    Date date = info.getModificationDate().getTime();
206                    if( date.getTime() >= 0 ){
207                        document.add(FieldUtil.Text("ModificationDate", DateField.dateToString( date ) ) );
208                    }
209                }
210                if( info.getProducer() != null ){
211                    document.add( FieldUtil.Text( "Producer", info.getProducer() ) );
212                }
213                if( info.getSubject() != null ){
214                    document.add( FieldUtil.Text( "Subject", info.getSubject() ) );
215                }
216                if( info.getTitle() != null ){
217                    FieldUtil.setTitle(document, info.getTitle());
218                }
219                if( info.getTrapped() != null ) {
220                    document.add( FieldUtil.Text( "Trapped", info.getTrapped() ) );
221                }
222            }
223            catch(Throwable t) {}
224            finally {
225                if( pdfDocument != null ) {
226                    try {
227                        pdfDocument.close();
228                    } 
229                    catch (IOException e) {e.printStackTrace();}
230                }
231            }
232        }
233    }