001/**
002 *
003 * Copyright (c) 2014, the Railo Company Ltd. All rights reserved.
004 *
005 * This library is free software; you can redistribute it and/or
006 * modify it under the terms of the GNU Lesser General Public
007 * License as published by the Free Software Foundation; either 
008 * version 2.1 of the License, or (at your option) any later version.
009 * 
010 * This library is distributed in the hope that it will be useful,
011 * but WITHOUT ANY WARRANTY; without even the implied warranty of
012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013 * Lesser General Public License for more details.
014 * 
015 * You should have received a copy of the GNU Lesser General Public 
016 * License along with this library.  If not, see <http://www.gnu.org/licenses/>.
017 * 
018 **/
019package lucee.runtime.search.lucene2.docs;
020
021import java.io.IOException;
022import java.io.InputStream;
023import java.io.StringWriter;
024import java.util.Date;
025
026import lucee.commons.io.IOUtil;
027import lucee.commons.io.res.Resource;
028import lucee.commons.lang.ExceptionUtil;
029import lucee.commons.lang.StringUtil;
030
031import org.apache.lucene.document.DateField;
032import org.apache.lucene.document.Document;
033import org.pdfbox.pdmodel.PDDocument;
034import org.pdfbox.pdmodel.PDDocumentInformation;
035import org.pdfbox.util.PDFTextStripper;
036
037/**
038 * This class is used to create a document for the lucene search engine.
039 * This should easily plug into the IndexHTML or IndexFiles that comes with
040 * the lucene project.  This class will populate the following fields.
041 * <table>
042 *      <tr>
043 *          <td>Lucene Field Name</td>
044 *          <td>Description</td>
045 *      </tr>
046 *      <tr>
047 *          <td>path</td>
048 *          <td>File system path if loaded from a file</td>
049 *      </tr>
050 *      <tr>
051 *          <td>url</td>
052 *          <td>URL to PDF document</td>
053 *      </tr>
054 *      <tr>
055 *          <td>contents</td>
056 *          <td>Entire contents of PDF document, indexed but not stored</td>
057 *      </tr>
058 *      <tr>
059 *          <td>summary</td>
 *          <td>First 200 characters of content</td>
061 *      </tr>
062 *      <tr>
063 *          <td>modified</td>
064 *          <td>The modified date/time according to the url or path</td>
065 *      </tr>
066 *      <tr>
067 *          <td>uid</td>
068 *          <td>A unique identifier for the Lucene document.</td>
069 *      </tr>
070 *      <tr>
071 *          <td>CreationDate</td>
072 *          <td>From PDF meta-data if available</td>
073 *      </tr>
074 *      <tr>
075 *          <td>Creator</td>
076 *          <td>From PDF meta-data if available</td>
077 *      </tr>
078 *      <tr>
079 *          <td>Keywords</td>
080 *          <td>From PDF meta-data if available</td>
081 *      </tr>
082 *      <tr>
083 *          <td>ModificationDate</td>
084 *          <td>From PDF meta-data if available</td>
085 *      </tr>
086 *      <tr>
087 *          <td>Producer</td>
088 *          <td>From PDF meta-data if available</td>
089 *      </tr>
090 *      <tr>
091 *          <td>Subject</td>
092 *          <td>From PDF meta-data if available</td>
093 *      </tr>
094 *      <tr>
095 *          <td>Trapped</td>
096 *          <td>From PDF meta-data if available</td>
097 *      </tr>
098 * </table>
099 *
100 */
101public final class PDFDocument
102{
103    private static final char FILE_SEPARATOR = System.getProperty("file.separator").charAt(0);
104    private static final int SUMMERY_SIZE=200;
105
106    /**
107     * private constructor because there are only static methods.
108     */
109    private PDFDocument()
110    {
111        //utility class should not be instantiated
112    }
113    
114    /**
115     * This will get a lucene document from a PDF file.
116     * @param is The stream to read the PDF from.
117     * @return The lucene document.
118     * @throws IOException If there is an error parsing or indexing the document.
119     */
120    public static Document getDocument(StringBuffer content, InputStream is ) {
121        Document document = new Document();
122        addContent(content, document, is);
123        return document;
124    }
125
126    /**
127     * This will get a lucene document from a PDF file.
128     * @param res The file to get the document for.
129     * @return The lucene document.
130     * @throws IOException If there is an error parsing or indexing the document.
131     */
132    public static Document getDocument( Resource res ) {
133        Document document = new Document();
134        FieldUtil.setMimeType(document, "application/pdf");
135        //document.add(FieldUtil.UnIndexed("mime-type", "application/pdf"));
136        document.add( FieldUtil.UnIndexed("path", res.getPath() ) );
137        
138        String uid = res.getPath().replace(FILE_SEPARATOR, '\u0000') + "\u0000" +
139               DateField.timeToString(res.lastModified() );
140        document.add(FieldUtil.Text("uid", uid, false));
141        
142        // Add the uid as a field, so that index can be incrementally maintained.
143        // This field is not stored with document, it is indexed, but it is not
144        // tokenized prior to indexing.
145        //document.add(new Field("uid", uid, Field.Store.NO,Field.Index.UN_TOKENIZED));
146        //document.add(new Field("uid", uid, false, true, false));
147
148        InputStream is = null;
149        try
150        {
151            is = IOUtil.toBufferedInputStream(res.getInputStream());
152            addContent(null, document, is);
153        }
154        catch(IOException ioe) {
155            
156        }
157        finally {
158            IOUtil.closeEL(is);
159        }
160
161
162        // return the document
163
164        return document;
165    }
166
167    /**
168     * This will add the contents to the lucene document.
169     * @param content 
170     *
171     * @param document The document to add the contents to.
172     * @param is The stream to get the contents from.
173     * @param documentLocation The location of the document, used just for debug messages.
174     *
175     * @throws IOException If there is an error parsing the document.
176     */
177    private static void addContent( StringBuffer content, Document document, InputStream is) {
178        
179        PDDocument pdfDocument=null;
180        try {
181            pdfDocument = PDDocument.load( is );
182
183            if( pdfDocument.isEncrypted() )
184            {
185                //Just try using the default password and move on
186                pdfDocument.decrypt( "" );
187            }
188            
189            //create a writer where to append the text content.
190            StringWriter writer = new StringWriter();
191            PDFTextStripper stripper = new PDFTextStripper();
192            stripper.writeText( pdfDocument, writer );
193
194            // Note: the buffer to string operation is costless;
195            // the char array value of the writer buffer and the content string
196            // is shared as long as the buffer content is not modified, which will
197            // not occur here.
198            String contents = writer.getBuffer().toString();
199            if(content!=null)content.append(contents);
200            
201            FieldUtil.setRaw(document,contents);
202            FieldUtil.setContent(document, contents);
203            FieldUtil.setSummary(document, StringUtil.max(contents,SUMMERY_SIZE),false);
204            
205            
206            PDDocumentInformation info = pdfDocument.getDocumentInformation();
207            if( info.getAuthor() != null)       {
208                FieldUtil.setAuthor(document, info.getAuthor());
209            }
210            if( info.getCreationDate() != null )
211            {
212                Date date = info.getCreationDate().getTime();
213                if( date.getTime() >= 0 )    {
214                    document.add(FieldUtil.Text("CreationDate", DateField.dateToString( date ) ) );
215                }
216            }
217            if( info.getCreator() != null ){
218                document.add( FieldUtil.Text( "Creator", info.getCreator() ) );
219            }
220            if( info.getKeywords() != null ){
221                FieldUtil.setKeywords(document, info.getKeywords());
222            }
223            if( info.getModificationDate() != null)     {
224                Date date = info.getModificationDate().getTime();
225                if( date.getTime() >= 0 ){
226                    document.add(FieldUtil.Text("ModificationDate", DateField.dateToString( date ) ) );
227                }
228            }
229            if( info.getProducer() != null ){
230                document.add( FieldUtil.Text( "Producer", info.getProducer() ) );
231            }
232            if( info.getSubject() != null ){
233                document.add( FieldUtil.Text( "Subject", info.getSubject() ) );
234            }
235            if( info.getTitle() != null ){
236                FieldUtil.setTitle(document, info.getTitle());
237            }
238            if( info.getTrapped() != null ) {
239                document.add( FieldUtil.Text( "Trapped", info.getTrapped() ) );
240            }
241        }
242        catch(Throwable t) {
243                        ExceptionUtil.rethrowIfNecessary(t);
244                }
245        finally {
246            if( pdfDocument != null ) {
247                try {
248                    pdfDocument.close();
249                } 
250                catch (IOException e) {e.printStackTrace();}
251            }
252        }
253    }
254}