001/**
002 *
003 * Copyright (c) 2014, the Railo Company Ltd. All rights reserved.
004 *
005 * This library is free software; you can redistribute it and/or
006 * modify it under the terms of the GNU Lesser General Public
007 * License as published by the Free Software Foundation; either 
008 * version 2.1 of the License, or (at your option) any later version.
009 * 
010 * This library is distributed in the hope that it will be useful,
011 * but WITHOUT ANY WARRANTY; without even the implied warranty of
012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013 * Lesser General Public License for more details.
014 * 
015 * You should have received a copy of the GNU Lesser General Public 
016 * License along with this library.  If not, see <http://www.gnu.org/licenses/>.
017 * 
018 **/
019package lucee.runtime.search.lucene2.docs;
020
021import java.io.IOException;
022import java.io.InputStream;
023
024import lucee.commons.io.IOUtil;
025import lucee.commons.io.res.Resource;
026import lucee.commons.lang.StringUtil;
027import lucee.runtime.op.Caster;
028
029import org.apache.lucene.document.Document;
030import org.textmining.text.extraction.WordExtractor;
031
032/** A utility for making Lucene Documents from a File. */
033
034public final class WordDocument {
035    
036    private static final int SUMMERY_SIZE=20;
037    //private static final char FILE_SEPARATOR = System.getProperty("file.separator").charAt(0);
038    
039  /** Makes a document for a File.
040    <p>
041    The document has three fields:
042    <ul>
043    <li><code>path</code>--containing the pathname of the file, as a stored,
044    tokenized field;
045    <li><code>modified</code>--containing the last modified date of the file as
046    a keyword field as encoded by <a
047    href="lucene.document.DateField.html">DateField</a>; and
048    <li><code>contents</code>--containing the full contents of the file, as a
049    Reader field;
050 * @param res
051 * @return matching document
052 * @throws IOException
053    */
054    public static Document getDocument(Resource res) throws IOException {
055         
056        // make a new, empty document
057        Document doc = new Document();          
058        InputStream is =null;
059        try{
060                is=IOUtil.toBufferedInputStream(res.getInputStream());
061                addContent(null,doc,is);
062        }
063        finally{
064                IOUtil.closeEL(is);
065        }
066            return doc;
067        }
068    
069    public static Document getDocument(StringBuffer content, InputStream is) throws IOException {
070                Document doc = new Document();
071        addContent(content,doc,is);
072        return doc;
073        }
074  
075  
076
077        private static void addContent(StringBuffer content, Document doc, InputStream is) throws IOException {
078        FieldUtil.setMimeType(doc, "application/msword");
079                WordExtractor extractor = new WordExtractor();
080            String contents;
081                try {
082                        contents = extractor.extractText(is);
083                        if(content!=null)content.append(contents);
084                } catch (Exception e) {
085                        if(e instanceof IOException) throw (IOException)e;
086                        throw new IOException(e.getMessage());
087                }
088            doc.add(FieldUtil.Text("size", Caster.toString(contents.length())));
089            FieldUtil.setRaw(doc,contents);
090            FieldUtil.setContent(doc, contents);
091            //doc.add(FieldUtil.Text("contents", contents.toLowerCase()));
092            FieldUtil.setSummary(doc, StringUtil.max(contents,SUMMERY_SIZE),false);
093            //doc.add(FieldUtil.UnIndexed("summary",));
094        }
095
096
097
098private WordDocument() {}
099}
100