001    package railo.runtime.search.lucene2.docs;
002    
003    import java.io.IOException;
004    import java.io.InputStream;
005    
006    import org.apache.lucene.document.Document;
007    import org.textmining.text.extraction.WordExtractor;
008    
009    import railo.commons.io.IOUtil;
010    import railo.commons.io.res.Resource;
011    import railo.commons.lang.StringUtil;
012    import railo.runtime.op.Caster;
013    
014    /** A utility for making Lucene Documents from a File. */
015    
016    public final class WordDocument {
017        
018        private static final int SUMMERY_SIZE=20;
019        //private static final char FILE_SEPARATOR = System.getProperty("file.separator").charAt(0);
020        
021      /** Makes a document for a File.
022        <p>
023        The document has three fields:
024        <ul>
025        <li><code>path</code>--containing the pathname of the file, as a stored,
026        tokenized field;
027        <li><code>modified</code>--containing the last modified date of the file as
028        a keyword field as encoded by <a
029        href="lucene.document.DateField.html">DateField</a>; and
030        <li><code>contents</code>--containing the full contents of the file, as a
031        Reader field;
032     * @param res
033     * @return matching document
034     * @throws IOException
035        */
036        public static Document getDocument(Resource res) throws IOException {
037             
038            // make a new, empty document
039            Document doc = new Document();          
040            InputStream is =null;
041            try{
042                    is=IOUtil.toBufferedInputStream(res.getInputStream());
043                    addContent(null,doc,is);
044            }
045            finally{
046                    IOUtil.closeEL(is);
047            }
048                return doc;
049            }
050        
051        public static Document getDocument(StringBuffer content, InputStream is) throws IOException {
052                    Document doc = new Document();
053            addContent(content,doc,is);
054            return doc;
055            }
056      
057      
058    
059            private static void addContent(StringBuffer content, Document doc, InputStream is) throws IOException {
060            FieldUtil.setMimeType(doc, "application/msword");
061                    WordExtractor extractor = new WordExtractor();
062                String contents;
063                    try {
064                            contents = extractor.extractText(is);
065                            if(content!=null)content.append(contents);
066                    } catch (Exception e) {
067                            if(e instanceof IOException) throw (IOException)e;
068                            throw new IOException(e.getMessage());
069                    }
070                doc.add(FieldUtil.Text("size", Caster.toString(contents.length())));
071                FieldUtil.setRaw(doc,contents);
072                FieldUtil.setContent(doc, contents);
073                //doc.add(FieldUtil.Text("contents", contents.toLowerCase()));
074                FieldUtil.setSummary(doc, StringUtil.max(contents,SUMMERY_SIZE),false);
075                //doc.add(FieldUtil.UnIndexed("summary",));
076            }
077    
078    
079    
080    private WordDocument() {}
081    }
082