001 package railo.runtime.search.lucene2.docs; 002 003 import java.io.IOException; 004 import java.io.InputStream; 005 006 import org.apache.lucene.document.Document; 007 import org.textmining.text.extraction.WordExtractor; 008 009 import railo.commons.io.IOUtil; 010 import railo.commons.io.res.Resource; 011 import railo.commons.lang.StringUtil; 012 import railo.runtime.op.Caster; 013 014 /** A utility for making Lucene Documents from a File. */ 015 016 public final class WordDocument { 017 018 private static final int SUMMERY_SIZE=20; 019 //private static final char FILE_SEPARATOR = System.getProperty("file.separator").charAt(0); 020 021 /** Makes a document for a File. 022 <p> 023 The document has three fields: 024 <ul> 025 <li><code>path</code>--containing the pathname of the file, as a stored, 026 tokenized field; 027 <li><code>modified</code>--containing the last modified date of the file as 028 a keyword field as encoded by <a 029 href="lucene.document.DateField.html">DateField</a>; and 030 <li><code>contents</code>--containing the full contents of the file, as a 031 Reader field; 032 * @param res 033 * @return matching document 034 * @throws IOException 035 */ 036 public static Document getDocument(Resource res) throws IOException { 037 038 // make a new, empty document 039 Document doc = new Document(); 040 InputStream is =null; 041 try{ 042 is=IOUtil.toBufferedInputStream(res.getInputStream()); 043 addContent(null,doc,is); 044 } 045 finally{ 046 IOUtil.closeEL(is); 047 } 048 return doc; 049 } 050 051 public static Document getDocument(StringBuffer content, InputStream is) throws IOException { 052 Document doc = new Document(); 053 addContent(content,doc,is); 054 return doc; 055 } 056 057 058 059 private static void addContent(StringBuffer content, Document doc, InputStream is) throws IOException { 060 FieldUtil.setMimeType(doc, "application/msword"); 061 WordExtractor extractor = new WordExtractor(); 062 String contents; 063 try { 064 contents = extractor.extractText(is); 065 if(content!=null)content.append(contents); 066 } catch (Exception e) { 067 if(e instanceof IOException) throw (IOException)e; 068 throw new IOException(e.getMessage()); 069 } 070 doc.add(FieldUtil.Text("size", Caster.toString(contents.length()))); 071 FieldUtil.setRaw(doc,contents); 072 FieldUtil.setContent(doc, contents); 073 //doc.add(FieldUtil.Text("contents", contents.toLowerCase())); 074 FieldUtil.setSummary(doc, StringUtil.max(contents,SUMMERY_SIZE),false); 075 //doc.add(FieldUtil.UnIndexed("summary",)); 076 } 077 078 079 080 private WordDocument() {} 081 } 082