/**
 *
 * Copyright (c) 2014, the Railo Company Ltd. All rights reserved.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package lucee.runtime.search.lucene2.docs;

import java.io.IOException;
import java.io.InputStream;

import lucee.commons.io.IOUtil;
import lucee.commons.io.res.Resource;
import lucee.commons.lang.StringUtil;
import lucee.runtime.op.Caster;

import org.apache.lucene.document.Document;
import org.textmining.text.extraction.WordExtractor;

/** A utility for making Lucene Documents from Word content. */
public final class WordDocument {

	/** Length limit handed to {@code StringUtil.max} when building the summary. */
	private static final int SUMMARY_SIZE = 20;

	/**
	 * Makes a document for a Word resource.
	 * <p>
	 * The resource's input stream is opened, buffered and handed to the text
	 * extractor; the stream is closed again before this method returns, whether
	 * or not extraction succeeded.
	 * <p>
	 * The resulting document is populated with:
	 * <ul>
	 * <li>the mime type <code>application/msword</code>;</li>
	 * <li>a <code>size</code> text field holding the character count of the
	 * extracted text;</li>
	 * <li>the raw text, the content and a summary, each set through
	 * <code>FieldUtil</code>.</li>
	 * </ul>
	 *
	 * @param res resource to read the Word data from
	 * @return matching document
	 * @throws IOException if the resource cannot be read or text extraction fails
	 */
	public static Document getDocument(Resource res) throws IOException {
		// make a new, empty document
		Document doc = new Document();
		InputStream is = null;
		try {
			is = IOUtil.toBufferedInputStream(res.getInputStream());
			// no caller-side buffer wanted here, hence the null
			addContent(null, doc, is);
		}
		finally {
			IOUtil.closeEL(is);
		}
		return doc;
	}

	/**
	 * Makes a document from an already opened stream of Word data.
	 * <p>
	 * The stream is not closed by this method.
	 *
	 * @param content optional buffer that receives the extracted text
	 *                (appended); may be <code>null</code>
	 * @param is stream to read the Word data from
	 * @return matching document
	 * @throws IOException if text extraction fails
	 */
	public static Document getDocument(StringBuffer content, InputStream is) throws IOException {
		Document doc = new Document();
		addContent(content, doc, is);
		return doc;
	}

	/**
	 * Extracts the text from the given stream and stores it, together with the
	 * mime type, size and summary, in the given document.
	 *
	 * @param content optional buffer that receives the extracted text
	 *                (appended); may be <code>null</code>
	 * @param doc document to populate
	 * @param is stream to read the Word data from
	 * @throws IOException if extraction fails; failures that are not already an
	 *                     <code>IOException</code> are wrapped, keeping the
	 *                     original exception as the cause
	 */
	private static void addContent(StringBuffer content, Document doc, InputStream is) throws IOException {
		FieldUtil.setMimeType(doc, "application/msword");
		WordExtractor extractor = new WordExtractor();
		String contents;
		try {
			contents = extractor.extractText(is);
		}
		catch (Exception e) {
			if (e instanceof IOException) throw (IOException) e;
			// keep the original exception as cause so its stack trace is not lost
			throw new IOException(e.getMessage(), e);
		}
		if (content != null) content.append(contents);

		doc.add(FieldUtil.Text("size", Caster.toString(contents.length())));
		FieldUtil.setRaw(doc, contents);
		FieldUtil.setContent(doc, contents);
		FieldUtil.setSummary(doc, StringUtil.max(contents, SUMMARY_SIZE), false);
	}

	/** Static utility class, not meant to be instantiated. */
	private WordDocument() {}
}