/**
 *
 * Copyright (c) 2014, the Railo Company Ltd. All rights reserved.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package lucee.runtime.search.lucene2;


import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.net.URL;

import lucee.commons.io.IOUtil;
import lucee.commons.io.res.ContentType;
import lucee.commons.io.res.ContentTypeImpl;
import lucee.commons.io.res.Resource;
import lucee.commons.io.res.util.ResourceUtil;
import lucee.commons.net.http.HTTPResponse;
import lucee.runtime.op.Caster;
import lucee.runtime.search.lucene2.docs.FieldUtil;
import lucee.runtime.search.lucene2.docs.FileDocument;
import lucee.runtime.search.lucene2.docs.HTMLDocument;
import lucee.runtime.search.lucene2.docs.PDFDocument;
import lucee.runtime.search.lucene2.docs.WordDocument;

import org.apache.lucene.document.DateField;
import org.apache.lucene.document.Document;

/**
 * Creates a Lucene {@link Document} matching a given file or HTTP response.
 */
public final class DocumentUtil {

	/**
	 * Translates an HTTP response into a Document, choosing the document builder
	 * from the response's MIME type (HTML, PDF, MS Word or plain text).
	 *
	 * @param content buffer handed on to the selected document builder
	 * @param root not used by this method
	 * @param url address of the response; stored in the "url", "key" and "path" fields
	 * @param method the HTTP response to translate
	 * @return the Document, or null when the status code is not 200, when the
	 *         declared content length exceeds the free memory even after a GC run,
	 *         or when the MIME type is missing or not one of the supported types
	 * @throws IOException when reading the response body fails
	 */
	public static Document toDocument(StringBuffer content,String root,URL url, HTTPResponse method) throws IOException {
		if(method.getStatusCode()!=200) return null;

		// get type and charset
		Document doc=null;
		ContentType ct = method.getContentType();
		long len=method.getContentLength();
		// NOTE(review): when a content type is present but carries no charset, null is
		// passed on to IOUtil.getReader; confirm that helper tolerates a null charset.
		String charset=ct==null?"iso-8859-1":ct.getCharset();

		// refuse bodies that do not fit into the currently free memory
		Runtime rt = Runtime.getRuntime();
		if(len>rt.freeMemory()){
			rt.gc();
			if(len>rt.freeMemory()) return null;
		}

		String mimeType = ct==null?null:ct.getMimeType();

		if(mimeType==null) {
			// unknown type: no document is produced
		}
		// HTML
		else if(mimeType.indexOf("text/html")!=-1) {
			Reader r=null;
			try{
				r = IOUtil.getReader(method.getContentAsStream(), charset);
				doc= HTMLDocument.getDocument(content,r);
			}
			finally{
				IOUtil.closeEL(r);
			}
		}
		// PDF
		else if(mimeType.indexOf("application/pdf")!=-1) {
			InputStream is=null;
			try{
				is=IOUtil.toBufferedInputStream(method.getContentAsStream());
				doc= PDFDocument.getDocument(content,is);
			}
			finally {
				IOUtil.closeEL(is);
			}
		}
		// DOC
		else if(mimeType.equals("application/msword")) {
			InputStream is=null;
			try{
				is=IOUtil.toBufferedInputStream(method.getContentAsStream());
				doc= WordDocument.getDocument(content,is);
			}
			finally {
				IOUtil.closeEL(is);
			}
		}
		// Plain
		else if(mimeType.indexOf("text/plain")!=-1) {
			Reader r=null;
			try{
				r=IOUtil.toBufferedReader(IOUtil.getReader(method.getContentAsStream(), charset));
				doc= FileDocument.getDocument(content,r);
			}
			finally {
				IOUtil.closeEL(r);
			}
		}

		if(doc!=null){
			// unlike the file based variant, no "size" and "modified" fields are added here
			String strPath=url.toExternalForm();

			doc.add(FieldUtil.UnIndexed("url", strPath));
			doc.add(FieldUtil.UnIndexed("key", strPath));
			doc.add(FieldUtil.UnIndexed("path", strPath));
		}

		return doc;
	}

	/**
	 * Translates the file to a Document object. The document builder is chosen by
	 * file extension; when the file has no extension, by the content type that
	 * ResourceUtil reports for it. Anything unrecognized is handled as a plain file.
	 *
	 * @param file the resource to translate
	 * @param url not used by this method
	 * @param charset charset for HTML and plain documents; replaced by the charset of
	 *        the detected content type when the file has no extension and one is reported
	 * @return the Document, with "url", "key", "path", "size" and "modified" fields added
	 * @throws IOException passed on from the document builders
	 */
	public static Document toDocument(Resource file,String url,String charset) throws IOException {
		String ext = ResourceUtil.getExtension(file,null);

		Document doc=null;
		if(ext!=null) {
			ext=ext.toLowerCase();
			// HTML
			if(ext.equals("htm") || ext.equals("html") || ext.equals("cfm") || ext.equals("cfml") || ext.equals("php") || ext.equals("asp") || ext.equals("aspx")) {
				doc= HTMLDocument.getDocument(file,charset);
			}
			// PDF
			else if(ext.equals("pdf")) {
				doc= PDFDocument.getDocument(file);
			}
			// DOC
			else if(ext.equals("doc")) {
				doc= WordDocument.getDocument(file);
			}
		}
		else {
			// only the ContentType interface is needed here; no downcast to the
			// implementation class, and a missing content type is tolerated
			ContentType ct = ResourceUtil.getContentType(file);
			String type = ct==null?null:ct.getMimeType();
			String c = ct==null?null:ct.getCharset();
			if(c!=null) charset=c;

			if(type==null) {
				// unknown type: handled as plain file below
			}
			// HTML
			else if(type.equals("text/html")) {
				doc= HTMLDocument.getDocument(file,charset);
			}
			// PDF
			else if(type.equals("application/pdf")) {
				doc= PDFDocument.getDocument(file);
			}
			// DOC
			else if(type.equals("application/msword")) {
				doc= WordDocument.getDocument(file);
			}
		}
		if(doc==null) doc= FileDocument.getDocument(file,charset);

		String strPath=file.getPath().replace('\\', '/');
		// the name keeps its leading '/'; a path without any separator is used as it is
		// (lastIndexOf returns -1 in that case and substring(-1) would throw)
		int slash=strPath.lastIndexOf('/');
		String strName=slash==-1?strPath:strPath.substring(slash);

		doc.add(FieldUtil.UnIndexed("url", strName));

		doc.add(FieldUtil.UnIndexed("key", strPath));
		doc.add(FieldUtil.UnIndexed("path", file.getPath()));
		doc.add(FieldUtil.UnIndexed("size", Caster.toString(file.length())));
		doc.add(FieldUtil.UnIndexed("modified",DateField.timeToString(file.lastModified())));

		return doc;
	}

}