001 package railo.runtime.search.lucene2; 002 003 004 import java.io.IOException; 005 import java.io.InputStream; 006 import java.io.Reader; 007 import java.net.URL; 008 009 import org.apache.commons.httpclient.Header; 010 import org.apache.commons.httpclient.HttpMethod; 011 import org.apache.lucene.document.DateField; 012 import org.apache.lucene.document.Document; 013 014 import railo.commons.io.IOUtil; 015 import railo.commons.io.res.ContentTypeImpl; 016 import railo.commons.io.res.Resource; 017 import railo.commons.io.res.util.ResourceUtil; 018 import railo.commons.lang.StringUtil; 019 import railo.runtime.op.Caster; 020 import railo.runtime.search.lucene2.docs.FieldUtil; 021 import railo.runtime.search.lucene2.docs.FileDocument; 022 import railo.runtime.search.lucene2.docs.HTMLDocument; 023 import railo.runtime.search.lucene2.docs.PDFDocument; 024 import railo.runtime.search.lucene2.docs.WordDocument; 025 026 /** 027 * creates a matching Document Object to given File 028 */ 029 public final class DocumentUtil { 030 031 public static Document toDocument(StringBuffer content,String root,URL url, HttpMethod method) throws IOException { 032 if(method.getStatusCode()!=200)return null; 033 034 // get type and charset 035 Document doc=null; 036 String type=getContentType(method); 037 long len=getContentLength(method); 038 String charset="iso-8859-1"; 039 if(!StringUtil.isEmpty(type)){ 040 String[] types=type.split(";"); 041 type=types[0]; 042 if(types.length>1) { 043 String tmp=types[types.length-1]; 044 int index=tmp.indexOf("charset="); 045 if(index!=-1) { 046 charset=StringUtil.removeQuotes(tmp.substring(index+8),true); 047 } 048 } 049 } 050 Runtime rt = Runtime.getRuntime(); 051 if(len>rt.freeMemory()){ 052 Runtime.getRuntime().gc(); 053 if(len>rt.freeMemory()) return null; 054 } 055 056 //print.err("url:"+url+";chr:"+charset+";type:"+type); 057 058 if(type==null) {} 059 // HTML 060 else if(type.indexOf("text/html")!=-1) { 061 Reader r=null; 062 try{ 063 r = IOUtil.getReader(method.getResponseBodyAsStream(), charset); 064 doc= HTMLDocument.getDocument(content,r); 065 } 066 finally{ 067 IOUtil.closeEL(r); 068 } 069 } 070 // PDF 071 else if(type.indexOf("application/pdf")!=-1) { 072 InputStream is=null; 073 try{ 074 is=IOUtil.toBufferedInputStream(method.getResponseBodyAsStream()); 075 doc= PDFDocument.getDocument(content,is); 076 } 077 finally { 078 IOUtil.closeEL(is); 079 } 080 } 081 // DOC 082 else if(type.equals("application/msword")) { 083 InputStream is=null; 084 try{ 085 is=IOUtil.toBufferedInputStream(method.getResponseBodyAsStream()); 086 doc= WordDocument.getDocument(content,is); 087 } 088 finally { 089 IOUtil.closeEL(is); 090 } 091 092 } 093 // Plain 094 else if(type.indexOf("text/plain")!=-1) { 095 Reader r=null; 096 try{ 097 r=IOUtil.toBufferedReader(IOUtil.getReader(method.getResponseBodyAsStream(), charset)); 098 doc= FileDocument.getDocument(content,r); 099 } 100 finally { 101 IOUtil.closeEL(r); 102 } 103 } 104 105 if(doc!=null){ 106 String strPath=url.toExternalForm(); 107 108 doc.add(FieldUtil.UnIndexed("url", strPath)); 109 doc.add(FieldUtil.UnIndexed("key", strPath)); 110 doc.add(FieldUtil.UnIndexed("path", strPath)); 111 //doc.add(FieldUtil.UnIndexed("size", Caster.toString(file.length()))); 112 //doc.add(FieldUtil.Keyword("modified",DateField.timeToString(file.lastModified()))); 113 } 114 115 return doc; 116 117 } 118 119 private static String getContentType(HttpMethod method) { 120 Header ct = method.getResponseHeader("Content-Type"); 121 if(!StringUtil.isEmpty(ct)) return ct.getValue().toLowerCase(); 122 Header[] headers = method.getResponseHeaders(); 123 for(int i=0;i<headers.length;i++){ 124 if("Content-Type".equalsIgnoreCase(headers[i].getName())){ 125 if(!StringUtil.isEmpty(headers[i].getValue()))return headers[i].getValue().toLowerCase(); 126 return null; 127 } 128 } 129 return null; 130 } 131 132 private static long getContentLength(HttpMethod method) { 133 Header ct = method.getResponseHeader("Content-Length"); 134 if(!StringUtil.isEmpty(ct)) return Caster.toLongValue(ct.getValue(),-1); 135 Header[] headers = method.getResponseHeaders(); 136 for(int i=0;i<headers.length;i++){ 137 if("Content-Length".equalsIgnoreCase(headers[i].getName())){ 138 if(!StringUtil.isEmpty(headers[i].getValue()))return Caster.toLongValue(headers[i].getValue(),-1); 139 return -1; 140 } 141 } 142 return -1; 143 } 144 145 146 147 148 /** 149 * translate the file to a Document Object 150 * @param file 151 * @return 152 * @throws InterruptedException 153 * @throws IOException 154 */ 155 public static Document toDocument(Resource file,String url,String charset) throws IOException { 156 String ext = ResourceUtil.getExtension(file,null); 157 158 159 Document doc=null; 160 if(ext!=null) { 161 ext=ext.toLowerCase(); 162 //String mimeType=new MimetypesFileTypeMap().getContentType(f); 163 // HTML 164 if(ext.equals("htm") || ext.equals("html") || ext.equals("cfm") || ext.equals("cfml") || ext.equals("php") || ext.equals("asp") || ext.equals("aspx")) { 165 doc= HTMLDocument.getDocument(file,charset); 166 } 167 // PDF 168 else if(ext.equals("pdf")) { 169 doc= PDFDocument.getDocument(file); 170 } 171 // DOC 172 else if(ext.equals("doc")) { 173 doc= WordDocument.getDocument(file); 174 } 175 } 176 else { 177 ContentTypeImpl ct = (ContentTypeImpl) ResourceUtil.getContentType(file); 178 String type = ct.getMimeType(); 179 String c=ct.getCharset(); 180 if(c!=null) charset=c; 181 //String type=ResourceUtil.getMymeType(file,""); 182 if(type==null) {} 183 // HTML 184 else if(type.equals("text/html")) { 185 doc= HTMLDocument.getDocument(file,charset); 186 } 187 // PDF 188 else if(type.equals("application/pdf")) { 189 doc= PDFDocument.getDocument(file); 190 } 191 // DOC 192 else if(type.equals("application/msword")) { 193 doc= WordDocument.getDocument(file); 194 } 195 } 196 if(doc==null) doc= FileDocument.getDocument(file,charset); 197 198 String strPath=file.getPath().replace('\\', '/'); 199 String strName=strPath.substring(strPath.lastIndexOf('/')); 200 201 202 doc.add(FieldUtil.UnIndexed("url", strName)); 203 204 doc.add(FieldUtil.UnIndexed("key", strPath)); 205 doc.add(FieldUtil.UnIndexed("path", file.getPath())); 206 doc.add(FieldUtil.UnIndexed("size", Caster.toString(file.length()))); 207 doc.add(FieldUtil.UnIndexed("modified",DateField.timeToString(file.lastModified()))); 208 209 210 return doc; 211 } 212 213 }