package railo.runtime.search.lucene2;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.net.URL;

import org.apache.lucene.document.DateField;
import org.apache.lucene.document.Document;

import railo.commons.io.IOUtil;
import railo.commons.io.res.ContentType;
import railo.commons.io.res.ContentTypeImpl;
import railo.commons.io.res.Resource;
import railo.commons.io.res.util.ResourceUtil;
import railo.commons.net.http.HTTPResponse;
import railo.runtime.op.Caster;
import railo.runtime.search.lucene2.docs.FieldUtil;
import railo.runtime.search.lucene2.docs.FileDocument;
import railo.runtime.search.lucene2.docs.HTMLDocument;
import railo.runtime.search.lucene2.docs.PDFDocument;
import railo.runtime.search.lucene2.docs.WordDocument;

/**
 * Creates the matching Lucene {@link Document} for a given file or HTTP response.
 */
public final class DocumentUtil {

    /**
     * Translates an HTTP response into a Document, choosing the document builder
     * from the response's mime type (HTML, PDF, MS Word or plain text).
     * On success the fields "url", "key" and "path" are added, all set to
     * {@code url.toExternalForm()}.
     *
     * @param content buffer handed through to the document builders
     *                (presumably receives the extracted text; confirm against the builders)
     * @param root    not used by this method
     * @param url     address the response was loaded from
     * @param method  the HTTP response to translate
     * @return the Document, or null when the status code is not 200, when the declared
     *         content length exceeds the free memory even after a gc, or when the
     *         mime type is missing or not one of the supported types
     * @throws IOException when reading the response body fails
     */
    public static Document toDocument(StringBuffer content, String root, URL url, HTTPResponse method) throws IOException {
        if (method.getStatusCode() != 200) return null;

        // get type and charset
        Document doc = null;
        ContentType ct = method.getContentType();
        long len = method.getContentLength();
        // NOTE(review): the iso-8859-1 fallback only applies when there is no content type at all,
        // in which case no reader is opened below; a content type without charset hands null
        // on to IOUtil.getReader. Confirm this is intended.
        String charset = ct == null ? "iso-8859-1" : ct.getCharset();

        // skip responses that do not fit into the free memory, even after asking for a gc
        Runtime rt = Runtime.getRuntime();
        if (len > rt.freeMemory()) {
            rt.gc();
            if (len > rt.freeMemory()) return null;
        }

        String mimeType = ct == null ? null : ct.getMimeType();

        if (mimeType == null) {
            // unknown type, no document is created
        }
        // HTML
        else if (mimeType.indexOf("text/html") != -1) {
            Reader r = null;
            try {
                r = IOUtil.getReader(method.getContentAsStream(), charset);
                doc = HTMLDocument.getDocument(content, r);
            }
            finally {
                IOUtil.closeEL(r);
            }
        }
        // PDF
        else if (mimeType.indexOf("application/pdf") != -1) {
            InputStream is = null;
            try {
                is = IOUtil.toBufferedInputStream(method.getContentAsStream());
                doc = PDFDocument.getDocument(content, is);
            }
            finally {
                IOUtil.closeEL(is);
            }
        }
        // DOC
        else if (mimeType.equals("application/msword")) {
            InputStream is = null;
            try {
                is = IOUtil.toBufferedInputStream(method.getContentAsStream());
                doc = WordDocument.getDocument(content, is);
            }
            finally {
                IOUtil.closeEL(is);
            }
        }
        // Plain
        else if (mimeType.indexOf("text/plain") != -1) {
            Reader r = null;
            try {
                r = IOUtil.toBufferedReader(IOUtil.getReader(method.getContentAsStream(), charset));
                doc = FileDocument.getDocument(content, r);
            }
            finally {
                IOUtil.closeEL(r);
            }
        }

        if (doc != null) {
            String strPath = url.toExternalForm();

            doc.add(FieldUtil.UnIndexed("url", strPath));
            doc.add(FieldUtil.UnIndexed("key", strPath));
            doc.add(FieldUtil.UnIndexed("path", strPath));
            // "size" and "modified" are not stored for URL based documents
        }

        return doc;
    }

    /**
     * Translates the file to a Document. The document builder is chosen by the file
     * extension; when the file has no extension, by the content type reported by
     * {@link ResourceUtil#getContentType(Resource)}. Anything not recognized as HTML,
     * PDF or MS Word is handled by {@link FileDocument}.
     * The fields "url", "key", "path", "size" and "modified" are added to the result.
     *
     * @param file    resource to translate
     * @param url     not used by this method; the "url" field is derived from the file path
     * @param charset charset used for HTML and plain files; replaced by the charset of the
     *                detected content type when the file has no extension and one is reported
     * @return the Document for the file, never null
     * @throws IOException when the file cannot be read
     */
    public static Document toDocument(Resource file, String url, String charset) throws IOException {
        String ext = ResourceUtil.getExtension(file, null);

        Document doc = null;
        if (ext != null) {
            ext = ext.toLowerCase();
            // HTML
            if (ext.equals("htm") || ext.equals("html") || ext.equals("cfm") || ext.equals("cfml")
                    || ext.equals("php") || ext.equals("asp") || ext.equals("aspx")) {
                doc = HTMLDocument.getDocument(file, charset);
            }
            // PDF
            else if (ext.equals("pdf")) {
                doc = PDFDocument.getDocument(file);
            }
            // DOC
            else if (ext.equals("doc")) {
                doc = WordDocument.getDocument(file);
            }
        }
        else {
            // work against the interface: only getMimeType()/getCharset() are needed, and a
            // down-cast to ContentTypeImpl fails for every other implementation
            ContentType ct = ResourceUtil.getContentType(file);
            if (ct != null) {
                String type = ct.getMimeType();
                String c = ct.getCharset();
                if (c != null) charset = c;

                if (type == null) {
                    // unknown type, handled as plain file below
                }
                // HTML
                else if (type.equals("text/html")) {
                    doc = HTMLDocument.getDocument(file, charset);
                }
                // PDF
                else if (type.equals("application/pdf")) {
                    doc = PDFDocument.getDocument(file);
                }
                // DOC
                else if (type.equals("application/msword")) {
                    doc = WordDocument.getDocument(file);
                }
            }
        }
        if (doc == null) doc = FileDocument.getDocument(file, charset);

        String strPath = file.getPath().replace('\\', '/');
        // the name keeps its leading '/'; a path without any separator is used as it is
        // (substring(-1) would throw a StringIndexOutOfBoundsException)
        int lastSlash = strPath.lastIndexOf('/');
        String strName = lastSlash == -1 ? strPath : strPath.substring(lastSlash);

        doc.add(FieldUtil.UnIndexed("url", strName));
        doc.add(FieldUtil.UnIndexed("key", strPath));
        doc.add(FieldUtil.UnIndexed("path", file.getPath()));
        doc.add(FieldUtil.UnIndexed("size", Caster.toString(file.length())));
        doc.add(FieldUtil.UnIndexed("modified", DateField.timeToString(file.lastModified())));

        return doc;
    }

}