001    package railo.runtime.search.lucene2;
002    
003    
004    import java.io.IOException;
005    import java.io.InputStream;
006    import java.io.Reader;
007    import java.net.URL;
008    
009    import org.apache.commons.httpclient.Header;
010    import org.apache.commons.httpclient.HttpMethod;
011    import org.apache.lucene.document.DateField;
012    import org.apache.lucene.document.Document;
013    
014    import railo.commons.io.IOUtil;
015    import railo.commons.io.res.ContentTypeImpl;
016    import railo.commons.io.res.Resource;
017    import railo.commons.io.res.util.ResourceUtil;
018    import railo.commons.lang.StringUtil;
019    import railo.runtime.op.Caster;
020    import railo.runtime.search.lucene2.docs.FieldUtil;
021    import railo.runtime.search.lucene2.docs.FileDocument;
022    import railo.runtime.search.lucene2.docs.HTMLDocument;
023    import railo.runtime.search.lucene2.docs.PDFDocument;
024    import railo.runtime.search.lucene2.docs.WordDocument;
025    
026    /**
027     * creates a matching Document Object to given File
028     */
029    public final class DocumentUtil {
030    
031            public static Document toDocument(StringBuffer content,String root,URL url, HttpMethod method) throws IOException {
032            if(method.getStatusCode()!=200)return null;
033            
034                    // get type and charset
035                    Document doc=null;
036                    String type=getContentType(method);
037                    long len=getContentLength(method);
038                    String charset="iso-8859-1";
039            if(!StringUtil.isEmpty(type)){
040                    String[] types=type.split(";");
041                    type=types[0];
042                    if(types.length>1) {
043                    String tmp=types[types.length-1];
044                    int index=tmp.indexOf("charset=");
045                    if(index!=-1) {
046                            charset=StringUtil.removeQuotes(tmp.substring(index+8),true);
047                    }
048                }
049            }
050            Runtime rt = Runtime.getRuntime();
051            if(len>rt.freeMemory()){
052                    Runtime.getRuntime().gc();
053                    if(len>rt.freeMemory()) return null;
054            }
055                    
056            //print.err("url:"+url+";chr:"+charset+";type:"+type);
057            
058            if(type==null)  {}
059            // HTML
060            else if(type.indexOf("text/html")!=-1) {
061                    Reader r=null;
062                    try{
063                            r = IOUtil.getReader(method.getResponseBodyAsStream(), charset);
064                            doc= HTMLDocument.getDocument(content,r);
065                    }
066                    finally{
067                            IOUtil.closeEL(r);
068                    }
069            }
070            // PDF
071            else if(type.indexOf("application/pdf")!=-1) {
072                    InputStream is=null;
073                    try{
074                            is=IOUtil.toBufferedInputStream(method.getResponseBodyAsStream());
075                            doc= PDFDocument.getDocument(content,is);
076                    }
077                    finally {
078                            IOUtil.closeEL(is);
079                    }
080            }
081            // DOC
082            else if(type.equals("application/msword")) {
083                    InputStream is=null;
084                    try{
085                            is=IOUtil.toBufferedInputStream(method.getResponseBodyAsStream());
086                            doc= WordDocument.getDocument(content,is);
087                    }
088                    finally {
089                            IOUtil.closeEL(is);
090                    }
091                
092            }
093            // Plain
094            else if(type.indexOf("text/plain")!=-1) {
095                    Reader r=null;
096                    try{
097                            r=IOUtil.toBufferedReader(IOUtil.getReader(method.getResponseBodyAsStream(), charset));
098                            doc= FileDocument.getDocument(content,r);
099                    }
100                    finally {
101                            IOUtil.closeEL(r);
102                    }
103            }
104            
105            if(doc!=null){
106                    String strPath=url.toExternalForm();
107               
108                doc.add(FieldUtil.UnIndexed("url", strPath));
109                doc.add(FieldUtil.UnIndexed("key", strPath));
110                doc.add(FieldUtil.UnIndexed("path", strPath));
111                //doc.add(FieldUtil.UnIndexed("size", Caster.toString(file.length())));
112                //doc.add(FieldUtil.Keyword("modified",DateField.timeToString(file.lastModified())));
113            }
114            
115            return doc;
116            
117        }
118    
119            private static String getContentType(HttpMethod method) {
120                    Header ct = method.getResponseHeader("Content-Type");
121                    if(!StringUtil.isEmpty(ct)) return ct.getValue().toLowerCase();
122                    Header[] headers = method.getResponseHeaders();
123                    for(int i=0;i<headers.length;i++){
124                            if("Content-Type".equalsIgnoreCase(headers[i].getName())){
125                                    if(!StringUtil.isEmpty(headers[i].getValue()))return headers[i].getValue().toLowerCase();
126                                    return null;
127                            }
128                    }
129            return null;
130            }
131    
132            private static long getContentLength(HttpMethod method) {
133                    Header ct = method.getResponseHeader("Content-Length");
134                    if(!StringUtil.isEmpty(ct)) return Caster.toLongValue(ct.getValue(),-1);
135                    Header[] headers = method.getResponseHeaders();
136                    for(int i=0;i<headers.length;i++){
137                            if("Content-Length".equalsIgnoreCase(headers[i].getName())){
138                                    if(!StringUtil.isEmpty(headers[i].getValue()))return Caster.toLongValue(headers[i].getValue(),-1);
139                                    return -1;
140                            }
141                    }
142            return -1;
143            }
144            
145            
146            
147            
148        /**
149         * translate the file to a Document Object
150         * @param file
151         * @return
152         * @throws InterruptedException
153         * @throws IOException
154         */
155        public static Document toDocument(Resource file,String url,String charset) throws IOException {
156            String ext = ResourceUtil.getExtension(file,null);
157            
158           
159            Document doc=null;
160            if(ext!=null) {
161                ext=ext.toLowerCase();
162                //String mimeType=new MimetypesFileTypeMap().getContentType(f);
163                // HTML
164                if(ext.equals("htm") || ext.equals("html") || ext.equals("cfm") || ext.equals("cfml") || ext.equals("php") || ext.equals("asp") || ext.equals("aspx")) {
165                    doc= HTMLDocument.getDocument(file,charset);
166                }
167                // PDF
168                else if(ext.equals("pdf")) {
169                    doc= PDFDocument.getDocument(file);
170                }
171                // DOC
172                else if(ext.equals("doc")) {
173                    doc= WordDocument.getDocument(file);
174                }
175            }
176            else { 
177                    ContentTypeImpl ct = (ContentTypeImpl) ResourceUtil.getContentType(file);
178                    String type = ct.getMimeType();
179                    String c=ct.getCharset();
180                    if(c!=null) charset=c;
181                //String type=ResourceUtil.getMymeType(file,"");
182                if(type==null)  {}
183                // HTML
184                else if(type.equals("text/html")) {
185                    doc= HTMLDocument.getDocument(file,charset);
186                }
187                // PDF
188                else if(type.equals("application/pdf")) {
189                    doc= PDFDocument.getDocument(file);
190                }
191                // DOC
192                else if(type.equals("application/msword")) {
193                    doc= WordDocument.getDocument(file);
194                }
195            }
196            if(doc==null) doc= FileDocument.getDocument(file,charset);
197            
198            String strPath=file.getPath().replace('\\', '/');
199                String strName=strPath.substring(strPath.lastIndexOf('/'));
200                
201                
202                doc.add(FieldUtil.UnIndexed("url", strName));
203                
204                doc.add(FieldUtil.UnIndexed("key", strPath));
205                doc.add(FieldUtil.UnIndexed("path", file.getPath()));
206                doc.add(FieldUtil.UnIndexed("size", Caster.toString(file.length())));
207                doc.add(FieldUtil.UnIndexed("modified",DateField.timeToString(file.lastModified())));
208            
209            
210            return doc;
211        }
212        
213    }