001    package railo.runtime.search.lucene2;
002    
003    
004    import java.io.IOException;
005    import java.io.InputStream;
006    import java.io.Reader;
007    import java.net.URL;
008    
009    import org.apache.lucene.document.DateField;
010    import org.apache.lucene.document.Document;
011    
012    import railo.commons.io.IOUtil;
013    import railo.commons.io.res.ContentType;
014    import railo.commons.io.res.ContentTypeImpl;
015    import railo.commons.io.res.Resource;
016    import railo.commons.io.res.util.ResourceUtil;
017    import railo.commons.net.http.HTTPResponse;
018    import railo.runtime.op.Caster;
019    import railo.runtime.search.lucene2.docs.FieldUtil;
020    import railo.runtime.search.lucene2.docs.FileDocument;
021    import railo.runtime.search.lucene2.docs.HTMLDocument;
022    import railo.runtime.search.lucene2.docs.PDFDocument;
023    import railo.runtime.search.lucene2.docs.WordDocument;
024    
025    /**
026     * creates a matching Document Object to given File
027     */
028    public final class DocumentUtil {
029    
030            public static Document toDocument(StringBuffer content,String root,URL url, HTTPResponse method) throws IOException {
031            if(method.getStatusCode()!=200)return null;
032            
033                    // get type and charset
034                    Document doc=null;
035                    ContentType ct = method.getContentType();
036                    long len=method.getContentLength();
037                    String charset=ct==null?"iso-8859-1":ct.getCharset();
038            
039            Runtime rt = Runtime.getRuntime();
040            if(len>rt.freeMemory()){
041                    Runtime.getRuntime().gc();
042                    if(len>rt.freeMemory()) return null;
043            }
044                    
045            //print.err("url:"+url+";chr:"+charset+";type:"+type);
046            
047            if(ct==null || ct.getMimeType()==null)  {}
048            // HTML
049            else if(ct.getMimeType().indexOf("text/html")!=-1) {
050                    Reader r=null;
051                    try{
052                            r = IOUtil.getReader(method.getContentAsStream(), charset);
053                            doc= HTMLDocument.getDocument(content,r);
054                    }
055                    finally{
056                            IOUtil.closeEL(r);
057                    }
058            }
059            // PDF
060            else if(ct.getMimeType().indexOf("application/pdf")!=-1) {
061                    InputStream is=null;
062                    try{
063                            is=IOUtil.toBufferedInputStream(method.getContentAsStream());
064                            doc= PDFDocument.getDocument(content,is);
065                    }
066                    finally {
067                            IOUtil.closeEL(is);
068                    }
069            }
070            // DOC
071            else if(ct.getMimeType().equals("application/msword")) {
072                    InputStream is=null;
073                    try{
074                            is=IOUtil.toBufferedInputStream(method.getContentAsStream());
075                            doc= WordDocument.getDocument(content,is);
076                    }
077                    finally {
078                            IOUtil.closeEL(is);
079                    }
080                
081            }
082            // Plain
083            else if(ct.getMimeType().indexOf("text/plain")!=-1) {
084                    Reader r=null;
085                    try{
086                            r=IOUtil.toBufferedReader(IOUtil.getReader(method.getContentAsStream(), charset));
087                            doc= FileDocument.getDocument(content,r);
088                    }
089                    finally {
090                            IOUtil.closeEL(r);
091                    }
092            }
093            
094            if(doc!=null){
095                    String strPath=url.toExternalForm();
096               
097                doc.add(FieldUtil.UnIndexed("url", strPath));
098                doc.add(FieldUtil.UnIndexed("key", strPath));
099                doc.add(FieldUtil.UnIndexed("path", strPath));
100                //doc.add(FieldUtil.UnIndexed("size", Caster.toString(file.length())));
101                //doc.add(FieldUtil.Keyword("modified",DateField.timeToString(file.lastModified())));
102            }
103            
104            return doc;
105            
106        }
107            
108        /**
109         * translate the file to a Document Object
110         * @param file
111         * @return
112         * @throws InterruptedException
113         * @throws IOException
114         */
115        public static Document toDocument(Resource file,String url,String charset) throws IOException {
116            String ext = ResourceUtil.getExtension(file,null);
117            
118           
119            Document doc=null;
120            if(ext!=null) {
121                ext=ext.toLowerCase();
122                //String mimeType=new MimetypesFileTypeMap().getContentType(f);
123                // HTML
124                if(ext.equals("htm") || ext.equals("html") || ext.equals("cfm") || ext.equals("cfml") || ext.equals("php") || ext.equals("asp") || ext.equals("aspx")) {
125                    doc= HTMLDocument.getDocument(file,charset);
126                }
127                // PDF
128                else if(ext.equals("pdf")) {
129                    doc= PDFDocument.getDocument(file);
130                }
131                // DOC
132                else if(ext.equals("doc")) {
133                    doc= WordDocument.getDocument(file);
134                }
135            }
136            else { 
137                    ContentTypeImpl ct = (ContentTypeImpl) ResourceUtil.getContentType(file);
138                    String type = ct.getMimeType();
139                    String c=ct.getCharset();
140                    if(c!=null) charset=c;
141                //String type=ResourceUtil.getMimeType(file,"");
142                if(type==null)  {}
143                // HTML
144                else if(type.equals("text/html")) {
145                    doc= HTMLDocument.getDocument(file,charset);
146                }
147                // PDF
148                else if(type.equals("application/pdf")) {
149                    doc= PDFDocument.getDocument(file);
150                }
151                // DOC
152                else if(type.equals("application/msword")) {
153                    doc= WordDocument.getDocument(file);
154                }
155            }
156            if(doc==null) doc= FileDocument.getDocument(file,charset);
157            
158            String strPath=file.getPath().replace('\\', '/');
159                String strName=strPath.substring(strPath.lastIndexOf('/'));
160                
161                
162                doc.add(FieldUtil.UnIndexed("url", strName));
163                
164                doc.add(FieldUtil.UnIndexed("key", strPath));
165                doc.add(FieldUtil.UnIndexed("path", file.getPath()));
166                doc.add(FieldUtil.UnIndexed("size", Caster.toString(file.length())));
167                doc.add(FieldUtil.UnIndexed("modified",DateField.timeToString(file.lastModified())));
168            
169            
170            return doc;
171        }
172        
173    }