001/**
002 *
003 * Copyright (c) 2014, the Railo Company Ltd. All rights reserved.
004 *
005 * This library is free software; you can redistribute it and/or
006 * modify it under the terms of the GNU Lesser General Public
007 * License as published by the Free Software Foundation; either 
008 * version 2.1 of the License, or (at your option) any later version.
009 * 
010 * This library is distributed in the hope that it will be useful,
011 * but WITHOUT ANY WARRANTY; without even the implied warranty of
012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013 * Lesser General Public License for more details.
014 * 
015 * You should have received a copy of the GNU Lesser General Public 
016 * License along with this library.  If not, see <http://www.gnu.org/licenses/>.
017 * 
018 **/
019package lucee.runtime.search.lucene2;
020
021
022import java.io.IOException;
023import java.io.InputStream;
024import java.io.Reader;
025import java.net.URL;
026
027import lucee.commons.io.IOUtil;
028import lucee.commons.io.res.ContentType;
029import lucee.commons.io.res.ContentTypeImpl;
030import lucee.commons.io.res.Resource;
031import lucee.commons.io.res.util.ResourceUtil;
032import lucee.commons.net.http.HTTPResponse;
033import lucee.runtime.op.Caster;
034import lucee.runtime.search.lucene2.docs.FieldUtil;
035import lucee.runtime.search.lucene2.docs.FileDocument;
036import lucee.runtime.search.lucene2.docs.HTMLDocument;
037import lucee.runtime.search.lucene2.docs.PDFDocument;
038import lucee.runtime.search.lucene2.docs.WordDocument;
039
040import org.apache.lucene.document.DateField;
041import org.apache.lucene.document.Document;
042
043/**
044 * creates a matching Document Object to given File
045 */
046public final class DocumentUtil {
047
048        public static Document toDocument(StringBuffer content,String root,URL url, HTTPResponse method) throws IOException {
049        if(method.getStatusCode()!=200)return null;
050        
051                // get type and charset
052                Document doc=null;
053                ContentType ct = method.getContentType();
054                long len=method.getContentLength();
055                String charset=ct==null?"iso-8859-1":ct.getCharset();
056        
057        Runtime rt = Runtime.getRuntime();
058        if(len>rt.freeMemory()){
059                Runtime.getRuntime().gc();
060                if(len>rt.freeMemory()) return null;
061        }
062                
063        //print.err("url:"+url+";chr:"+charset+";type:"+type);
064        
065        if(ct==null || ct.getMimeType()==null)  {}
066        // HTML
067        else if(ct.getMimeType().indexOf("text/html")!=-1) {
068                Reader r=null;
069                try{
070                        r = IOUtil.getReader(method.getContentAsStream(), charset);
071                        doc= HTMLDocument.getDocument(content,r);
072                }
073                finally{
074                        IOUtil.closeEL(r);
075                }
076        }
077        // PDF
078        else if(ct.getMimeType().indexOf("application/pdf")!=-1) {
079                InputStream is=null;
080                try{
081                        is=IOUtil.toBufferedInputStream(method.getContentAsStream());
082                        doc= PDFDocument.getDocument(content,is);
083                }
084                finally {
085                        IOUtil.closeEL(is);
086                }
087        }
088        // DOC
089        else if(ct.getMimeType().equals("application/msword")) {
090                InputStream is=null;
091                try{
092                        is=IOUtil.toBufferedInputStream(method.getContentAsStream());
093                        doc= WordDocument.getDocument(content,is);
094                }
095                finally {
096                        IOUtil.closeEL(is);
097                }
098            
099        }
100        // Plain
101        else if(ct.getMimeType().indexOf("text/plain")!=-1) {
102                Reader r=null;
103                try{
104                        r=IOUtil.toBufferedReader(IOUtil.getReader(method.getContentAsStream(), charset));
105                        doc= FileDocument.getDocument(content,r);
106                }
107                finally {
108                        IOUtil.closeEL(r);
109                }
110        }
111        
112        if(doc!=null){
113                String strPath=url.toExternalForm();
114           
115            doc.add(FieldUtil.UnIndexed("url", strPath));
116            doc.add(FieldUtil.UnIndexed("key", strPath));
117            doc.add(FieldUtil.UnIndexed("path", strPath));
118            //doc.add(FieldUtil.UnIndexed("size", Caster.toString(file.length())));
119            //doc.add(FieldUtil.Keyword("modified",DateField.timeToString(file.lastModified())));
120        }
121        
122        return doc;
123        
124    }
125        
126    /**
127     * translate the file to a Document Object
128     * @param file
129     * @return
130     * @throws InterruptedException
131     * @throws IOException
132     */
133    public static Document toDocument(Resource file,String url,String charset) throws IOException {
134        String ext = ResourceUtil.getExtension(file,null);
135        
136       
137        Document doc=null;
138        if(ext!=null) {
139            ext=ext.toLowerCase();
140            //String mimeType=new MimetypesFileTypeMap().getContentType(f);
141            // HTML
142            if(ext.equals("htm") || ext.equals("html") || ext.equals("cfm") || ext.equals("cfml") || ext.equals("php") || ext.equals("asp") || ext.equals("aspx")) {
143                doc= HTMLDocument.getDocument(file,charset);
144            }
145            // PDF
146            else if(ext.equals("pdf")) {
147                doc= PDFDocument.getDocument(file);
148            }
149            // DOC
150            else if(ext.equals("doc")) {
151                doc= WordDocument.getDocument(file);
152            }
153        }
154        else { 
155                ContentTypeImpl ct = (ContentTypeImpl) ResourceUtil.getContentType(file);
156                String type = ct.getMimeType();
157                String c=ct.getCharset();
158                if(c!=null) charset=c;
159            //String type=ResourceUtil.getMimeType(file,"");
160            if(type==null)  {}
161            // HTML
162            else if(type.equals("text/html")) {
163                doc= HTMLDocument.getDocument(file,charset);
164            }
165            // PDF
166            else if(type.equals("application/pdf")) {
167                doc= PDFDocument.getDocument(file);
168            }
169            // DOC
170            else if(type.equals("application/msword")) {
171                doc= WordDocument.getDocument(file);
172            }
173        }
174        if(doc==null) doc= FileDocument.getDocument(file,charset);
175        
176        String strPath=file.getPath().replace('\\', '/');
177            String strName=strPath.substring(strPath.lastIndexOf('/'));
178            
179            
180            doc.add(FieldUtil.UnIndexed("url", strName));
181            
182            doc.add(FieldUtil.UnIndexed("key", strPath));
183            doc.add(FieldUtil.UnIndexed("path", file.getPath()));
184            doc.add(FieldUtil.UnIndexed("size", Caster.toString(file.length())));
185            doc.add(FieldUtil.UnIndexed("modified",DateField.timeToString(file.lastModified())));
186        
187        
188        return doc;
189    }
190    
191}