001/**
002 *
003 * Copyright (c) 2014, the Railo Company Ltd. All rights reserved.
004 *
005 * This library is free software; you can redistribute it and/or
006 * modify it under the terms of the GNU Lesser General Public
007 * License as published by the Free Software Foundation; either 
008 * version 2.1 of the License, or (at your option) any later version.
009 * 
010 * This library is distributed in the hope that it will be useful,
011 * but WITHOUT ANY WARRANTY; without even the implied warranty of
012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013 * Lesser General Public License for more details.
014 * 
015 * You should have received a copy of the GNU Lesser General Public 
016 * License along with this library.  If not, see <http://www.gnu.org/licenses/>.
017 * 
018 **/
019package lucee.runtime.search.lucene2.net;
020
021import java.io.IOException;
022import java.io.PrintWriter;
023import java.net.MalformedURLException;
024import java.net.URL;
025import java.util.ArrayList;
026import java.util.Iterator;
027import java.util.List;
028
029import lucee.commons.io.log.Log;
030import lucee.commons.io.log.LogUtil;
031import lucee.commons.io.res.util.ResourceUtil;
032import lucee.commons.lang.ExceptionUtil;
033import lucee.commons.lang.HTMLUtil;
034import lucee.commons.lang.StringUtil;
035import lucee.commons.lang.SystemOut;
036import lucee.commons.net.HTTPUtil;
037import lucee.commons.net.http.HTTPEngine;
038import lucee.commons.net.http.HTTPResponse;
039import lucee.runtime.config.Config;
040import lucee.runtime.engine.ThreadLocalPageContext;
041import lucee.runtime.search.lucene2.DocumentUtil;
042import lucee.runtime.tag.Index;
043import lucee.runtime.type.util.ArrayUtil;
044
045import org.apache.lucene.document.Document;
046import org.apache.lucene.index.IndexWriter;
047
048/**
049 * 
050 */
051public final class WebCrawler {
052    
053    private static HTMLUtil htmlUtil=new HTMLUtil();
054        private Log log;
055        
056    
057    
058    public WebCrawler(Log log) {
059        this.log=log;
060        }
061
062        
063    
064    public void parse(IndexWriter writer, URL current, String[] extensions, boolean recurse, long timeout) throws IOException {
065        translateExtension(extensions);
066        if(ArrayUtil.isEmpty(extensions))extensions=Index.EXTENSIONS;
067        _parse(log,writer,null,current,new ArrayList(), extensions,recurse,0,timeout);
068    }
069    
070
071        private static URL translateURL(URL url) throws MalformedURLException {
072                
073                
074                //print.out(url.toExternalForm());
075                String path=url.getPath();
076                int dotIndex = path.lastIndexOf('.');
077                // no dot
078                if(dotIndex==-1){
079                        if(path.endsWith("/")) return HTTPUtil.removeRef(url);
080                        
081                        
082                        return HTTPUtil.removeRef(new URL(
083                                        url.getProtocol(),
084                                        url.getHost(),
085                                        url.getPort(),
086                                        path+"/"+StringUtil.emptyIfNull(url.getQuery())));
087                }
088                //print.out("rem:"+HTTPUtil.removeRef(url));
089                return HTTPUtil.removeRef(url);
090        }   
091    
092
093    private void translateExtension(String[] extensions) {
094                for(int i=0;i<extensions.length;i++){
095                        if(extensions[i].startsWith("*."))extensions[i]=extensions[i].substring(2);
096                        else if(extensions[i].startsWith("."))extensions[i]=extensions[i].substring(1);
097                }
098        }
099
100
101        
102
103        /**
104     * @param writer
105     * @param current
106         * @param content 
107         * @throws IOException 
108     */
109
110    private static Document toDocument(StringBuffer content,IndexWriter writer, String root, URL current,long timeout) throws IOException {
111        HTTPResponse rsp = HTTPEngine.get(current, null, null, timeout,HTTPEngine.MAX_REDIRECT, null, "LuceeBot", null, null);
112        Document doc = DocumentUtil.toDocument(content,root,current, rsp);
113        
114                return doc;
115        }
116
117    protected static void _parse(Log log,IndexWriter writer, String root, URL current, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException  {
118        
119        StringBuffer content = _parseItem(log,writer, root, current, urlsDone, extensions, recurse, deep,timeout);
120        if(content!=null)_parseChildren(log,content,writer, root, current, urlsDone, extensions, recurse, deep,timeout);
121    }
122    
123    public static StringBuffer _parseItem(Log log,IndexWriter writer, String root, URL url, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException{
124        try{
125                url=translateURL(url);
126                if(urlsDone.contains(url.toExternalForm())) return null;
127                urlsDone.add(url.toExternalForm());
128                
129                StringBuffer content=new StringBuffer();            
130                Document doc=toDocument(content,writer, root, url,timeout);
131                
132                if(doc==null) return null;
133                if(writer!=null)writer.addDocument(doc);
134                
135                // Test
136                /*Resource dir = ResourcesImpl.getFileResourceProvider().getResource("/Users/mic/Temp/leeway3/");
137                if(!dir.isDirectory())dir.mkdirs();
138                Resource file=dir.getRealResource(url.toExternalForm().replace("/", "_"));
139                IOUtil.write(file, content.toString(), "UTF-8", false);*/
140                
141                info(log,url.toExternalForm());
142                return content;
143        }
144        catch(IOException ioe){
145                error(log,url.toExternalForm(),ioe);
146                throw ioe;
147        }
148    }
149    
150
151
152        protected static void _parseChildren(Log log,StringBuffer content,IndexWriter writer, String root, URL base, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException  {
153        
154
155                
156        if(recurse) {
157            List urls = htmlUtil.getURLS(content.toString(),base);
158                
159            // loop through all children
160            int len=urls.size();
161            List childIndexer=len>1?new ArrayList():null;
162            ChildrenIndexer ci;
163            //print.out("getting content");
164            
165                for(int i=0;i<len;i++) {
166                URL url=(URL) urls.get(i);
167                /*if(url.toExternalForm().indexOf("80")!=-1){
168                                SystemOut.printDate("base:"+base);
169                                SystemOut.printDate("url:"+url);
170                        }*/
171                
172                url=translateURL(url);
173               
174                if(urlsDone.contains(url.toExternalForm())) continue;
175                //urlsDone.add(url.toExternalForm());
176                
177                String protocol=url.getProtocol().toLowerCase();
178                String file=url.getPath();
179                if((protocol.equals("http") || protocol.equals("https")) && validExtension(extensions,file) &&
180                   base.getHost().equalsIgnoreCase(url.getHost())) {
181                        try {
182                                ci=new ChildrenIndexer(log,writer,root,url,urlsDone,extensions,recurse,deep+1,timeout);
183                                
184                                childIndexer.add(ci);
185                                ci.start();
186                    }
187                    catch(Throwable t) {
188                                ExceptionUtil.rethrowIfNecessary(t);
189                        //print.printST(t);
190                    }
191                }
192            }
193                
194                if(childIndexer!=null && !childIndexer.isEmpty()){
195                        Iterator it = childIndexer.iterator();
196                        while(it.hasNext()) {
197                                ci=(ChildrenIndexer) it.next();
198                                if(ci.isAlive()) {
199                                        try {
200                                                ci.join(timeout);
201                                                
202                                        } 
203                                        catch (InterruptedException e) {
204                                                //print.printST(e);
205                                        }
206                                }
207                                // timeout exceptionif(ci.isAlive()) throw new IOException("timeout occur while invoking page ["+ci.url+"]");
208                                
209                                if(ci.isAlive()){
210                                        ci.interrupt();
211                                        Config config = ThreadLocalPageContext.getConfig();
212                                        SystemOut.printDate(config!=null?config.getErrWriter():new PrintWriter(System.err),"timeout ["+timeout+" ms] occur while invoking page ["+ci.url+"]");
213                                }
214                        }
215                        
216                        //print.out("exe child");
217                        it = childIndexer.iterator();
218                        while(it.hasNext()) {
219                                ci=(ChildrenIndexer) it.next();
220                                //print.out("exec-child:"+ci.url);
221                                //print.out(content);
222                                if(ci.content!=null)_parseChildren(log,ci.content,writer, root, ci.url, urlsDone, extensions, recurse, deep,timeout);
223                        }
224                        
225                }
226                
227                
228                urls.clear();
229        }
230        //print.out("end:"+base);
231    }
232    
233
234
235    /*protected static void _sssparse(IndexWriter writer, String root, URL current, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException  {
236        current=translateURL(current);
237        print.out("start:"+current);
238        if(urlsDone.contains(current.toExternalForm())) return;
239        
240        HttpMethod method = HTTPUtil.invoke(current, null, null, -1, null, "LuceeBot", null, -1, null, null, null);
241        StringBuffer content=new StringBuffer();
242        Document doc = DocumentUtil.toDocument(content,root,current, method);
243        
244        urlsDone.add(current.toExternalForm());
245        if(doc==null) return;
246        if(writer!=null)writer.addDocument(doc);
247        
248        
249        if(recurse) {
250            List urls = htmlUtil.getURLS(content.toString(),current);
251                
252            // loop through all children
253            int len=urls.size();
254            List childIndexer=len>1?new ArrayList():null;
255            ChildrenIndexer ci;
256                for(int i=0;i<len;i++) {
257                URL url=(URL) urls.get(i);
258                String protocol=url.getProtocol().toLowerCase();
259                String file=url.getPath();
260                if((protocol.equals("http") || protocol.equals("https")) && validExtension(extensions,file) &&
261                   current.getHost().equalsIgnoreCase(url.getHost())) {
262                        
263                        //_parse(writer,root,url,urlsDone,extensions,recurse,deep+1);
264                        
265                    try {
266                        if(len==1 || true)_parse(writer,root,url,urlsDone,extensions,recurse,deep+1,timeout);
267                        else {
268                                ci=new ChildrenIndexer(writer,root,url,urlsDone,extensions,recurse,deep+1);
269                                ci.start();
270                                childIndexer.add(ci);
271                        }
272                    }
273                    catch(Throwable t) {
274                                                ExceptionUtil.rethrowIfNecessary(t);
275                    }
276                }
277            }
278                
279                if(!childIndexer.isEmpty()){
280                        Iterator it = childIndexer.iterator();
281                        while(it.hasNext()) {
282                                ci=(ChildrenIndexer) it.next();
283                                if(ci.isAlive()) {
284                                        try {
285                                                ci.join(20*1000);
286                                        } 
287                                        catch (InterruptedException e) {}
288                                }
289                        }
290                }
291                
292                
293                urls.clear();
294        }
295        print.out("end:"+current);
296    }*/
297    
298    
299
300
301        private static boolean validExtension(String[] extensions, String file) {
302                
303                String ext = ResourceUtil.getExtension(file,"");
304                ext=lucee.runtime.type.util.ListUtil.first(ext,"/",true);
305                
306                if(StringUtil.isEmpty(ext))return true;
307                for(int i=0;i<extensions.length;i++){
308                        if(ext.equalsIgnoreCase(extensions[i]))return true;
309                }
310                return false;
311        }
312
313
314    private static void info(Log log,String doc) {
315                if(log==null) return;
316                log.log(Log.LEVEL_INFO,"Webcrawler", "invoke "+doc);
317        }
318
319    private static void error(Log log,String doc, Exception e) {
320                if(log==null) return;
321                LogUtil.log(log,Log.LEVEL_ERROR,"Webcrawler", "invoke "+doc+":",e);
322        }
323}
324
325
326class ChildrenIndexer extends Thread {
327        protected IndexWriter writer;
328        protected String root;
329        protected URL url;
330        protected List urlsDone;
331        protected String[] extensions;
332        protected boolean recurse;
333        protected int deep;
334        protected StringBuffer content;
335        private long timeout;
336        private Log log;
337
338        public ChildrenIndexer(Log log,IndexWriter writer, String root, URL url,List urlsDone, String[] extensions,boolean recurse, int deep,long timeout) {
339                this.writer=writer;
340                this.root=root;
341                this.url=url;
342                this.urlsDone=urlsDone;
343                this.extensions=extensions;
344                this.recurse=recurse;
345                this.deep=deep;
346                this.timeout=timeout;
347                this.log=log;
348        }
349
350        public void run(){
351                try {
352                        //WebCrawler._parse(writer, root, url, urlsDone, extensions, recurse, deep);
353                        
354                        this.content=WebCrawler._parseItem(log,writer, root, url, urlsDone, extensions, recurse, deep,timeout+1);
355                        
356                } catch (IOException e) {}
357        }
358        
359        
360}