001    package railo.runtime.search.lucene2.net;
002    
003    import java.io.IOException;
004    import java.io.PrintWriter;
005    import java.net.MalformedURLException;
006    import java.net.URL;
007    import java.util.ArrayList;
008    import java.util.Iterator;
009    import java.util.List;
010    
011    import org.apache.lucene.document.Document;
012    import org.apache.lucene.index.IndexWriter;
013    
014    import railo.commons.io.log.Log;
015    import railo.commons.io.log.LogAndSource;
016    import railo.commons.io.res.util.ResourceUtil;
017    import railo.commons.lang.HTMLUtil;
018    import railo.commons.lang.StringUtil;
019    import railo.commons.lang.SystemOut;
020    import railo.commons.net.HTTPUtil;
021    import railo.commons.net.http.HTTPEngine;
022    import railo.commons.net.http.HTTPResponse;
023    import railo.runtime.config.Config;
024    import railo.runtime.engine.ThreadLocalPageContext;
025    import railo.runtime.search.lucene2.DocumentUtil;
026    import railo.runtime.tag.Index;
027    import railo.runtime.type.util.ArrayUtil;
028    
029    /**
030     * 
031     */
032    public final class WebCrawler {
033        
034        private static HTMLUtil htmlUtil=new HTMLUtil();
035            private LogAndSource log;
036            
037        
038        
039        public WebCrawler(LogAndSource log) {
040            this.log=log;
041            }
042    
043            
044        
045        public void parse(IndexWriter writer, URL current, String[] extensions, boolean recurse, long timeout) throws IOException {
046            translateExtension(extensions);
047            if(ArrayUtil.isEmpty(extensions))extensions=Index.EXTENSIONS;
048            _parse(log,writer,null,current,new ArrayList(), extensions,recurse,0,timeout);
049        }
050        
051    
052            private static URL translateURL(URL url) throws MalformedURLException {
053                    
054                    
055                    //print.out(url.toExternalForm());
056                    String path=url.getPath();
057                    int dotIndex = path.lastIndexOf('.');
058                    // no dot
059                    if(dotIndex==-1){
060                            if(path.endsWith("/")) return HTTPUtil.removeRef(url);
061                            
062                            
063                            return HTTPUtil.removeRef(new URL(
064                                            url.getProtocol(),
065                                            url.getHost(),
066                                            url.getPort(),
067                                            path+"/"+StringUtil.emptyIfNull(url.getQuery())));
068                    }
069                    //print.out("rem:"+HTTPUtil.removeRef(url));
070                    return HTTPUtil.removeRef(url);
071            }   
072        
073    
074        private void translateExtension(String[] extensions) {
075                    for(int i=0;i<extensions.length;i++){
076                            if(extensions[i].startsWith("*."))extensions[i]=extensions[i].substring(2);
077                            else if(extensions[i].startsWith("."))extensions[i]=extensions[i].substring(1);
078                    }
079            }
080    
081    
082            
083    
084            /**
085         * @param writer
086         * @param current
087             * @param content 
088             * @throws IOException 
089         */
090    
091        private static Document toDocument(StringBuffer content,IndexWriter writer, String root, URL current,long timeout) throws IOException {
092            HTTPResponse rsp = HTTPEngine.get(current, null, null, timeout,HTTPEngine.MAX_REDIRECT, null, "RailoBot", null, null);
093            Document doc = DocumentUtil.toDocument(content,root,current, rsp);
094            
095                    return doc;
096            }
097    
098        protected static void _parse(Log log,IndexWriter writer, String root, URL current, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException  {
099            
100            StringBuffer content = _parseItem(log,writer, root, current, urlsDone, extensions, recurse, deep,timeout);
101            if(content!=null)_parseChildren(log,content,writer, root, current, urlsDone, extensions, recurse, deep,timeout);
102        }
103        
104        public static StringBuffer _parseItem(Log log,IndexWriter writer, String root, URL url, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException{
105            try{
106                    url=translateURL(url);
107                    if(urlsDone.contains(url.toExternalForm())) return null;
108                    urlsDone.add(url.toExternalForm());
109                    
110                    StringBuffer content=new StringBuffer();            
111                    Document doc=toDocument(content,writer, root, url,timeout);
112                    
113                    if(doc==null) return null;
114                    if(writer!=null)writer.addDocument(doc);
115                    
116                    // Test
117                    /*Resource dir = ResourcesImpl.getFileResourceProvider().getResource("/Users/mic/Temp/leeway3/");
118                    if(!dir.isDirectory())dir.mkdirs();
119                    Resource file=dir.getRealResource(url.toExternalForm().replace("/", "_"));
120                    IOUtil.write(file, content.toString(), "UTF-8", false);*/
121                    
122                    info(log,url.toExternalForm());
123                    return content;
124            }
125            catch(IOException ioe){
126                    error(log,url.toExternalForm(),ioe);
127                    throw ioe;
128            }
129        }
130        
131    
132    
133            protected static void _parseChildren(Log log,StringBuffer content,IndexWriter writer, String root, URL base, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException  {
134            
135    
136                    
137            if(recurse) {
138                List urls = htmlUtil.getURLS(content.toString(),base);
139                    
140                // loop through all children
141                int len=urls.size();
142                List childIndexer=len>1?new ArrayList():null;
143                ChildrenIndexer ci;
144                //print.out("getting content");
145                
146                    for(int i=0;i<len;i++) {
147                    URL url=(URL) urls.get(i);
148                    /*if(url.toExternalForm().indexOf("80")!=-1){
149                                    SystemOut.printDate("base:"+base);
150                                    SystemOut.printDate("url:"+url);
151                            }*/
152                    
153                    url=translateURL(url);
154                   
155                    if(urlsDone.contains(url.toExternalForm())) continue;
156                    //urlsDone.add(url.toExternalForm());
157                    
158                    String protocol=url.getProtocol().toLowerCase();
159                    String file=url.getPath();
160                    if((protocol.equals("http") || protocol.equals("https")) && validExtension(extensions,file) &&
161                       base.getHost().equalsIgnoreCase(url.getHost())) {
162                            try {
163                                    ci=new ChildrenIndexer(log,writer,root,url,urlsDone,extensions,recurse,deep+1,timeout);
164                                    
165                                    childIndexer.add(ci);
166                                    ci.start();
167                        }
168                        catch(Throwable t) {
169                            //print.printST(t);
170                        }
171                    }
172                }
173                    
174                    if(childIndexer!=null && !childIndexer.isEmpty()){
175                            Iterator it = childIndexer.iterator();
176                            while(it.hasNext()) {
177                                    ci=(ChildrenIndexer) it.next();
178                                    if(ci.isAlive()) {
179                                            try {
180                                                    ci.join(timeout);
181                                                    
182                                            } 
183                                            catch (InterruptedException e) {
184                                                    //print.printST(e);
185                                            }
186                                    }
187                                    // timeout exceptionif(ci.isAlive()) throw new IOException("timeout occur while invoking page ["+ci.url+"]");
188                                    
189                                    if(ci.isAlive()){
190                                            ci.interrupt();
191                                            Config config = ThreadLocalPageContext.getConfig();
192                                            SystemOut.printDate(config!=null?config.getErrWriter():new PrintWriter(System.err),"timeout ["+timeout+" ms] occur while invoking page ["+ci.url+"]");
193                                    }
194                            }
195                            
196                            //print.out("exe child");
197                            it = childIndexer.iterator();
198                            while(it.hasNext()) {
199                                    ci=(ChildrenIndexer) it.next();
200                                    //print.out("exec-child:"+ci.url);
201                                    //print.out(content);
202                                    if(ci.content!=null)_parseChildren(log,ci.content,writer, root, ci.url, urlsDone, extensions, recurse, deep,timeout);
203                            }
204                            
205                    }
206                    
207                    
208                    urls.clear();
209            }
210            //print.out("end:"+base);
211        }
212        
213    
214    
215        /*protected static void _sssparse(IndexWriter writer, String root, URL current, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException  {
216            current=translateURL(current);
217            print.out("start:"+current);
218            if(urlsDone.contains(current.toExternalForm())) return;
219            
220            HttpMethod method = HTTPUtil.invoke(current, null, null, -1, null, "RailoBot", null, -1, null, null, null);
221            StringBuffer content=new StringBuffer();
222            Document doc = DocumentUtil.toDocument(content,root,current, method);
223            
224            urlsDone.add(current.toExternalForm());
225            if(doc==null) return;
226            if(writer!=null)writer.addDocument(doc);
227            
228            
229            if(recurse) {
230                List urls = htmlUtil.getURLS(content.toString(),current);
231                    
232                // loop through all children
233                int len=urls.size();
234                List childIndexer=len>1?new ArrayList():null;
235                ChildrenIndexer ci;
236                    for(int i=0;i<len;i++) {
237                    URL url=(URL) urls.get(i);
238                    String protocol=url.getProtocol().toLowerCase();
239                    String file=url.getPath();
240                    if((protocol.equals("http") || protocol.equals("https")) && validExtension(extensions,file) &&
241                       current.getHost().equalsIgnoreCase(url.getHost())) {
242                            
243                            //_parse(writer,root,url,urlsDone,extensions,recurse,deep+1);
244                            
245                        try {
246                            if(len==1 || true)_parse(writer,root,url,urlsDone,extensions,recurse,deep+1,timeout);
247                            else {
248                                    ci=new ChildrenIndexer(writer,root,url,urlsDone,extensions,recurse,deep+1);
249                                    ci.start();
250                                    childIndexer.add(ci);
251                            }
252                        }
253                        catch(Throwable t) {
254                        }
255                    }
256                }
257                    
258                    if(!childIndexer.isEmpty()){
259                            Iterator it = childIndexer.iterator();
260                            while(it.hasNext()) {
261                                    ci=(ChildrenIndexer) it.next();
262                                    if(ci.isAlive()) {
263                                            try {
264                                                    ci.join(20*1000);
265                                            } 
266                                            catch (InterruptedException e) {}
267                                    }
268                            }
269                    }
270                    
271                    
272                    urls.clear();
273            }
274            print.out("end:"+current);
275        }*/
276        
277        
278    
279    
280            private static boolean validExtension(String[] extensions, String file) {
281                    
282                    String ext = ResourceUtil.getExtension(file,"");
283                    ext=railo.runtime.type.util.ListUtil.first(ext,"/",true);
284                    
285                    if(StringUtil.isEmpty(ext))return true;
286                    for(int i=0;i<extensions.length;i++){
287                            if(ext.equalsIgnoreCase(extensions[i]))return true;
288                    }
289                    return false;
290            }
291    
292    
293        private static void info(Log log,String doc) {
294                    if(log==null) return;
295                    log.info("Webcrawler", "invoke "+doc);
296            }
297    
298        private static void error(Log log,String doc, Exception e) {
299                    if(log==null) return;
300                    log.error("Webcrawler", "invoke "+doc+":"+e.getMessage());
301            }
302    }
303    
304    
305    class ChildrenIndexer extends Thread {
306            protected IndexWriter writer;
307            protected String root;
308            protected URL url;
309            protected List urlsDone;
310            protected String[] extensions;
311            protected boolean recurse;
312            protected int deep;
313            protected StringBuffer content;
314            private long timeout;
315            private Log log;
316    
317            public ChildrenIndexer(Log log,IndexWriter writer, String root, URL url,List urlsDone, String[] extensions,boolean recurse, int deep,long timeout) {
318                    this.writer=writer;
319                    this.root=root;
320                    this.url=url;
321                    this.urlsDone=urlsDone;
322                    this.extensions=extensions;
323                    this.recurse=recurse;
324                    this.deep=deep;
325                    this.timeout=timeout;
326                    this.log=log;
327            }
328    
329            public void run(){
330                    try {
331                            //WebCrawler._parse(writer, root, url, urlsDone, extensions, recurse, deep);
332                            
333                            this.content=WebCrawler._parseItem(log,writer, root, url, urlsDone, extensions, recurse, deep,timeout+1);
334                            
335                    } catch (IOException e) {}
336            }
337            
338            
339    }