001    package railo.runtime.search.lucene2.net;
002    
003    import java.io.IOException;
004    import java.io.PrintWriter;
005    import java.net.MalformedURLException;
006    import java.net.URL;
007    import java.util.ArrayList;
008    import java.util.Iterator;
009    import java.util.List;
010    
011    import org.apache.commons.httpclient.HttpMethod;
012    import org.apache.lucene.document.Document;
013    import org.apache.lucene.index.IndexWriter;
014    
015    import railo.commons.io.log.Log;
016    import railo.commons.io.log.LogAndSource;
017    import railo.commons.io.res.util.ResourceUtil;
018    import railo.commons.lang.HTMLUtil;
019    import railo.commons.lang.StringUtil;
020    import railo.commons.lang.SystemOut;
021    import railo.commons.net.HTTPUtil;
022    import railo.runtime.config.Config;
023    import railo.runtime.engine.ThreadLocalPageContext;
024    import railo.runtime.search.lucene2.DocumentUtil;
025    import railo.runtime.tag.Index;
026    import railo.runtime.type.util.ArrayUtil;
027    
028    /**
029     * 
030     */
031    public final class WebCrawler {
032        
033        private static HTMLUtil htmlUtil=new HTMLUtil();
034            private LogAndSource log;
035            
036        
037        
038        public WebCrawler(LogAndSource log) {
039            this.log=log;
040            }
041    
042            
043        
044        public void parse(IndexWriter writer, URL current, String[] extensions, boolean recurse, long timeout) throws IOException {
045            translateExtension(extensions);
046            if(ArrayUtil.isEmpty(extensions))extensions=Index.EXTENSIONS;
047            _parse(log,writer,null,current,new ArrayList(), extensions,recurse,0,timeout);
048        }
049        
050    
051            private static URL translateURL(URL url) throws MalformedURLException {
052                    
053                    
054                    //print.out(url.toExternalForm());
055                    String path=url.getPath();
056                    int dotIndex = path.lastIndexOf('.');
057                    // no dot
058                    if(dotIndex==-1){
059                            if(path.endsWith("/")) return HTTPUtil.removeRef(url);
060                            
061                            
062                            return HTTPUtil.removeRef(new URL(
063                                            url.getProtocol(),
064                                            url.getHost(),
065                                            url.getPort(),
066                                            path+"/"+StringUtil.emptyIfNull(url.getQuery())));
067                    }
068                    //print.out("rem:"+HTTPUtil.removeRef(url));
069                    return HTTPUtil.removeRef(url);
070            }   
071        
072    
073        private void translateExtension(String[] extensions) {
074                    for(int i=0;i<extensions.length;i++){
075                            if(extensions[i].startsWith("*."))extensions[i]=extensions[i].substring(2);
076                            else if(extensions[i].startsWith("."))extensions[i]=extensions[i].substring(1);
077                    }
078            }
079    
080    
081            
082    
083            /**
084         * @param writer
085         * @param current
086             * @param content 
087             * @throws IOException 
088         */
089    
090        private static Document toDocument(StringBuffer content,IndexWriter writer, String root, URL current,long timeout) throws IOException {
091            HttpMethod method = HTTPUtil.invoke(current, null, null, timeout, null, "RailoBot", null, -1, null, null, null);
092            Document doc = DocumentUtil.toDocument(content,root,current, method);
093            
094                    return doc;
095            }
096    
097        protected static void _parse(Log log,IndexWriter writer, String root, URL current, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException  {
098            
099            StringBuffer content = _parseItem(log,writer, root, current, urlsDone, extensions, recurse, deep,timeout);
100            if(content!=null)_parseChildren(log,content,writer, root, current, urlsDone, extensions, recurse, deep,timeout);
101        }
102        
103        public static StringBuffer _parseItem(Log log,IndexWriter writer, String root, URL url, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException{
104            try{
105                    url=translateURL(url);
106                    if(urlsDone.contains(url.toExternalForm())) return null;
107                    urlsDone.add(url.toExternalForm());
108                    
109                    StringBuffer content=new StringBuffer();            
110                    Document doc=toDocument(content,writer, root, url,timeout);
111                    
112                    if(doc==null) return null;
113                    if(writer!=null)writer.addDocument(doc);
114                    
115                    // Test
116                    /*Resource dir = ResourcesImpl.getFileResourceProvider().getResource("/Users/mic/Temp/leeway3/");
117                    if(!dir.isDirectory())dir.mkdirs();
118                    Resource file=dir.getRealResource(url.toExternalForm().replace("/", "_"));
119                    IOUtil.write(file, content.toString(), "UTF-8", false);*/
120                    
121                    info(log,url.toExternalForm());
122                    return content;
123            }
124            catch(IOException ioe){
125                    error(log,url.toExternalForm(),ioe);
126                    throw ioe;
127            }
128        }
129        
130    
131    
132            protected static void _parseChildren(Log log,StringBuffer content,IndexWriter writer, String root, URL base, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException  {
133            
134    
135                    
136            if(recurse) {
137                List urls = htmlUtil.getURLS(content.toString(),base);
138                    
139                // loop through all children
140                int len=urls.size();
141                List childIndexer=len>1?new ArrayList():null;
142                ChildrenIndexer ci;
143                //print.out("getting content");
144                
145                    for(int i=0;i<len;i++) {
146                    URL url=(URL) urls.get(i);
147                    /*if(url.toExternalForm().indexOf("80")!=-1){
148                                    SystemOut.printDate("base:"+base);
149                                    SystemOut.printDate("url:"+url);
150                            }*/
151                    
152                    url=translateURL(url);
153                   
154                    if(urlsDone.contains(url.toExternalForm())) continue;
155                    //urlsDone.add(url.toExternalForm());
156                    
157                    String protocol=url.getProtocol().toLowerCase();
158                    String file=url.getPath();
159                    if((protocol.equals("http") || protocol.equals("https")) && validExtension(extensions,file) &&
160                       base.getHost().equalsIgnoreCase(url.getHost())) {
161                            try {
162                                    ci=new ChildrenIndexer(log,writer,root,url,urlsDone,extensions,recurse,deep+1,timeout);
163                                    
164                                    childIndexer.add(ci);
165                                    ci.start();
166                        }
167                        catch(Throwable t) {
168                            //print.printST(t);
169                        }
170                    }
171                }
172                    
173                    if(childIndexer!=null && !childIndexer.isEmpty()){
174                            Iterator it = childIndexer.iterator();
175                            while(it.hasNext()) {
176                                    ci=(ChildrenIndexer) it.next();
177                                    if(ci.isAlive()) {
178                                            try {
179                                                    ci.join(timeout);
180                                                    
181                                            } 
182                                            catch (InterruptedException e) {
183                                                    //print.printST(e);
184                                            }
185                                    }
186                                    // timeout exceptionif(ci.isAlive()) throw new IOException("timeout occur while invoking page ["+ci.url+"]");
187                                    
188                                    if(ci.isAlive()){
189                                            ci.interrupt();
190                                            Config config = ThreadLocalPageContext.getConfig();
191                                            SystemOut.printDate(config!=null?config.getErrWriter():new PrintWriter(System.err),"timeout ["+timeout+" ms] occur while invoking page ["+ci.url+"]");
192                                    }
193                            }
194                            
195                            //print.out("exe child");
196                            it = childIndexer.iterator();
197                            while(it.hasNext()) {
198                                    ci=(ChildrenIndexer) it.next();
199                                    //print.out("exec-child:"+ci.url);
200                                    //print.out(content);
201                                    if(ci.content!=null)_parseChildren(log,ci.content,writer, root, ci.url, urlsDone, extensions, recurse, deep,timeout);
202                            }
203                            
204                    }
205                    
206                    
207                    urls.clear();
208            }
209            //print.out("end:"+base);
210        }
211        
212    
213    
214        /*protected static void _sssparse(IndexWriter writer, String root, URL current, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException  {
215            current=translateURL(current);
216            print.out("start:"+current);
217            if(urlsDone.contains(current.toExternalForm())) return;
218            
219            HttpMethod method = HTTPUtil.invoke(current, null, null, -1, null, "RailoBot", null, -1, null, null, null);
220            StringBuffer content=new StringBuffer();
221            Document doc = DocumentUtil.toDocument(content,root,current, method);
222            
223            urlsDone.add(current.toExternalForm());
224            if(doc==null) return;
225            if(writer!=null)writer.addDocument(doc);
226            
227            
228            if(recurse) {
229                List urls = htmlUtil.getURLS(content.toString(),current);
230                    
231                // loop through all children
232                int len=urls.size();
233                List childIndexer=len>1?new ArrayList():null;
234                ChildrenIndexer ci;
235                    for(int i=0;i<len;i++) {
236                    URL url=(URL) urls.get(i);
237                    String protocol=url.getProtocol().toLowerCase();
238                    String file=url.getPath();
239                    if((protocol.equals("http") || protocol.equals("https")) && validExtension(extensions,file) &&
240                       current.getHost().equalsIgnoreCase(url.getHost())) {
241                            
242                            //_parse(writer,root,url,urlsDone,extensions,recurse,deep+1);
243                            
244                        try {
245                            if(len==1 || true)_parse(writer,root,url,urlsDone,extensions,recurse,deep+1,timeout);
246                            else {
247                                    ci=new ChildrenIndexer(writer,root,url,urlsDone,extensions,recurse,deep+1);
248                                    ci.start();
249                                    childIndexer.add(ci);
250                            }
251                        }
252                        catch(Throwable t) {
253                        }
254                    }
255                }
256                    
257                    if(!childIndexer.isEmpty()){
258                            Iterator it = childIndexer.iterator();
259                            while(it.hasNext()) {
260                                    ci=(ChildrenIndexer) it.next();
261                                    if(ci.isAlive()) {
262                                            try {
263                                                    ci.join(20*1000);
264                                            } 
265                                            catch (InterruptedException e) {}
266                                    }
267                            }
268                    }
269                    
270                    
271                    urls.clear();
272            }
273            print.out("end:"+current);
274        }*/
275        
276        
277    
278    
279            private static boolean validExtension(String[] extensions, String file) {
280                    
281                    String ext = ResourceUtil.getExtension(file,"");
282                    ext=railo.runtime.type.List.first(ext,"/",true);
283                    
284                    if(StringUtil.isEmpty(ext))return true;
285                    for(int i=0;i<extensions.length;i++){
286                            if(ext.equalsIgnoreCase(extensions[i]))return true;
287                    }
288                    return false;
289            }
290    
291    
292        private static void info(Log log,String doc) {
293                    if(log==null) return;
294                    log.info("Webcrawler", "invoke "+doc);
295            }
296    
297        private static void error(Log log,String doc, Exception e) {
298                    if(log==null) return;
299                    log.error("Webcrawler", "invoke "+doc+":"+e.getMessage());
300            }
301    }
302    
303    
304    class ChildrenIndexer extends Thread {
305            protected IndexWriter writer;
306            protected String root;
307            protected URL url;
308            protected List urlsDone;
309            protected String[] extensions;
310            protected boolean recurse;
311            protected int deep;
312            protected StringBuffer content;
313            private long timeout;
314            private Log log;
315    
316            public ChildrenIndexer(Log log,IndexWriter writer, String root, URL url,List urlsDone, String[] extensions,boolean recurse, int deep,long timeout) {
317                    this.writer=writer;
318                    this.root=root;
319                    this.url=url;
320                    this.urlsDone=urlsDone;
321                    this.extensions=extensions;
322                    this.recurse=recurse;
323                    this.deep=deep;
324                    this.timeout=timeout;
325                    this.log=log;
326            }
327    
328            public void run(){
329                    try {
330                            //WebCrawler._parse(writer, root, url, urlsDone, extensions, recurse, deep);
331                            
332                            this.content=WebCrawler._parseItem(log,writer, root, url, urlsDone, extensions, recurse, deep,timeout+1);
333                            
334                    } catch (IOException e) {}
335            }
336            
337            
338    }