001 package railo.runtime.search.lucene2.net; 002 003 import java.io.IOException; 004 import java.io.PrintWriter; 005 import java.net.MalformedURLException; 006 import java.net.URL; 007 import java.util.ArrayList; 008 import java.util.Iterator; 009 import java.util.List; 010 011 import org.apache.lucene.document.Document; 012 import org.apache.lucene.index.IndexWriter; 013 014 import railo.commons.io.log.Log; 015 import railo.commons.io.log.LogAndSource; 016 import railo.commons.io.res.util.ResourceUtil; 017 import railo.commons.lang.HTMLUtil; 018 import railo.commons.lang.StringUtil; 019 import railo.commons.lang.SystemOut; 020 import railo.commons.net.HTTPUtil; 021 import railo.commons.net.http.HTTPEngine; 022 import railo.commons.net.http.HTTPResponse; 023 import railo.runtime.config.Config; 024 import railo.runtime.engine.ThreadLocalPageContext; 025 import railo.runtime.search.lucene2.DocumentUtil; 026 import railo.runtime.tag.Index; 027 import railo.runtime.type.util.ArrayUtil; 028 029 /** 030 * 031 */ 032 public final class WebCrawler { 033 034 private static HTMLUtil htmlUtil=new HTMLUtil(); 035 private LogAndSource log; 036 037 038 039 public WebCrawler(LogAndSource log) { 040 this.log=log; 041 } 042 043 044 045 public void parse(IndexWriter writer, URL current, String[] extensions, boolean recurse, long timeout) throws IOException { 046 translateExtension(extensions); 047 if(ArrayUtil.isEmpty(extensions))extensions=Index.EXTENSIONS; 048 _parse(log,writer,null,current,new ArrayList(), extensions,recurse,0,timeout); 049 } 050 051 052 private static URL translateURL(URL url) throws MalformedURLException { 053 054 055 //print.out(url.toExternalForm()); 056 String path=url.getPath(); 057 int dotIndex = path.lastIndexOf('.'); 058 // no dot 059 if(dotIndex==-1){ 060 if(path.endsWith("/")) return HTTPUtil.removeRef(url); 061 062 063 return HTTPUtil.removeRef(new URL( 064 url.getProtocol(), 065 url.getHost(), 066 url.getPort(), 067 path+"/"+StringUtil.emptyIfNull(url.getQuery()))); 068 } 069 //print.out("rem:"+HTTPUtil.removeRef(url)); 070 return HTTPUtil.removeRef(url); 071 } 072 073 074 private void translateExtension(String[] extensions) { 075 for(int i=0;i<extensions.length;i++){ 076 if(extensions[i].startsWith("*."))extensions[i]=extensions[i].substring(2); 077 else if(extensions[i].startsWith("."))extensions[i]=extensions[i].substring(1); 078 } 079 } 080 081 082 083 084 /** 085 * @param writer 086 * @param current 087 * @param content 088 * @throws IOException 089 */ 090 091 private static Document toDocument(StringBuffer content,IndexWriter writer, String root, URL current,long timeout) throws IOException { 092 HTTPResponse rsp = HTTPEngine.get(current, null, null, timeout,HTTPEngine.MAX_REDIRECT, null, "RailoBot", null, null); 093 Document doc = DocumentUtil.toDocument(content,root,current, rsp); 094 095 return doc; 096 } 097 098 protected static void _parse(Log log,IndexWriter writer, String root, URL current, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException { 099 100 StringBuffer content = _parseItem(log,writer, root, current, urlsDone, extensions, recurse, deep,timeout); 101 if(content!=null)_parseChildren(log,content,writer, root, current, urlsDone, extensions, recurse, deep,timeout); 102 } 103 104 public static StringBuffer _parseItem(Log log,IndexWriter writer, String root, URL url, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException{ 105 try{ 106 url=translateURL(url); 107 if(urlsDone.contains(url.toExternalForm())) return null; 108 urlsDone.add(url.toExternalForm()); 109 110 StringBuffer content=new StringBuffer(); 111 Document doc=toDocument(content,writer, root, url,timeout); 112 113 if(doc==null) return null; 114 if(writer!=null)writer.addDocument(doc); 115 116 // Test 117 /*Resource dir = ResourcesImpl.getFileResourceProvider().getResource("/Users/mic/Temp/leeway3/"); 118 if(!dir.isDirectory())dir.mkdirs(); 119 Resource file=dir.getRealResource(url.toExternalForm().replace("/", "_")); 120 IOUtil.write(file, content.toString(), "UTF-8", false);*/ 121 122 info(log,url.toExternalForm()); 123 return content; 124 } 125 catch(IOException ioe){ 126 error(log,url.toExternalForm(),ioe); 127 throw ioe; 128 } 129 } 130 131 132 133 protected static void _parseChildren(Log log,StringBuffer content,IndexWriter writer, String root, URL base, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException { 134 135 136 137 if(recurse) { 138 List urls = htmlUtil.getURLS(content.toString(),base); 139 140 // loop through all children 141 int len=urls.size(); 142 List childIndexer=len>1?new ArrayList():null; 143 ChildrenIndexer ci; 144 //print.out("getting content"); 145 146 for(int i=0;i<len;i++) { 147 URL url=(URL) urls.get(i); 148 /*if(url.toExternalForm().indexOf("80")!=-1){ 149 SystemOut.printDate("base:"+base); 150 SystemOut.printDate("url:"+url); 151 }*/ 152 153 url=translateURL(url); 154 155 if(urlsDone.contains(url.toExternalForm())) continue; 156 //urlsDone.add(url.toExternalForm()); 157 158 String protocol=url.getProtocol().toLowerCase(); 159 String file=url.getPath(); 160 if((protocol.equals("http") || protocol.equals("https")) && validExtension(extensions,file) && 161 base.getHost().equalsIgnoreCase(url.getHost())) { 162 try { 163 ci=new ChildrenIndexer(log,writer,root,url,urlsDone,extensions,recurse,deep+1,timeout); 164 165 childIndexer.add(ci); 166 ci.start(); 167 } 168 catch(Throwable t) { 169 //print.printST(t); 170 } 171 } 172 } 173 174 if(childIndexer!=null && !childIndexer.isEmpty()){ 175 Iterator it = childIndexer.iterator(); 176 while(it.hasNext()) { 177 ci=(ChildrenIndexer) it.next(); 178 if(ci.isAlive()) { 179 try { 180 ci.join(timeout); 181 182 } 183 catch (InterruptedException e) { 184 //print.printST(e); 185 } 186 } 187 // timeout exceptionif(ci.isAlive()) throw new IOException("timeout occur while invoking page ["+ci.url+"]"); 188 189 if(ci.isAlive()){ 190 ci.interrupt(); 191 Config config = ThreadLocalPageContext.getConfig(); 192 SystemOut.printDate(config!=null?config.getErrWriter():new PrintWriter(System.err),"timeout ["+timeout+" ms] occur while invoking page ["+ci.url+"]"); 193 } 194 } 195 196 //print.out("exe child"); 197 it = childIndexer.iterator(); 198 while(it.hasNext()) { 199 ci=(ChildrenIndexer) it.next(); 200 //print.out("exec-child:"+ci.url); 201 //print.out(content); 202 if(ci.content!=null)_parseChildren(log,ci.content,writer, root, ci.url, urlsDone, extensions, recurse, deep,timeout); 203 } 204 205 } 206 207 208 urls.clear(); 209 } 210 //print.out("end:"+base); 211 } 212 213 214 215 /*protected static void _sssparse(IndexWriter writer, String root, URL current, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException { 216 current=translateURL(current); 217 print.out("start:"+current); 218 if(urlsDone.contains(current.toExternalForm())) return; 219 220 HttpMethod method = HTTPUtil.invoke(current, null, null, -1, null, "RailoBot", null, -1, null, null, null); 221 StringBuffer content=new StringBuffer(); 222 Document doc = DocumentUtil.toDocument(content,root,current, method); 223 224 urlsDone.add(current.toExternalForm()); 225 if(doc==null) return; 226 if(writer!=null)writer.addDocument(doc); 227 228 229 if(recurse) { 230 List urls = htmlUtil.getURLS(content.toString(),current); 231 232 // loop through all children 233 int len=urls.size(); 234 List childIndexer=len>1?new ArrayList():null; 235 ChildrenIndexer ci; 236 for(int i=0;i<len;i++) { 237 URL url=(URL) urls.get(i); 238 String protocol=url.getProtocol().toLowerCase(); 239 String file=url.getPath(); 240 if((protocol.equals("http") || protocol.equals("https")) && validExtension(extensions,file) && 241 current.getHost().equalsIgnoreCase(url.getHost())) { 242 243 //_parse(writer,root,url,urlsDone,extensions,recurse,deep+1); 244 245 try { 246 if(len==1 || true)_parse(writer,root,url,urlsDone,extensions,recurse,deep+1,timeout); 247 else { 248 ci=new ChildrenIndexer(writer,root,url,urlsDone,extensions,recurse,deep+1); 249 ci.start(); 250 childIndexer.add(ci); 251 } 252 } 253 catch(Throwable t) { 254 } 255 } 256 } 257 258 if(!childIndexer.isEmpty()){ 259 Iterator it = childIndexer.iterator(); 260 while(it.hasNext()) { 261 ci=(ChildrenIndexer) it.next(); 262 if(ci.isAlive()) { 263 try { 264 ci.join(20*1000); 265 } 266 catch (InterruptedException e) {} 267 } 268 } 269 } 270 271 272 urls.clear(); 273 } 274 print.out("end:"+current); 275 }*/ 276 277 278 279 280 private static boolean validExtension(String[] extensions, String file) { 281 282 String ext = ResourceUtil.getExtension(file,""); 283 ext=railo.runtime.type.util.ListUtil.first(ext,"/",true); 284 285 if(StringUtil.isEmpty(ext))return true; 286 for(int i=0;i<extensions.length;i++){ 287 if(ext.equalsIgnoreCase(extensions[i]))return true; 288 } 289 return false; 290 } 291 292 293 private static void info(Log log,String doc) { 294 if(log==null) return; 295 log.info("Webcrawler", "invoke "+doc); 296 } 297 298 private static void error(Log log,String doc, Exception e) { 299 if(log==null) return; 300 log.error("Webcrawler", "invoke "+doc+":"+e.getMessage()); 301 } 302 } 303 304 305 class ChildrenIndexer extends Thread { 306 protected IndexWriter writer; 307 protected String root; 308 protected URL url; 309 protected List urlsDone; 310 protected String[] extensions; 311 protected boolean recurse; 312 protected int deep; 313 protected StringBuffer content; 314 private long timeout; 315 private Log log; 316 317 public ChildrenIndexer(Log log,IndexWriter writer, String root, URL url,List urlsDone, String[] extensions,boolean recurse, int deep,long timeout) { 318 this.writer=writer; 319 this.root=root; 320 this.url=url; 321 this.urlsDone=urlsDone; 322 this.extensions=extensions; 323 this.recurse=recurse; 324 this.deep=deep; 325 this.timeout=timeout; 326 this.log=log; 327 } 328 329 public void run(){ 330 try { 331 //WebCrawler._parse(writer, root, url, urlsDone, extensions, recurse, deep); 332 333 this.content=WebCrawler._parseItem(log,writer, root, url, urlsDone, extensions, recurse, deep,timeout+1); 334 335 } catch (IOException e) {} 336 } 337 338 339 }