001 package railo.runtime.search.lucene2.net; 002 003 import java.io.IOException; 004 import java.io.PrintWriter; 005 import java.net.MalformedURLException; 006 import java.net.URL; 007 import java.util.ArrayList; 008 import java.util.Iterator; 009 import java.util.List; 010 011 import org.apache.commons.httpclient.HttpMethod; 012 import org.apache.lucene.document.Document; 013 import org.apache.lucene.index.IndexWriter; 014 015 import railo.commons.io.log.Log; 016 import railo.commons.io.log.LogAndSource; 017 import railo.commons.io.res.util.ResourceUtil; 018 import railo.commons.lang.HTMLUtil; 019 import railo.commons.lang.StringUtil; 020 import railo.commons.lang.SystemOut; 021 import railo.commons.net.HTTPUtil; 022 import railo.runtime.config.Config; 023 import railo.runtime.engine.ThreadLocalPageContext; 024 import railo.runtime.search.lucene2.DocumentUtil; 025 import railo.runtime.tag.Index; 026 import railo.runtime.type.util.ArrayUtil; 027 028 /** 029 * 030 */ 031 public final class WebCrawler { 032 033 private static HTMLUtil htmlUtil=new HTMLUtil(); 034 private LogAndSource log; 035 036 037 038 public WebCrawler(LogAndSource log) { 039 this.log=log; 040 } 041 042 043 044 public void parse(IndexWriter writer, URL current, String[] extensions, boolean recurse, long timeout) throws IOException { 045 translateExtension(extensions); 046 if(ArrayUtil.isEmpty(extensions))extensions=Index.EXTENSIONS; 047 _parse(log,writer,null,current,new ArrayList(), extensions,recurse,0,timeout); 048 } 049 050 051 private static URL translateURL(URL url) throws MalformedURLException { 052 053 054 //print.out(url.toExternalForm()); 055 String path=url.getPath(); 056 int dotIndex = path.lastIndexOf('.'); 057 // no dot 058 if(dotIndex==-1){ 059 if(path.endsWith("/")) return HTTPUtil.removeRef(url); 060 061 062 return HTTPUtil.removeRef(new URL( 063 url.getProtocol(), 064 url.getHost(), 065 url.getPort(), 066 path+"/"+StringUtil.emptyIfNull(url.getQuery()))); 067 } 068 //print.out("rem:"+HTTPUtil.removeRef(url)); 069 return HTTPUtil.removeRef(url); 070 } 071 072 073 private void translateExtension(String[] extensions) { 074 for(int i=0;i<extensions.length;i++){ 075 if(extensions[i].startsWith("*."))extensions[i]=extensions[i].substring(2); 076 else if(extensions[i].startsWith("."))extensions[i]=extensions[i].substring(1); 077 } 078 } 079 080 081 082 083 /** 084 * @param writer 085 * @param current 086 * @param content 087 * @throws IOException 088 */ 089 090 private static Document toDocument(StringBuffer content,IndexWriter writer, String root, URL current,long timeout) throws IOException { 091 HttpMethod method = HTTPUtil.invoke(current, null, null, timeout, null, "RailoBot", null, -1, null, null, null); 092 Document doc = DocumentUtil.toDocument(content,root,current, method); 093 094 return doc; 095 } 096 097 protected static void _parse(Log log,IndexWriter writer, String root, URL current, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException { 098 099 StringBuffer content = _parseItem(log,writer, root, current, urlsDone, extensions, recurse, deep,timeout); 100 if(content!=null)_parseChildren(log,content,writer, root, current, urlsDone, extensions, recurse, deep,timeout); 101 } 102 103 public static StringBuffer _parseItem(Log log,IndexWriter writer, String root, URL url, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException{ 104 try{ 105 url=translateURL(url); 106 if(urlsDone.contains(url.toExternalForm())) return null; 107 urlsDone.add(url.toExternalForm()); 108 109 StringBuffer content=new StringBuffer(); 110 Document doc=toDocument(content,writer, root, url,timeout); 111 112 if(doc==null) return null; 113 if(writer!=null)writer.addDocument(doc); 114 115 // Test 116 /*Resource dir = ResourcesImpl.getFileResourceProvider().getResource("/Users/mic/Temp/leeway3/"); 117 if(!dir.isDirectory())dir.mkdirs(); 118 Resource file=dir.getRealResource(url.toExternalForm().replace("/", "_")); 119 IOUtil.write(file, content.toString(), "UTF-8", false);*/ 120 121 info(log,url.toExternalForm()); 122 return content; 123 } 124 catch(IOException ioe){ 125 error(log,url.toExternalForm(),ioe); 126 throw ioe; 127 } 128 } 129 130 131 132 protected static void _parseChildren(Log log,StringBuffer content,IndexWriter writer, String root, URL base, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException { 133 134 135 136 if(recurse) { 137 List urls = htmlUtil.getURLS(content.toString(),base); 138 139 // loop through all children 140 int len=urls.size(); 141 List childIndexer=len>1?new ArrayList():null; 142 ChildrenIndexer ci; 143 //print.out("getting content"); 144 145 for(int i=0;i<len;i++) { 146 URL url=(URL) urls.get(i); 147 /*if(url.toExternalForm().indexOf("80")!=-1){ 148 SystemOut.printDate("base:"+base); 149 SystemOut.printDate("url:"+url); 150 }*/ 151 152 url=translateURL(url); 153 154 if(urlsDone.contains(url.toExternalForm())) continue; 155 //urlsDone.add(url.toExternalForm()); 156 157 String protocol=url.getProtocol().toLowerCase(); 158 String file=url.getPath(); 159 if((protocol.equals("http") || protocol.equals("https")) && validExtension(extensions,file) && 160 base.getHost().equalsIgnoreCase(url.getHost())) { 161 try { 162 ci=new ChildrenIndexer(log,writer,root,url,urlsDone,extensions,recurse,deep+1,timeout); 163 164 childIndexer.add(ci); 165 ci.start(); 166 } 167 catch(Throwable t) { 168 //print.printST(t); 169 } 170 } 171 } 172 173 if(childIndexer!=null && !childIndexer.isEmpty()){ 174 Iterator it = childIndexer.iterator(); 175 while(it.hasNext()) { 176 ci=(ChildrenIndexer) it.next(); 177 if(ci.isAlive()) { 178 try { 179 ci.join(timeout); 180 181 } 182 catch (InterruptedException e) { 183 //print.printST(e); 184 } 185 } 186 // timeout exceptionif(ci.isAlive()) throw new IOException("timeout occur while invoking page ["+ci.url+"]"); 187 188 if(ci.isAlive()){ 189 ci.interrupt(); 190 Config config = ThreadLocalPageContext.getConfig(); 191 SystemOut.printDate(config!=null?config.getErrWriter():new PrintWriter(System.err),"timeout ["+timeout+" ms] occur while invoking page ["+ci.url+"]"); 192 } 193 } 194 195 //print.out("exe child"); 196 it = childIndexer.iterator(); 197 while(it.hasNext()) { 198 ci=(ChildrenIndexer) it.next(); 199 //print.out("exec-child:"+ci.url); 200 //print.out(content); 201 if(ci.content!=null)_parseChildren(log,ci.content,writer, root, ci.url, urlsDone, extensions, recurse, deep,timeout); 202 } 203 204 } 205 206 207 urls.clear(); 208 } 209 //print.out("end:"+base); 210 } 211 212 213 214 /*protected static void _sssparse(IndexWriter writer, String root, URL current, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException { 215 current=translateURL(current); 216 print.out("start:"+current); 217 if(urlsDone.contains(current.toExternalForm())) return; 218 219 HttpMethod method = HTTPUtil.invoke(current, null, null, -1, null, "RailoBot", null, -1, null, null, null); 220 StringBuffer content=new StringBuffer(); 221 Document doc = DocumentUtil.toDocument(content,root,current, method); 222 223 urlsDone.add(current.toExternalForm()); 224 if(doc==null) return; 225 if(writer!=null)writer.addDocument(doc); 226 227 228 if(recurse) { 229 List urls = htmlUtil.getURLS(content.toString(),current); 230 231 // loop through all children 232 int len=urls.size(); 233 List childIndexer=len>1?new ArrayList():null; 234 ChildrenIndexer ci; 235 for(int i=0;i<len;i++) { 236 URL url=(URL) urls.get(i); 237 String protocol=url.getProtocol().toLowerCase(); 238 String file=url.getPath(); 239 if((protocol.equals("http") || protocol.equals("https")) && validExtension(extensions,file) && 240 current.getHost().equalsIgnoreCase(url.getHost())) { 241 242 //_parse(writer,root,url,urlsDone,extensions,recurse,deep+1); 243 244 try { 245 if(len==1 || true)_parse(writer,root,url,urlsDone,extensions,recurse,deep+1,timeout); 246 else { 247 ci=new ChildrenIndexer(writer,root,url,urlsDone,extensions,recurse,deep+1); 248 ci.start(); 249 childIndexer.add(ci); 250 } 251 } 252 catch(Throwable t) { 253 } 254 } 255 } 256 257 if(!childIndexer.isEmpty()){ 258 Iterator it = childIndexer.iterator(); 259 while(it.hasNext()) { 260 ci=(ChildrenIndexer) it.next(); 261 if(ci.isAlive()) { 262 try { 263 ci.join(20*1000); 264 } 265 catch (InterruptedException e) {} 266 } 267 } 268 } 269 270 271 urls.clear(); 272 } 273 print.out("end:"+current); 274 }*/ 275 276 277 278 279 private static boolean validExtension(String[] extensions, String file) { 280 281 String ext = ResourceUtil.getExtension(file,""); 282 ext=railo.runtime.type.List.first(ext,"/",true); 283 284 if(StringUtil.isEmpty(ext))return true; 285 for(int i=0;i<extensions.length;i++){ 286 if(ext.equalsIgnoreCase(extensions[i]))return true; 287 } 288 return false; 289 } 290 291 292 private static void info(Log log,String doc) { 293 if(log==null) return; 294 log.info("Webcrawler", "invoke "+doc); 295 } 296 297 private static void error(Log log,String doc, Exception e) { 298 if(log==null) return; 299 log.error("Webcrawler", "invoke "+doc+":"+e.getMessage()); 300 } 301 } 302 303 304 class ChildrenIndexer extends Thread { 305 protected IndexWriter writer; 306 protected String root; 307 protected URL url; 308 protected List urlsDone; 309 protected String[] extensions; 310 protected boolean recurse; 311 protected int deep; 312 protected StringBuffer content; 313 private long timeout; 314 private Log log; 315 316 public ChildrenIndexer(Log log,IndexWriter writer, String root, URL url,List urlsDone, String[] extensions,boolean recurse, int deep,long timeout) { 317 this.writer=writer; 318 this.root=root; 319 this.url=url; 320 this.urlsDone=urlsDone; 321 this.extensions=extensions; 322 this.recurse=recurse; 323 this.deep=deep; 324 this.timeout=timeout; 325 this.log=log; 326 } 327 328 public void run(){ 329 try { 330 //WebCrawler._parse(writer, root, url, urlsDone, extensions, recurse, deep); 331 332 this.content=WebCrawler._parseItem(log,writer, root, url, urlsDone, extensions, recurse, deep,timeout+1); 333 334 } catch (IOException e) {} 335 } 336 337 338 }