/**
 *
 * Copyright (c) 2014, the Railo Company Ltd. All rights reserved.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package lucee.runtime.search.lucene2.net;

import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import lucee.commons.io.log.Log;
import lucee.commons.io.log.LogUtil;
import lucee.commons.io.res.util.ResourceUtil;
import lucee.commons.lang.ExceptionUtil;
import lucee.commons.lang.HTMLUtil;
import lucee.commons.lang.StringUtil;
import lucee.commons.lang.SystemOut;
import lucee.commons.net.HTTPUtil;
import lucee.commons.net.http.HTTPEngine;
import lucee.commons.net.http.HTTPResponse;
import lucee.runtime.config.Config;
import lucee.runtime.engine.ThreadLocalPageContext;
import lucee.runtime.search.lucene2.DocumentUtil;
import lucee.runtime.tag.Index;
import lucee.runtime.type.util.ArrayUtil;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;

/**
 * Fetches a page over HTTP, converts it to a Lucene {@link Document}, adds it to an
 * {@link IndexWriter} and, when recursion is requested, follows the links found in the
 * page that stay on the same host. The children of a page are fetched in parallel by
 * {@link ChildrenIndexer} threads.
 */
public final class WebCrawler {

    private static HTMLUtil htmlUtil=new HTMLUtil();
    private Log log;

    /**
     * @param log log used to report fetched pages and failures, may be null (no logging)
     */
    public WebCrawler(Log log) {
        this.log=log;
    }

    /**
     * Indexes the given URL and, if requested, the pages reachable from it.
     *
     * @param writer index the documents are added to
     * @param current URL to start with
     * @param extensions accepted file extensions ("cfm", ".cfm" and "*.cfm" are all accepted);
     *        when null or empty {@link Index#EXTENSIONS} is used
     * @param recurse whether links found in the fetched pages are followed
     * @param timeout timeout passed to the HTTP request and used when waiting for child threads
     * @throws IOException when fetching or indexing the start page fails
     */
    public void parse(IndexWriter writer, URL current, String[] extensions, boolean recurse, long timeout) throws IOException {
        if(extensions==null || ArrayUtil.isEmpty(extensions)) {
            extensions=Index.EXTENSIONS;
        }
        else {
            // normalize a copy, the array handed in by the caller is left untouched
            extensions=(String[])extensions.clone();
            translateExtension(extensions);
        }
        _parse(log,writer,null,current,new ArrayList(),extensions,recurse,0,timeout);
    }

    /**
     * Normalizes a URL so that equal pages produce the same key in the list of done URLs:
     * the ref is removed and a path without a dot that does not end with "/" gets a
     * trailing "/" (the query string, if any, is preserved).
     *
     * @param url URL to normalize
     * @return normalized URL
     * @throws MalformedURLException when the normalized URL cannot be created
     */
    private static URL translateURL(URL url) throws MalformedURLException {
        String path=url.getPath();

        // path contains a dot or already ends with a slash: only the ref is removed
        if(path.lastIndexOf('.')!=-1 || path.endsWith("/")) return HTTPUtil.removeRef(url);

        // URL.getQuery() returns the query without the leading "?", so it has to be re-added
        String query=url.getQuery();
        String file=StringUtil.isEmpty(query)?path+"/":path+"/?"+query;

        return HTTPUtil.removeRef(new URL(
                url.getProtocol(),
                url.getHost(),
                url.getPort(),
                file));
    }

    /**
     * Strips a leading "*." or "." from every extension, in place.
     *
     * @param extensions extensions to normalize, null elements are skipped
     */
    private void translateExtension(String[] extensions) {
        for(int i=0;i<extensions.length;i++){
            String ext=extensions[i];
            if(ext==null) continue;
            if(ext.startsWith("*.")) extensions[i]=ext.substring(2);
            else if(ext.startsWith(".")) extensions[i]=ext.substring(1);
        }
    }

    /**
     * Requests the given URL and converts the response to a Lucene document.
     *
     * @param content buffer handed to {@link DocumentUtil#toDocument}; the crawler later
     *        extracts the child links from it
     * @param writer not used by this method
     * @param root passed through to {@link DocumentUtil#toDocument}
     * @param current URL to request
     * @param timeout timeout passed to the HTTP request
     * @return the document created by {@link DocumentUtil#toDocument}
     * @throws IOException when the request or the conversion fails
     */
    private static Document toDocument(StringBuffer content,IndexWriter writer, String root, URL current,long timeout) throws IOException {
        HTTPResponse rsp = HTTPEngine.get(current, null, null, timeout,HTTPEngine.MAX_REDIRECT, null, "LuceeBot", null, null);
        return DocumentUtil.toDocument(content,root,current, rsp);
    }

    /**
     * Indexes a single page and then its children.
     *
     * @param log log to report to, may be null
     * @param writer index the documents are added to, may be null (nothing is added then)
     * @param root passed through to the document creation
     * @param current URL to index
     * @param urlsDone external forms (String) of the URLs already handled; shared with the child threads
     * @param extensions accepted file extensions
     * @param recurse whether links found in the page are followed
     * @param deep current depth
     * @param timeout timeout passed to the HTTP request and used when waiting for child threads
     * @throws IOException when fetching or indexing fails
     */
    protected static void _parse(Log log,IndexWriter writer, String root, URL current, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException {
        StringBuffer content = _parseItem(log,writer, root, current, urlsDone, extensions, recurse, deep,timeout);
        if(content!=null)_parseChildren(log,content,writer, root, current, urlsDone, extensions, recurse, deep,timeout);
    }

    /**
     * Fetches and indexes a single URL, unless it is already listed in urlsDone.
     *
     * @param log log to report to, may be null
     * @param writer index the document is added to, may be null (nothing is added then)
     * @param root passed through to the document creation
     * @param url URL to index
     * @param urlsDone external forms (String) of the URLs already handled; the URL is registered here
     * @param extensions not used by this method
     * @param recurse not used by this method
     * @param deep not used by this method
     * @param timeout timeout passed to the HTTP request
     * @return the content buffer filled while creating the document, or null when the URL was
     *         already handled or no document could be created
     * @throws IOException when fetching or indexing fails (the failure is logged first)
     */
    public static StringBuffer _parseItem(Log log,IndexWriter writer, String root, URL url, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException{
        try{
            url=translateURL(url);
            String key=url.toExternalForm();

            // check and registration must be one atomic step, several ChildrenIndexer
            // threads work on the same list
            synchronized(urlsDone) {
                if(urlsDone.contains(key)) return null;
                urlsDone.add(key);
            }

            StringBuffer content=new StringBuffer();
            Document doc=toDocument(content,writer, root, url,timeout);

            if(doc==null) return null;
            if(writer!=null)writer.addDocument(doc);

            info(log,key);
            return content;
        }
        catch(IOException ioe){
            error(log,url.toExternalForm(),ioe);
            throw ioe;
        }
    }

    /**
     * Follows the links found in the given page content. Every link that is not done yet,
     * uses http/https, has an accepted extension and points to the host of the base URL is
     * fetched by its own {@link ChildrenIndexer} thread; afterwards the children of every
     * successfully fetched page are handled the same way.
     *
     * @param log log to report to, may be null
     * @param content content of the page the links are extracted from
     * @param writer index the documents are added to
     * @param root passed through to the document creation
     * @param base URL of the page the content belongs to
     * @param urlsDone external forms (String) of the URLs already handled
     * @param extensions accepted file extensions
     * @param recurse when false this method does nothing
     * @param deep current depth
     * @param timeout maximal time to wait for every child thread, also passed to the HTTP requests
     * @throws IOException when a link cannot be normalized or a nested call fails
     */
    protected static void _parseChildren(Log log,StringBuffer content,IndexWriter writer, String root, URL base, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException {
        if(!recurse) return;

        List urls = htmlUtil.getURLS(content.toString(),base);
        int len=urls.size();

        // always create the list: with a null list a page with exactly one link
        // would never get its child started
        List<ChildrenIndexer> childIndexer=new ArrayList<ChildrenIndexer>();

        for(int i=0;i<len;i++) {
            URL url=translateURL((URL) urls.get(i));

            if(isDone(urlsDone,url)) continue;

            String protocol=url.getProtocol().toLowerCase();
            String file=url.getPath();
            if((protocol.equals("http") || protocol.equals("https")) && validExtension(extensions,file) &&
                    base.getHost().equalsIgnoreCase(url.getHost())) {
                try {
                    ChildrenIndexer ci=new ChildrenIndexer(log,writer,root,url,urlsDone,extensions,recurse,deep+1,timeout);
                    childIndexer.add(ci);
                    ci.start();
                }
                catch(Throwable t) {
                    ExceptionUtil.rethrowIfNecessary(t);
                    // best effort: a child that cannot be started is reported and skipped
                    if(log!=null) log.log(Log.LEVEL_ERROR,"Webcrawler", "cannot start indexer for "+url.toExternalForm()+":"+t);
                }
            }
        }

        if(!childIndexer.isEmpty()){
            boolean interrupted=false;

            // wait for the children, every child gets at most [timeout] to finish
            for(ChildrenIndexer ci:childIndexer) {
                if(ci.isAlive()) {
                    try {
                        ci.join(timeout);
                    }
                    catch (InterruptedException e) {
                        // remember it, the status is restored once all children are handled
                        interrupted=true;
                    }
                }

                if(ci.isAlive()){
                    ci.interrupt();
                    Config config = ThreadLocalPageContext.getConfig();
                    SystemOut.printDate(config!=null?config.getErrWriter():new PrintWriter(System.err),"timeout ["+timeout+" ms] occur while invoking page ["+ci.url+"]");
                }
            }

            // do not hide the interruption from the code calling the crawler
            if(interrupted) Thread.currentThread().interrupt();

            // go on with the children of every page that was fetched successfully
            for(ChildrenIndexer ci:childIndexer) {
                if(ci.content!=null)_parseChildren(log,ci.content,writer, root, ci.url, urlsDone, extensions, recurse, deep,timeout);
            }
        }

        urls.clear();
    }

    /**
     * Checks if the given URL is already registered in the list of done URLs.
     *
     * @param urlsDone external forms (String) of the URLs already handled
     * @param url URL to check
     * @return true when the URL is already registered
     */
    private static boolean isDone(List urlsDone, URL url) {
        // same monitor as in _parseItem, the list is modified by the child threads
        synchronized(urlsDone) {
            return urlsDone.contains(url.toExternalForm());
        }
    }

    /**
     * Checks if the extension of the given path is accepted.
     *
     * @param extensions accepted extensions (without leading dot)
     * @param file path of the URL
     * @return true when the path has no extension or the extension is one of the accepted ones
     */
    private static boolean validExtension(String[] extensions, String file) {
        String ext = ResourceUtil.getExtension(file,"");
        ext=lucee.runtime.type.util.ListUtil.first(ext,"/",true);

        if(StringUtil.isEmpty(ext))return true;
        for(int i=0;i<extensions.length;i++){
            if(ext.equalsIgnoreCase(extensions[i]))return true;
        }
        return false;
    }

    /**
     * Logs the invocation of a page on level info, does nothing when the log is null.
     */
    private static void info(Log log,String doc) {
        if(log==null) return;
        log.log(Log.LEVEL_INFO,"Webcrawler", "invoke "+doc);
    }

    /**
     * Logs a failed invocation of a page on level error, does nothing when the log is null.
     */
    private static void error(Log log,String doc, Exception e) {
        if(log==null) return;
        LogUtil.log(log,Log.LEVEL_ERROR,"Webcrawler", "invoke "+doc+":",e);
    }
}


/**
 * Thread fetching and indexing a single URL via {@link WebCrawler#_parseItem}. After the
 * thread has ended, {@link #content} holds the content of the page, or null when the page
 * was skipped or could not be fetched.
 */
class ChildrenIndexer extends Thread {
    protected IndexWriter writer;
    protected String root;
    protected URL url;
    protected List urlsDone;
    protected String[] extensions;
    protected boolean recurse;
    protected int deep;
    // written by this thread, read by the thread that started it (possibly after a timed out join)
    protected volatile StringBuffer content;
    private long timeout;
    private Log log;

    public ChildrenIndexer(Log log,IndexWriter writer, String root, URL url,List urlsDone, String[] extensions,boolean recurse, int deep,long timeout) {
        this.writer=writer;
        this.root=root;
        this.url=url;
        this.urlsDone=urlsDone;
        this.extensions=extensions;
        this.recurse=recurse;
        this.deep=deep;
        this.timeout=timeout;
        this.log=log;
    }

    @Override
    public void run(){
        try {
            // NOTE(review): the timeout is raised by 1 ms here, the reason is not visible in this file
            this.content=WebCrawler._parseItem(log,writer, root, url, urlsDone, extensions, recurse, deep,timeout+1);
        }
        catch (IOException e) {
            // nothing to do: _parseItem has already logged the failure before throwing,
            // content stays null so the page is skipped by the caller
        }
    }
}