package railo.runtime.search.lucene2.docs;

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.document.DateField;
import org.apache.lucene.document.Document;

import railo.commons.io.IOUtil;
import railo.commons.io.res.Resource;
import railo.commons.lang.StringUtil;
import railo.runtime.op.Caster;
import railo.runtime.search.lucene2.html.HTMLParser;

/** A utility for making Lucene Documents for HTML documents. */
public final class HTMLDocument {

    /** Platform file separator, read once from the "file.separator" system property. */
    private static final char FILE_SEPARATOR = System.getProperty("file.separator").charAt(0);

    /** NUL character used inside a uid in place of path separators and before the date suffix. */
    private static final char UID_SEPARATOR = '\u0000';

    /** Maximum number of content characters used as a summary when the parser supplies none. */
    private static final int MAX_SUMMARY_LENGTH = 200;

    /** Static utility class; not instantiable. */
    private HTMLDocument() {}

    /**
     * Builds the unique id of a resource: its path with every platform file
     * separator replaced by a NUL character, followed by a NUL character and
     * the last-modified time encoded by {@link DateField#timeToString(long)}.
     *
     * @param f resource to identify
     * @return the uid string
     */
    public static String uid(Resource f) {
        String encodedPath = f.getPath().replace(FILE_SEPARATOR, UID_SEPARATOR);
        String encodedDate = DateField.timeToString(f.lastModified());
        return encodedPath + UID_SEPARATOR + encodedDate;
    }

    /**
     * Converts a uid back into a url-like path: NUL characters become slashes
     * and everything from the last slash on (the date suffix) is removed.
     *
     * @param uid uid string, normally one produced by {@link #uid(Resource)}
     * @return the path part of the uid; when the uid contains no separator at
     *         all there is no date suffix to strip and the string is returned
     *         unchanged
     */
    public static String uid2url(String uid) {
        String url = uid.replace(UID_SEPARATOR, '/'); // replace nulls with slashes
        int dateStart = url.lastIndexOf('/');
        if (dateStart < 0) {
            // No separator: substring(0, -1) would throw, so hand the value back as is.
            return url;
        }
        return url.substring(0, dateStart); // remove date from end
    }

    /**
     * Creates a Lucene document for an HTML resource. The document always
     * carries the "uid" field; the parsed fields are only added when the
     * resource could be parsed.
     *
     * @param res     HTML resource to index
     * @param charset charset handed to the parser
     * @return the document, holding only the "uid" field if parsing failed
     */
    public static Document getDocument(Resource res, String charset) {
        Document doc = new Document();
        doc.add(FieldUtil.Text("uid", uid(res), false));

        HTMLParser parser = new HTMLParser();
        try {
            parser.parse(res, charset);
        }
        catch (Throwable t) {
            // Deliberate best effort: any parser failure yields a document without content fields.
            return doc;
        }
        addContent(doc, parser);
        return doc;
    }

    /**
     * Creates a Lucene document from HTML read from a reader. The text is read
     * completely, optionally appended to {@code content}, its length is stored
     * in the "size" field, and then it is parsed.
     *
     * @param content buffer that receives the raw text read from the reader; may be null
     * @param reader  source of the HTML text
     * @return the document; if reading or parsing failed it contains only the
     *         fields that were added before the failure
     */
    public static Document getDocument(StringBuffer content, Reader reader) {
        Document doc = new Document();

        HTMLParser parser = new HTMLParser();
        try {
            String str = IOUtil.toString(reader);
            if (content != null) {
                content.append(str);
            }
            doc.add(FieldUtil.UnIndexed("size", Caster.toString(str.length())));
            parser.parse(new StringReader(str));
        }
        catch (Throwable t) {
            // Deliberate best effort: return whatever was collected up to the failure.
            return doc;
        }

        addContent(doc, parser);
        return doc;
    }

    /**
     * Copies the parser results into the document: mime type, title, summary,
     * raw text, content and, when present, keywords, author and custom fields.
     *
     * @param doc    document to fill
     * @param parser parser that has already processed its input
     */
    private static void addContent(Document doc, HTMLParser parser) {
        FieldUtil.setMimeType(doc, "text/html");

        // NOTE(review): assumes parser.getContent() never returns null - confirm against HTMLParser.
        String content = parser.getContent();

        FieldUtil.setTitle(doc, parser.getTitle());

        String summary = parser.getSummary();
        if (StringUtil.isEmpty(summary)) {
            // No summary from the parser: fall back to the leading part of the content.
            summary = (content.length() <= MAX_SUMMARY_LENGTH)
                    ? content
                    : content.substring(0, MAX_SUMMARY_LENGTH);
            FieldUtil.setSummary(doc, summary, false);
        }
        else {
            FieldUtil.setSummary(doc, summary, true);
        }
        FieldUtil.setRaw(doc, content);
        FieldUtil.setContent(doc, content);

        if (parser.hasKeywords()) {
            FieldUtil.setKeywords(doc, parser.getKeywords());
        }
        if (parser.hasAuthor()) {
            FieldUtil.setAuthor(doc, parser.getAuthor());
        }
        addCustomFields(doc, parser);
    }

    /** Copies each of the custom fields 1 to 4 that the parser reports as present. */
    private static void addCustomFields(Document doc, HTMLParser parser) {
        if (parser.hasCustom1()) {
            FieldUtil.setCustom(doc, parser.getCustom1(), 1);
        }
        if (parser.hasCustom2()) {
            FieldUtil.setCustom(doc, parser.getCustom2(), 2);
        }
        if (parser.hasCustom3()) {
            FieldUtil.setCustom(doc, parser.getCustom3(), 3);
        }
        if (parser.hasCustom4()) {
            FieldUtil.setCustom(doc, parser.getCustom4(), 4);
        }
    }
}