package railo.runtime.search.lucene2.html;

import java.io.File;
import java.io.IOException;
import java.io.Reader;

import org.ccil.cowan.tagsoup.Parser;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;

import railo.commons.io.IOUtil;
import railo.commons.io.res.Resource;
import railo.commons.io.res.util.ResourceUtil;
import railo.commons.lang.StringUtil;
import railo.runtime.exp.PageException;
import railo.runtime.type.util.ListUtil;


/**
 * Extracts the indexable parts of an HTML document: title, text content, charset and a
 * fixed set of {@code <meta name="...">} values (description, keywords, author,
 * custom1 - custom4).
 *
 * The instance registers itself as content and error handler on an internal
 * {@link XMLReader}; the SAX callbacks below collect the data while one of the
 * {@code parse} methods runs. All per-document state is cleared at the start of every
 * {@code parse} call, so the getters always describe the most recently parsed document.
 */
public final class HTMLParser extends Parser {

    private final XMLReader xmlReader;
    private String title;
    private String charset;
    /** text collected since the last element end that flushed it */
    private StringBuilder current;
    /** all flushed text blocks of the document, one per line */
    private StringBuilder content;
    /** true as soon as a charset was taken over from a meta http-equiv element */
    private boolean hasChanged;
    private String strContent;
    /** top of the "ignore character data" stack; never null */
    private Silent silent = new Silent(null, false);
    private String description;
    private String keywords;
    private String author;
    private String custom1;
    private String custom2;
    private String custom3;
    private String custom4;


    /**
     * Creates the parser and wires this instance in as content and error handler.
     */
    public HTMLParser() {
        XMLReader reader;
        try {
            reader = XMLReaderFactory.createXMLReader(Parser.class.getName());
        }
        catch (SAXException e) {
            // The factory could not load the class by name (e.g. class loader issues).
            // Instantiate the reader directly instead of continuing with a null reader,
            // which previously ended in a NullPointerException two lines further down.
            reader = new Parser();
        }
        reader.setContentHandler(this);
        reader.setErrorHandler(this);
        xmlReader = reader;
    }

    /**
     * Parses a concrete file.
     * @param file file to parse
     * @param charset charset used to read the file
     * @throws IOException
     * @throws SAXException
     */
    public synchronized void parse(File file, String charset) throws IOException, SAXException {
        parse(ResourceUtil.toResource(file), charset);
    }

    /**
     * Parses a resource.
     * @param res resource to parse
     * @param charset charset used to read the resource
     * @throws IOException
     * @throws SAXException
     */
    public synchronized void parse(Resource res, String charset) throws IOException, SAXException {
        reset(charset);

        Reader r = IOUtil.getReader(res, charset);
        try {
            InputSource is = new InputSource(r);
            is.setSystemId(res.toString());
            xmlReader.parse(is);
        }
        finally {
            IOUtil.closeEL(r);
        }
        strContent = content.toString();
    }

    /**
     * Parses the data of a reader. The charset is unknown in this case, so it is taken
     * from a meta http-equiv element when the document has one. The reader is closed
     * when parsing ends, successfully or not.
     * @param reader reader to parse
     * @throws IOException
     * @throws SAXException
     */
    public synchronized void parse(Reader reader) throws IOException, SAXException {
        reset(null);

        InputSource is = new InputSource(reader);
        try {
            xmlReader.parse(is);
        }
        finally {
            IOUtil.closeEL(reader);
        }
        strContent = content.toString();
    }

    /**
     * Clears everything collected for the previous document, so a reused parser never
     * reports title, content, meta data or "silent" state of an earlier (possibly
     * aborted) parse.
     * @param charset charset of the document to parse, null when unknown
     */
    private void reset(String charset) {
        title = "";
        this.charset = charset;
        current = new StringBuilder();
        content = new StringBuilder();
        hasChanged = false;
        strContent = null;
        silent = new Silent(null, false);
        description = null;
        keywords = null;
        author = null;
        custom1 = null;
        custom2 = null;
        custom3 = null;
        custom4 = null;
    }


    @Override
    public void startElement(String uri, String name, String qName, Attributes atts) throws SAXException {
        if (name.equalsIgnoreCase("script")) {
            // character data inside script is code, not content
            silent = new Silent(silent, true);
        }
        else if (name.equalsIgnoreCase("body")) {
            silent = new Silent(silent, false);
        }
        else if (name.equalsIgnoreCase("meta")) {
            doMeta(atts);
            // only look for a charset as long as none is defined
            if (!hasChanged && charset == null) doCharset(atts);
        }
    }

    /**
     * Reads the charset from an element like
     * {@code <meta http-equiv="Content-Type" content="text/html; charset=utf-8">}.
     * @param atts attributes of the meta element
     */
    private void doCharset(Attributes atts) {
        if (atts.getValue("http-equiv") == null) return;
        String value = atts.getValue("content");
        if (value == null) return;

        try {
            String[] arr = ListUtil.toStringArray(ListUtil.listToArrayRemoveEmpty(value, ';'));
            for (int i = 0; i < arr.length; i++) {
                String el = arr[i];
                // without a '=' first and last element are the same, a plain "charset"
                // token would then be taken over as charset name
                if (el.indexOf('=') == -1) continue;
                String n = ListUtil.first(el, "=", true).trim();
                String v = ListUtil.last(el, "=", true).trim();
                if (n.equalsIgnoreCase("charset")) {
                    charset = v;
                    hasChanged = true;
                }
            }
        }
        catch (PageException ignored) {
            // charset detection is best effort, a content value that cannot be split
            // simply leaves the charset undefined
        }
    }

    /**
     * Stores the content of the meta elements this class knows by name.
     * @param atts attributes of the meta element
     */
    private void doMeta(Attributes atts) {
        String name = atts.getValue("name");
        if (name == null) return;
        name = name.toLowerCase().trim();

        if ("description".equals(name)) description = atts.getValue("content");
        else if ("keywords".equals(name)) keywords = atts.getValue("content");
        else if ("author".equals(name)) author = atts.getValue("content");
        else if ("custom1".equals(name)) custom1 = atts.getValue("content");
        else if ("custom2".equals(name)) custom2 = atts.getValue("content");
        else if ("custom3".equals(name)) custom3 = atts.getValue("content");
        else if ("custom4".equals(name)) custom4 = atts.getValue("content");
    }

    @Override
    public void endElement(String uri, String name, String qName) {
        if (name.equalsIgnoreCase("script") || name.equalsIgnoreCase("body")) {
            // never pop the root entry, characters() relies on silent not being null
            if (silent.parent != null) silent = silent.parent;
        }

        String c = current.toString().trim();
        if (c.length() > 0) {
            if (name.equalsIgnoreCase("title")) title = c;
            else {
                content.append(c);
                content.append('\n');
            }
            current = new StringBuilder();
        }
    }


    @Override
    public void characters(char ch[], int start, int length) {
        if (!silent.value)
            current.append(ch, start, length);
    }


    /**
     * @return Returns the content, null when no document was parsed successfully.
     */
    public String getContent() {
        return strContent;
    }

    /**
     * @return Returns the title.
     */
    public String getTitle() {
        return title;
    }

    /**
     * @return Returns the charset.
     */
    public String getCharset() {
        return charset;
    }

    /**
     * @return Returns the summary (content of the meta element "description")
     */
    public String getSummary() {
        return description;
    }

    /**
     * @return the keywords
     */
    public String getKeywords() {
        return keywords;
    }

    /**
     * @return if keywords exists
     */
    public boolean hasKeywords() {
        return !StringUtil.isEmpty(keywords, true);
    }

    /**
     * @return the author
     */
    public String getAuthor() {
        return author;
    }

    /**
     * @return if author exists
     */
    public boolean hasAuthor() {
        return !StringUtil.isEmpty(author, true);
    }

    /**
     * @return if custom1 exists
     */
    public boolean hasCustom1() {
        return !StringUtil.isEmpty(custom1, true);
    }

    /**
     * @return if custom2 exists
     */
    public boolean hasCustom2() {
        return !StringUtil.isEmpty(custom2, true);
    }

    /**
     * @return if custom3 exists
     */
    public boolean hasCustom3() {
        return !StringUtil.isEmpty(custom3, true);
    }

    /**
     * @return if custom4 exists
     */
    public boolean hasCustom4() {
        return !StringUtil.isEmpty(custom4, true);
    }

    /**
     * @return the custom1
     */
    public String getCustom1() {
        return custom1;
    }

    /**
     * @return the custom2
     */
    public String getCustom2() {
        return custom2;
    }

    /**
     * @return the custom3
     */
    public String getCustom3() {
        return custom3;
    }

    /**
     * @return the custom4
     */
    public String getCustom4() {
        return custom4;
    }


    /**
     * Entry of the stack that defines if character data is ignored at the moment.
     */
    private static final class Silent {
        final Silent parent;
        final boolean value;

        /**
         * constructor of the class
         * @param parent entry that becomes active again when this one is removed
         * @param value true when character data should be ignored
         */
        Silent(Silent parent, boolean value) {
            this.parent = parent;
            this.value = value;
        }
    }

}