001 package railo.runtime.search.lucene2.html; 002 003 import java.io.File; 004 import java.io.IOException; 005 import java.io.Reader; 006 007 import org.ccil.cowan.tagsoup.Parser; 008 import org.xml.sax.Attributes; 009 import org.xml.sax.InputSource; 010 import org.xml.sax.SAXException; 011 import org.xml.sax.XMLReader; 012 import org.xml.sax.helpers.XMLReaderFactory; 013 014 import railo.commons.io.IOUtil; 015 import railo.commons.io.res.Resource; 016 import railo.commons.io.res.util.ResourceUtil; 017 import railo.commons.lang.StringUtil; 018 import railo.runtime.exp.PageException; 019 import railo.runtime.type.List; 020 021 022 public final class HTMLParser extends Parser { 023 024 private XMLReader xmlReader; 025 private String title; 026 private String charset; 027 private StringBuffer current; 028 private StringBuffer content; 029 private boolean hasChanged; 030 private String strContent; 031 private Silent silent=new Silent(null,false); 032 //private boolean silentBefore=false; 033 private String description; 034 private String keywords; 035 private String author; 036 private String custom1; 037 private String custom2; 038 private String custom3; 039 private String custom4; 040 041 042 public HTMLParser() { 043 try { 044 xmlReader=XMLReaderFactory.createXMLReader(Parser.class.getName()); 045 } 046 catch (SAXException e) {} 047 xmlReader.setContentHandler(this); 048 xmlReader.setErrorHandler(this); 049 050 } 051 /** 052 * parse a concret url 053 * @param file 054 * @param charset 055 * @throws IOException 056 * @throws SAXException 057 * @throws SAXException 058 */ 059 public synchronized void parse(File file, String charset) throws IOException, SAXException { 060 parse(ResourceUtil.toResource(file), charset); 061 062 } 063 064 public synchronized void parse(Resource res, String charset) throws IOException, SAXException { 065 title=""; 066 this.charset=charset; 067 current=new StringBuffer(); 068 content=new StringBuffer(); 069 hasChanged=false; 070 071 Reader r=IOUtil.getReader(res,charset); 072 InputSource is=new InputSource(r); 073 is.setSystemId(res.toString()); 074 075 try { 076 xmlReader.parse(is); 077 } 078 finally { 079 IOUtil.closeEL(r); 080 } 081 strContent=content.toString(); 082 } 083 084 public synchronized void parse(Reader reader) throws IOException, SAXException { 085 title=""; 086 this.charset=null; 087 current=new StringBuffer(); 088 content=new StringBuffer(); 089 hasChanged=false; 090 091 InputSource is=new InputSource(reader); 092 093 try { 094 xmlReader.parse(is); 095 } 096 finally { 097 IOUtil.closeEL(reader); 098 } 099 100 101 strContent=content.toString(); 102 } 103 104 105 106 /** 107 * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes) 108 */ 109 public void startElement(String uri, String name, String qName, Attributes atts)throws SAXException { 110 if(name.equalsIgnoreCase("script")) { 111 silent=new Silent(silent,true); 112 } 113 else if(name.equalsIgnoreCase("body")) { 114 silent=new Silent(silent,false); 115 } 116 else if(name.equalsIgnoreCase("meta")) { 117 doMeta(atts); 118 } 119 120 121 if(hasChanged==false && charset==null && name.equalsIgnoreCase("meta")){ 122 if(atts.getValue("http-equiv")!=null) { 123 String value=atts.getValue("content"); 124 String el; 125 String n; 126 String v; 127 if(value!=null) { 128 try { 129 String[] arr=List.toStringArray(List.listToArrayRemoveEmpty(value,';')); 130 for(int i=0;i<arr.length;i++) { 131 el=arr[i]; 132 n=List.first(el,"=",true).trim(); 133 v=List.last(el,"=",true).trim(); 134 if(n.equalsIgnoreCase("charset")) { 135 charset=v; 136 hasChanged=true; 137 //throw new SAXException("has found charset info"); 138 } 139 } 140 } 141 catch (PageException e) {} 142 } 143 } 144 } 145 } 146 147 private void doMeta(Attributes atts) { 148 String name=atts.getValue("name"); 149 if(name==null) name=""; 150 else name=name.toLowerCase().trim(); 151 152 if("description".equals(name)) description=atts.getValue("content"); 153 else if("keywords".equals(name)) keywords=atts.getValue("content"); 154 else if("author".equals(name)) author=atts.getValue("content"); 155 else if("custom1".equals(name)) custom1=atts.getValue("content"); 156 else if("custom2".equals(name)) custom2=atts.getValue("content"); 157 else if("custom3".equals(name)) custom3=atts.getValue("content"); 158 else if("custom4".equals(name)) custom4=atts.getValue("content"); 159 160 } 161 // <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> 162 public void endElement(String uri, String name, String qName) { 163 if(name.equalsIgnoreCase("script")) { 164 silent=silent.parent; 165 } 166 else if(name.equalsIgnoreCase("body")) { 167 silent=silent.parent; 168 } 169 170 String c=current.toString().trim(); 171 //if(name.equals("title"))print.out(c); 172 if(c.length()>0) { 173 if(name.equalsIgnoreCase("title"))title=c; 174 else { 175 content.append(c); 176 content.append('\n'); 177 } 178 current=new StringBuffer(); 179 } 180 } 181 182 183 /** 184 * Geerbte Methode von org.xml.sax.ContentHandler, 185 * wird bei durchparsen des XML, zum einlesen des Content eines Body Element aufgerufen. 186 * 187 * @see org.xml.sax.ContentHandler#characters(char[], int, int) 188 */ 189 public void characters (char ch[], int start, int length) { 190 if(!silent.value) 191 current.append(ch,start,length); 192 } 193 194 195 /** 196 * @return Returns the content. 197 */ 198 public String getContent() { 199 return strContent; 200 } 201 202 /** 203 * @return Returns the title. 204 */ 205 public String getTitle() { 206 return title; 207 } 208 209 /** 210 * @return Returns the charset. 211 */ 212 public String getCharset() { 213 return charset; 214 } 215 /** 216 * @return Returns the summary 217 */ 218 public String getSummary() { 219 return description; 220 221 } 222 223 /** 224 * @return the keywords 225 */ 226 public String getKeywords() { 227 return keywords; 228 } 229 230 /** 231 * @return if keywords exists 232 */ 233 public boolean hasKeywords() { 234 return !StringUtil.isEmpty(keywords,true); 235 } 236 237 /** 238 * @return the author 239 */ 240 public String getAuthor() { 241 return author; 242 } 243 244 /** 245 * @return if author exists 246 */ 247 public boolean hasAuthor() { 248 return !StringUtil.isEmpty(author,true); 249 } 250 251 public boolean hasCustom1() { 252 return !StringUtil.isEmpty(custom1,true); 253 } 254 public boolean hasCustom2() { 255 return !StringUtil.isEmpty(custom2,true); 256 } 257 public boolean hasCustom3() { 258 return !StringUtil.isEmpty(custom3,true); 259 } 260 public boolean hasCustom4() { 261 return !StringUtil.isEmpty(custom4,true); 262 } 263 264 /** 265 * @return the custom1 266 */ 267 public String getCustom1() { 268 return custom1; 269 } 270 /** 271 * @return the custom2 272 */ 273 public String getCustom2() { 274 return custom2; 275 } 276 /** 277 * @return the custom3 278 */ 279 public String getCustom3() { 280 return custom3; 281 } 282 /** 283 * @return the custom4 284 */ 285 public String getCustom4() { 286 return custom4; 287 } 288 289 290 291 /*public static void main(String[] args) throws Exception { 292 HTMLParser parser = new HTMLParser(); 293 parser.parse(new File("C:\\projects\\jmuffin\\webroot\\cfmx\\jm\\test\\tags\\_tuv.htm"),null); 294 295 //print.ln("title:"+parser.getTitle()); 296 //print.ln(parser.getContent()); 297 298 parser.parse(new File("C:\\projects\\jmuffin\\webroot\\cfmx\\jm\\test\\tags\\_tuv.htm"),"UTF-8"); 299 300 //print.ln("title:"+parser.getTitle()); 301 //print.ln(parser.getContent()); 302 }*/ 303 304 305 private class Silent { 306 Silent parent; 307 boolean value; 308 /** 309 * constructor of the class 310 * @param parent 311 * @param value 312 */ 313 public Silent(Silent parent, boolean value) { 314 this.parent = parent; 315 this.value = value; 316 } 317 318 } 319 320 } 321 322 323 324 325 326 327 328