001/** 002 * 003 * Copyright (c) 2014, the Railo Company Ltd. All rights reserved. 004 * 005 * This library is free software; you can redistribute it and/or 006 * modify it under the terms of the GNU Lesser General Public 007 * License as published by the Free Software Foundation; either 008 * version 2.1 of the License, or (at your option) any later version. 009 * 010 * This library is distributed in the hope that it will be useful, 011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 013 * Lesser General Public License for more details. 014 * 015 * You should have received a copy of the GNU Lesser General Public 016 * License along with this library. If not, see <http://www.gnu.org/licenses/>. 017 * 018 **/ 019package lucee.runtime.search.lucene2.html; 020 021import java.io.File; 022import java.io.IOException; 023import java.io.Reader; 024 025import lucee.commons.io.IOUtil; 026import lucee.commons.io.res.Resource; 027import lucee.commons.io.res.util.ResourceUtil; 028import lucee.commons.lang.StringUtil; 029import lucee.runtime.exp.PageException; 030import lucee.runtime.type.util.ListUtil; 031 032import org.ccil.cowan.tagsoup.Parser; 033import org.xml.sax.Attributes; 034import org.xml.sax.InputSource; 035import org.xml.sax.SAXException; 036import org.xml.sax.XMLReader; 037import org.xml.sax.helpers.XMLReaderFactory; 038 039 040public final class HTMLParser extends Parser { 041 042 private XMLReader xmlReader; 043 private String title; 044 private String charset; 045 private StringBuffer current; 046 private StringBuffer content; 047 private boolean hasChanged; 048 private String strContent; 049 private Silent silent=new Silent(null,false); 050 //private boolean silentBefore=false; 051 private String description; 052 private String keywords; 053 private String author; 054 private String custom1; 055 private String custom2; 056 private String custom3; 057 private String custom4; 058 059 060 public HTMLParser() { 061 try { 062 xmlReader=XMLReaderFactory.createXMLReader(Parser.class.getName()); 063 } 064 catch (SAXException e) {} 065 xmlReader.setContentHandler(this); 066 xmlReader.setErrorHandler(this); 067 068 } 069 /** 070 * parse a concret url 071 * @param file 072 * @param charset 073 * @throws IOException 074 * @throws SAXException 075 * @throws SAXException 076 */ 077 public synchronized void parse(File file, String charset) throws IOException, SAXException { 078 parse(ResourceUtil.toResource(file), charset); 079 080 } 081 082 public synchronized void parse(Resource res, String charset) throws IOException, SAXException { 083 title=""; 084 this.charset=charset; 085 current=new StringBuffer(); 086 content=new StringBuffer(); 087 hasChanged=false; 088 089 Reader r=IOUtil.getReader(res,charset); 090 InputSource is=new InputSource(r); 091 is.setSystemId(res.toString()); 092 093 try { 094 xmlReader.parse(is); 095 } 096 finally { 097 IOUtil.closeEL(r); 098 } 099 strContent=content.toString(); 100 } 101 102 public synchronized void parse(Reader reader) throws IOException, SAXException { 103 title=""; 104 this.charset=null; 105 current=new StringBuffer(); 106 content=new StringBuffer(); 107 hasChanged=false; 108 109 InputSource is=new InputSource(reader); 110 111 try { 112 xmlReader.parse(is); 113 } 114 finally { 115 IOUtil.closeEL(reader); 116 } 117 118 119 strContent=content.toString(); 120 } 121 122 123 124 @Override 125 public void startElement(String uri, String name, String qName, Attributes atts)throws SAXException { 126 if(name.equalsIgnoreCase("script")) { 127 silent=new Silent(silent,true); 128 } 129 else if(name.equalsIgnoreCase("body")) { 130 silent=new Silent(silent,false); 131 } 132 else if(name.equalsIgnoreCase("meta")) { 133 doMeta(atts); 134 } 135 136 137 if(hasChanged==false && charset==null && name.equalsIgnoreCase("meta")){ 138 if(atts.getValue("http-equiv")!=null) { 139 String value=atts.getValue("content"); 140 String el; 141 String n; 142 String v; 143 if(value!=null) { 144 try { 145 String[] arr=ListUtil.toStringArray(ListUtil.listToArrayRemoveEmpty(value,';')); 146 for(int i=0;i<arr.length;i++) { 147 el=arr[i]; 148 n=ListUtil.first(el,"=",true).trim(); 149 v=ListUtil.last(el,"=",true).trim(); 150 if(n.equalsIgnoreCase("charset")) { 151 charset=v; 152 hasChanged=true; 153 //throw new SAXException("has found charset info"); 154 } 155 } 156 } 157 catch (PageException e) {} 158 } 159 } 160 } 161 } 162 163 private void doMeta(Attributes atts) { 164 String name=atts.getValue("name"); 165 if(name==null) name=""; 166 else name=name.toLowerCase().trim(); 167 168 if("description".equals(name)) description=atts.getValue("content"); 169 else if("keywords".equals(name)) keywords=atts.getValue("content"); 170 else if("author".equals(name)) author=atts.getValue("content"); 171 else if("custom1".equals(name)) custom1=atts.getValue("content"); 172 else if("custom2".equals(name)) custom2=atts.getValue("content"); 173 else if("custom3".equals(name)) custom3=atts.getValue("content"); 174 else if("custom4".equals(name)) custom4=atts.getValue("content"); 175 176 } 177 // <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> 178 public void endElement(String uri, String name, String qName) { 179 if(name.equalsIgnoreCase("script")) { 180 silent=silent.parent; 181 } 182 else if(name.equalsIgnoreCase("body")) { 183 silent=silent.parent; 184 } 185 186 String c=current.toString().trim(); 187 //if(name.equals("title"))print.out(c); 188 if(c.length()>0) { 189 if(name.equalsIgnoreCase("title"))title=c; 190 else { 191 content.append(c); 192 content.append('\n'); 193 } 194 current=new StringBuffer(); 195 } 196 } 197 198 199 @Override 200 public void characters (char ch[], int start, int length) { 201 if(!silent.value) 202 current.append(ch,start,length); 203 } 204 205 206 /** 207 * @return Returns the content. 208 */ 209 public String getContent() { 210 return strContent; 211 } 212 213 /** 214 * @return Returns the title. 215 */ 216 public String getTitle() { 217 return title; 218 } 219 220 /** 221 * @return Returns the charset. 222 */ 223 public String getCharset() { 224 return charset; 225 } 226 /** 227 * @return Returns the summary 228 */ 229 public String getSummary() { 230 return description; 231 232 } 233 234 /** 235 * @return the keywords 236 */ 237 public String getKeywords() { 238 return keywords; 239 } 240 241 /** 242 * @return if keywords exists 243 */ 244 public boolean hasKeywords() { 245 return !StringUtil.isEmpty(keywords,true); 246 } 247 248 /** 249 * @return the author 250 */ 251 public String getAuthor() { 252 return author; 253 } 254 255 /** 256 * @return if author exists 257 */ 258 public boolean hasAuthor() { 259 return !StringUtil.isEmpty(author,true); 260 } 261 262 public boolean hasCustom1() { 263 return !StringUtil.isEmpty(custom1,true); 264 } 265 public boolean hasCustom2() { 266 return !StringUtil.isEmpty(custom2,true); 267 } 268 public boolean hasCustom3() { 269 return !StringUtil.isEmpty(custom3,true); 270 } 271 public boolean hasCustom4() { 272 return !StringUtil.isEmpty(custom4,true); 273 } 274 275 /** 276 * @return the custom1 277 */ 278 public String getCustom1() { 279 return custom1; 280 } 281 /** 282 * @return the custom2 283 */ 284 public String getCustom2() { 285 return custom2; 286 } 287 /** 288 * @return the custom3 289 */ 290 public String getCustom3() { 291 return custom3; 292 } 293 /** 294 * @return the custom4 295 */ 296 public String getCustom4() { 297 return custom4; 298 } 299 300 301 302 /*public static void main(String[] args) throws Exception { 303 HTMLParser parser = new HTMLParser(); 304 parser.parse(new File("C:\\projects\\jmuffin\\webroot\\cfmx\\jm\\test\\tags\\_tuv.htm"),null); 305 306 //print.ln("title:"+parser.getTitle()); 307 //print.ln(parser.getContent()); 308 309 parser.parse(new File("C:\\projects\\jmuffin\\webroot\\cfmx\\jm\\test\\tags\\_tuv.htm"),"UTF-8"); 310 311 //print.ln("title:"+parser.getTitle()); 312 //print.ln(parser.getContent()); 313 }*/ 314 315 316 private class Silent { 317 Silent parent; 318 boolean value; 319 /** 320 * constructor of the class 321 * @param parent 322 * @param value 323 */ 324 public Silent(Silent parent, boolean value) { 325 this.parent = parent; 326 this.value = value; 327 } 328 329 } 330 331} 332 333 334 335 336 337 338 339