001/**
002 *
003 * Copyright (c) 2014, the Railo Company Ltd. All rights reserved.
004 *
005 * This library is free software; you can redistribute it and/or
006 * modify it under the terms of the GNU Lesser General Public
007 * License as published by the Free Software Foundation; either 
008 * version 2.1 of the License, or (at your option) any later version.
009 * 
010 * This library is distributed in the hope that it will be useful,
011 * but WITHOUT ANY WARRANTY; without even the implied warranty of
012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013 * Lesser General Public License for more details.
014 * 
015 * You should have received a copy of the GNU Lesser General Public 
016 * License along with this library.  If not, see <http://www.gnu.org/licenses/>.
017 * 
018 **/
019package lucee.runtime.search.lucene2.html;
020
021import java.io.File;
022import java.io.IOException;
023import java.io.Reader;
024
025import lucee.commons.io.IOUtil;
026import lucee.commons.io.res.Resource;
027import lucee.commons.io.res.util.ResourceUtil;
028import lucee.commons.lang.StringUtil;
029import lucee.runtime.exp.PageException;
030import lucee.runtime.type.util.ListUtil;
031
032import org.ccil.cowan.tagsoup.Parser;
033import org.xml.sax.Attributes;
034import org.xml.sax.InputSource;
035import org.xml.sax.SAXException;
036import org.xml.sax.XMLReader;
037import org.xml.sax.helpers.XMLReaderFactory;
038
039
040public final class HTMLParser extends Parser {
041
042    private XMLReader xmlReader;
043    private String title;
044    private String charset;
045    private StringBuffer current;
046    private StringBuffer content;
047    private boolean hasChanged;
048    private String strContent;
049    private Silent silent=new Silent(null,false);
050    //private boolean silentBefore=false;
051        private String description;
052        private String keywords;
053        private String author;
054        private String custom1;
055        private String custom2;
056        private String custom3;
057        private String custom4;
058    
059    
060    public HTMLParser() {
061        try {
062            xmlReader=XMLReaderFactory.createXMLReader(Parser.class.getName());
063        } 
064        catch (SAXException e) {}
065        xmlReader.setContentHandler(this);
066        xmlReader.setErrorHandler(this);
067        
068    }
069    /**
070     * parse a concret url
071     * @param file
072     * @param charset
073     * @throws IOException
074     * @throws SAXException 
075     * @throws SAXException
076     */
077    public synchronized void parse(File file, String charset) throws IOException, SAXException {
078        parse(ResourceUtil.toResource(file), charset);
079        
080    }
081
082    public synchronized void parse(Resource res, String charset) throws IOException, SAXException {
083        title="";
084        this.charset=charset;
085        current=new StringBuffer();
086        content=new StringBuffer();
087        hasChanged=false;
088        
089        Reader r=IOUtil.getReader(res,charset);
090        InputSource is=new InputSource(r);
091        is.setSystemId(res.toString());
092        
093        try {
094            xmlReader.parse(is);
095        } 
096        finally {
097                IOUtil.closeEL(r);
098        }
099        strContent=content.toString();
100    }
101    
102        public synchronized void parse(Reader reader) throws IOException, SAXException {
103        title="";
104        this.charset=null;
105        current=new StringBuffer();
106        content=new StringBuffer();
107        hasChanged=false;
108        
109        InputSource is=new InputSource(reader);
110        
111        try {
112            xmlReader.parse(is);
113        } 
114        finally {
115                IOUtil.closeEL(reader);
116        }
117
118        
119        strContent=content.toString();
120    }
121    
122    
123    
124    @Override
125    public void startElement(String uri, String name, String qName, Attributes atts)throws SAXException {
126        if(name.equalsIgnoreCase("script")) {
127            silent=new Silent(silent,true);
128        }
129        else if(name.equalsIgnoreCase("body")) {
130            silent=new Silent(silent,false);
131        }
132        else if(name.equalsIgnoreCase("meta")) {
133            doMeta(atts);
134        }
135        
136        
137        if(hasChanged==false && charset==null && name.equalsIgnoreCase("meta")){
138            if(atts.getValue("http-equiv")!=null) {
139                String value=atts.getValue("content");
140                String el;
141                String n;
142                String v;
143                if(value!=null) {
144                    try {
145                        String[] arr=ListUtil.toStringArray(ListUtil.listToArrayRemoveEmpty(value,';'));
146                        for(int i=0;i<arr.length;i++) {
147                            el=arr[i];
148                            n=ListUtil.first(el,"=",true).trim();
149                            v=ListUtil.last(el,"=",true).trim();
150                            if(n.equalsIgnoreCase("charset")) {
151                                charset=v;
152                                hasChanged=true;
153                                //throw new SAXException("has found charset info");
154                            }
155                        }
156                    } 
157                    catch (PageException e) {}
158                }
159            }
160        }
161    }
162    
163    private void doMeta(Attributes atts) {
164        String name=atts.getValue("name");
165        if(name==null) name="";
166        else name=name.toLowerCase().trim();
167        
168        if("description".equals(name))          description=atts.getValue("content");
169        else if("keywords".equals(name))        keywords=atts.getValue("content");
170        else if("author".equals(name))          author=atts.getValue("content");
171        else if("custom1".equals(name))         custom1=atts.getValue("content");
172        else if("custom2".equals(name))         custom2=atts.getValue("content");
173        else if("custom3".equals(name))         custom3=atts.getValue("content");
174        else if("custom4".equals(name))         custom4=atts.getValue("content");
175        
176        }
177        // <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
178    public void endElement(String uri, String name, String qName) {
179        if(name.equalsIgnoreCase("script")) {
180            silent=silent.parent;
181        }
182        else if(name.equalsIgnoreCase("body")) {
183            silent=silent.parent;
184        }
185        
186        String c=current.toString().trim();
187        //if(name.equals("title"))print.out(c);
188        if(c.length()>0) {
189            if(name.equalsIgnoreCase("title"))title=c;
190            else {
191                content.append(c);
192                content.append('\n');
193            }
194            current=new StringBuffer();
195        }
196    }
197    
198    
199    @Override
200    public void characters (char ch[], int start, int length)   {
201       if(!silent.value)
202                current.append(ch,start,length);
203    }
204
205
206    /**
207     * @return Returns the content.
208     */
209    public String getContent() {
210        return strContent;
211    }
212
213    /**
214     * @return Returns the title.
215     */
216    public String getTitle() {
217        return title;
218    }
219
220    /**
221     * @return Returns the charset.
222     */
223    public String getCharset() {
224        return charset;
225    }
226    /**
227     * @return Returns the summary
228     */
229    public String getSummary() {
230        return description;
231        
232    }
233
234        /**
235         * @return the keywords
236         */
237        public String getKeywords() {
238                return keywords;
239        }
240
241        /**
242         * @return if keywords exists
243         */
244        public boolean hasKeywords() {
245                return !StringUtil.isEmpty(keywords,true);
246        }
247
248        /**
249         * @return the author
250         */
251        public String getAuthor() {
252                return author;
253        }
254
255        /**
256         * @return if author exists
257         */
258        public boolean hasAuthor() {
259                return !StringUtil.isEmpty(author,true);
260        }
261        
262        public boolean hasCustom1() {
263                return !StringUtil.isEmpty(custom1,true);
264        }
265        public boolean hasCustom2() {
266                return !StringUtil.isEmpty(custom2,true);
267        }
268        public boolean hasCustom3() {
269                return !StringUtil.isEmpty(custom3,true);
270        }
271        public boolean hasCustom4() {
272                return !StringUtil.isEmpty(custom4,true);
273        }
274
275        /**
276         * @return the custom1
277         */
278        public String getCustom1() {
279                return custom1;
280        }
281        /**
282         * @return the custom2
283         */
284        public String getCustom2() {
285                return custom2;
286        }
287        /**
288         * @return the custom3
289         */
290        public String getCustom3() {
291                return custom3;
292        }
293        /**
294         * @return the custom4
295         */
296        public String getCustom4() {
297                return custom4;
298        }
299        
300        
301    
302    /*public static void main(String[] args) throws Exception {
303        HTMLParser parser = new HTMLParser();
304        parser.parse(new File("C:\\projects\\jmuffin\\webroot\\cfmx\\jm\\test\\tags\\_tuv.htm"),null);
305        
306        //print.ln("title:"+parser.getTitle());
307        //print.ln(parser.getContent());
308        
309        parser.parse(new File("C:\\projects\\jmuffin\\webroot\\cfmx\\jm\\test\\tags\\_tuv.htm"),"UTF-8");
310        
311        //print.ln("title:"+parser.getTitle());
312        //print.ln(parser.getContent());
313    }*/
314    
315    
316    private class Silent {
317        Silent parent;
318        boolean value;
319        /**
320         * constructor of the class
321         * @param parent
322         * @param value
323         */
324        public Silent(Silent parent, boolean value) {
325            this.parent = parent;
326            this.value = value;
327        }
328        
329    }
330    
331}
332
333
334
335
336
337
338
339