001    package railo.runtime.search.lucene2.html;
002    
003    import java.io.File;
004    import java.io.IOException;
005    import java.io.Reader;
006    
007    import org.ccil.cowan.tagsoup.Parser;
008    import org.xml.sax.Attributes;
009    import org.xml.sax.InputSource;
010    import org.xml.sax.SAXException;
011    import org.xml.sax.XMLReader;
012    import org.xml.sax.helpers.XMLReaderFactory;
013    
014    import railo.commons.io.IOUtil;
015    import railo.commons.io.res.Resource;
016    import railo.commons.io.res.util.ResourceUtil;
017    import railo.commons.lang.StringUtil;
018    import railo.runtime.exp.PageException;
019    import railo.runtime.type.util.ListUtil;
020    
021    
022    public final class HTMLParser extends Parser {
023    
024        private XMLReader xmlReader;
025        private String title;
026        private String charset;
027        private StringBuffer current;
028        private StringBuffer content;
029        private boolean hasChanged;
030        private String strContent;
031        private Silent silent=new Silent(null,false);
032        //private boolean silentBefore=false;
033            private String description;
034            private String keywords;
035            private String author;
036            private String custom1;
037            private String custom2;
038            private String custom3;
039            private String custom4;
040        
041        
042        public HTMLParser() {
043            try {
044                xmlReader=XMLReaderFactory.createXMLReader(Parser.class.getName());
045            } 
046            catch (SAXException e) {}
047            xmlReader.setContentHandler(this);
048            xmlReader.setErrorHandler(this);
049            
050        }
051        /**
052         * parse a concret url
053         * @param file
054         * @param charset
055         * @throws IOException
056         * @throws SAXException 
057         * @throws SAXException
058         */
059        public synchronized void parse(File file, String charset) throws IOException, SAXException {
060            parse(ResourceUtil.toResource(file), charset);
061            
062        }
063    
064        public synchronized void parse(Resource res, String charset) throws IOException, SAXException {
065            title="";
066            this.charset=charset;
067            current=new StringBuffer();
068            content=new StringBuffer();
069            hasChanged=false;
070            
071            Reader r=IOUtil.getReader(res,charset);
072            InputSource is=new InputSource(r);
073            is.setSystemId(res.toString());
074            
075            try {
076                xmlReader.parse(is);
077            } 
078            finally {
079                    IOUtil.closeEL(r);
080            }
081            strContent=content.toString();
082        }
083        
084            public synchronized void parse(Reader reader) throws IOException, SAXException {
085            title="";
086            this.charset=null;
087            current=new StringBuffer();
088            content=new StringBuffer();
089            hasChanged=false;
090            
091            InputSource is=new InputSource(reader);
092            
093            try {
094                xmlReader.parse(is);
095            } 
096            finally {
097                    IOUtil.closeEL(reader);
098            }
099    
100            
101            strContent=content.toString();
102        }
103        
104        
105        
106        @Override
107        public void startElement(String uri, String name, String qName, Attributes atts)throws SAXException {
108            if(name.equalsIgnoreCase("script")) {
109                silent=new Silent(silent,true);
110            }
111            else if(name.equalsIgnoreCase("body")) {
112                silent=new Silent(silent,false);
113            }
114            else if(name.equalsIgnoreCase("meta")) {
115                doMeta(atts);
116            }
117            
118            
119            if(hasChanged==false && charset==null && name.equalsIgnoreCase("meta")){
120                if(atts.getValue("http-equiv")!=null) {
121                    String value=atts.getValue("content");
122                    String el;
123                    String n;
124                    String v;
125                    if(value!=null) {
126                        try {
127                            String[] arr=ListUtil.toStringArray(ListUtil.listToArrayRemoveEmpty(value,';'));
128                            for(int i=0;i<arr.length;i++) {
129                                el=arr[i];
130                                n=ListUtil.first(el,"=",true).trim();
131                                v=ListUtil.last(el,"=",true).trim();
132                                if(n.equalsIgnoreCase("charset")) {
133                                    charset=v;
134                                    hasChanged=true;
135                                    //throw new SAXException("has found charset info");
136                                }
137                            }
138                        } 
139                        catch (PageException e) {}
140                    }
141                }
142            }
143        }
144        
145        private void doMeta(Attributes atts) {
146            String name=atts.getValue("name");
147            if(name==null) name="";
148            else name=name.toLowerCase().trim();
149            
150            if("description".equals(name))          description=atts.getValue("content");
151            else if("keywords".equals(name))        keywords=atts.getValue("content");
152            else if("author".equals(name))          author=atts.getValue("content");
153            else if("custom1".equals(name))         custom1=atts.getValue("content");
154            else if("custom2".equals(name))         custom2=atts.getValue("content");
155            else if("custom3".equals(name))         custom3=atts.getValue("content");
156            else if("custom4".equals(name))         custom4=atts.getValue("content");
157            
158            }
159            // <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
160        public void endElement(String uri, String name, String qName) {
161            if(name.equalsIgnoreCase("script")) {
162                silent=silent.parent;
163            }
164            else if(name.equalsIgnoreCase("body")) {
165                silent=silent.parent;
166            }
167            
168            String c=current.toString().trim();
169            //if(name.equals("title"))print.out(c);
170            if(c.length()>0) {
171                if(name.equalsIgnoreCase("title"))title=c;
172                else {
173                    content.append(c);
174                    content.append('\n');
175                }
176                current=new StringBuffer();
177            }
178        }
179        
180        
181        @Override
182        public void characters (char ch[], int start, int length)   {
183           if(!silent.value)
184                    current.append(ch,start,length);
185        }
186    
187    
188        /**
189         * @return Returns the content.
190         */
191        public String getContent() {
192            return strContent;
193        }
194    
195        /**
196         * @return Returns the title.
197         */
198        public String getTitle() {
199            return title;
200        }
201    
202        /**
203         * @return Returns the charset.
204         */
205        public String getCharset() {
206            return charset;
207        }
208        /**
209         * @return Returns the summary
210         */
211        public String getSummary() {
212            return description;
213            
214        }
215    
216            /**
217             * @return the keywords
218             */
219            public String getKeywords() {
220                    return keywords;
221            }
222    
223            /**
224             * @return if keywords exists
225             */
226            public boolean hasKeywords() {
227                    return !StringUtil.isEmpty(keywords,true);
228            }
229    
230            /**
231             * @return the author
232             */
233            public String getAuthor() {
234                    return author;
235            }
236    
237            /**
238             * @return if author exists
239             */
240            public boolean hasAuthor() {
241                    return !StringUtil.isEmpty(author,true);
242            }
243            
244            public boolean hasCustom1() {
245                    return !StringUtil.isEmpty(custom1,true);
246            }
247            public boolean hasCustom2() {
248                    return !StringUtil.isEmpty(custom2,true);
249            }
250            public boolean hasCustom3() {
251                    return !StringUtil.isEmpty(custom3,true);
252            }
253            public boolean hasCustom4() {
254                    return !StringUtil.isEmpty(custom4,true);
255            }
256    
257            /**
258             * @return the custom1
259             */
260            public String getCustom1() {
261                    return custom1;
262            }
263            /**
264             * @return the custom2
265             */
266            public String getCustom2() {
267                    return custom2;
268            }
269            /**
270             * @return the custom3
271             */
272            public String getCustom3() {
273                    return custom3;
274            }
275            /**
276             * @return the custom4
277             */
278            public String getCustom4() {
279                    return custom4;
280            }
281            
282            
283        
284        /*public static void main(String[] args) throws Exception {
285            HTMLParser parser = new HTMLParser();
286            parser.parse(new File("C:\\projects\\jmuffin\\webroot\\cfmx\\jm\\test\\tags\\_tuv.htm"),null);
287            
288            //print.ln("title:"+parser.getTitle());
289            //print.ln(parser.getContent());
290            
291            parser.parse(new File("C:\\projects\\jmuffin\\webroot\\cfmx\\jm\\test\\tags\\_tuv.htm"),"UTF-8");
292            
293            //print.ln("title:"+parser.getTitle());
294            //print.ln(parser.getContent());
295        }*/
296        
297        
298        private class Silent {
299            Silent parent;
300            boolean value;
301            /**
302             * constructor of the class
303             * @param parent
304             * @param value
305             */
306            public Silent(Silent parent, boolean value) {
307                this.parent = parent;
308                this.value = value;
309            }
310            
311        }
312        
313    }
314    
315    
316    
317    
318    
319    
320    
321