001    package railo.runtime.search.lucene2.html;
002    
003    import java.io.File;
004    import java.io.IOException;
005    import java.io.Reader;
006    
007    import org.ccil.cowan.tagsoup.Parser;
008    import org.xml.sax.Attributes;
009    import org.xml.sax.InputSource;
010    import org.xml.sax.SAXException;
011    import org.xml.sax.XMLReader;
012    import org.xml.sax.helpers.XMLReaderFactory;
013    
014    import railo.commons.io.IOUtil;
015    import railo.commons.io.res.Resource;
016    import railo.commons.io.res.util.ResourceUtil;
017    import railo.commons.lang.StringUtil;
018    import railo.runtime.exp.PageException;
019    import railo.runtime.type.List;
020    
021    
022    public final class HTMLParser extends Parser {
023    
024        private XMLReader xmlReader;
025        private String title;
026        private String charset;
027        private StringBuffer current;
028        private StringBuffer content;
029        private boolean hasChanged;
030        private String strContent;
031        private Silent silent=new Silent(null,false);
032        //private boolean silentBefore=false;
033            private String description;
034            private String keywords;
035            private String author;
036            private String custom1;
037            private String custom2;
038            private String custom3;
039            private String custom4;
040        
041        
042        public HTMLParser() {
043            try {
044                xmlReader=XMLReaderFactory.createXMLReader(Parser.class.getName());
045            } 
046            catch (SAXException e) {}
047            xmlReader.setContentHandler(this);
048            xmlReader.setErrorHandler(this);
049            
050        }
051        /**
052         * parse a concret url
053         * @param file
054         * @param charset
055         * @throws IOException
056         * @throws SAXException 
057         * @throws SAXException
058         */
059        public synchronized void parse(File file, String charset) throws IOException, SAXException {
060            parse(ResourceUtil.toResource(file), charset);
061            
062        }
063    
064        public synchronized void parse(Resource res, String charset) throws IOException, SAXException {
065            title="";
066            this.charset=charset;
067            current=new StringBuffer();
068            content=new StringBuffer();
069            hasChanged=false;
070            
071            Reader r=IOUtil.getReader(res,charset);
072            InputSource is=new InputSource(r);
073            is.setSystemId(res.toString());
074            
075            try {
076                xmlReader.parse(is);
077            } 
078            finally {
079                    IOUtil.closeEL(r);
080            }
081            strContent=content.toString();
082        }
083        
084            public synchronized void parse(Reader reader) throws IOException, SAXException {
085            title="";
086            this.charset=null;
087            current=new StringBuffer();
088            content=new StringBuffer();
089            hasChanged=false;
090            
091            InputSource is=new InputSource(reader);
092            
093            try {
094                xmlReader.parse(is);
095            } 
096            finally {
097                    IOUtil.closeEL(reader);
098            }
099    
100            
101            strContent=content.toString();
102        }
103        
104        
105        
106        /**
107         * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
108         */
109        public void startElement(String uri, String name, String qName, Attributes atts)throws SAXException {
110            if(name.equalsIgnoreCase("script")) {
111                silent=new Silent(silent,true);
112            }
113            else if(name.equalsIgnoreCase("body")) {
114                silent=new Silent(silent,false);
115            }
116            else if(name.equalsIgnoreCase("meta")) {
117                doMeta(atts);
118            }
119            
120            
121            if(hasChanged==false && charset==null && name.equalsIgnoreCase("meta")){
122                if(atts.getValue("http-equiv")!=null) {
123                    String value=atts.getValue("content");
124                    String el;
125                    String n;
126                    String v;
127                    if(value!=null) {
128                        try {
129                            String[] arr=List.toStringArray(List.listToArrayRemoveEmpty(value,';'));
130                            for(int i=0;i<arr.length;i++) {
131                                el=arr[i];
132                                n=List.first(el,"=",true).trim();
133                                v=List.last(el,"=",true).trim();
134                                if(n.equalsIgnoreCase("charset")) {
135                                    charset=v;
136                                    hasChanged=true;
137                                    //throw new SAXException("has found charset info");
138                                }
139                            }
140                        } 
141                        catch (PageException e) {}
142                    }
143                }
144            }
145        }
146        
147        private void doMeta(Attributes atts) {
148            String name=atts.getValue("name");
149            if(name==null) name="";
150            else name=name.toLowerCase().trim();
151            
152            if("description".equals(name))          description=atts.getValue("content");
153            else if("keywords".equals(name))        keywords=atts.getValue("content");
154            else if("author".equals(name))          author=atts.getValue("content");
155            else if("custom1".equals(name))         custom1=atts.getValue("content");
156            else if("custom2".equals(name))         custom2=atts.getValue("content");
157            else if("custom3".equals(name))         custom3=atts.getValue("content");
158            else if("custom4".equals(name))         custom4=atts.getValue("content");
159            
160            }
161            // <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
162        public void endElement(String uri, String name, String qName) {
163            if(name.equalsIgnoreCase("script")) {
164                silent=silent.parent;
165            }
166            else if(name.equalsIgnoreCase("body")) {
167                silent=silent.parent;
168            }
169            
170            String c=current.toString().trim();
171            //if(name.equals("title"))print.out(c);
172            if(c.length()>0) {
173                if(name.equalsIgnoreCase("title"))title=c;
174                else {
175                    content.append(c);
176                    content.append('\n');
177                }
178                current=new StringBuffer();
179            }
180        }
181        
182        
183        /** 
184         * Geerbte Methode von org.xml.sax.ContentHandler, 
185         * wird bei durchparsen des XML, zum einlesen des Content eines Body Element aufgerufen.
186         * 
187         * @see org.xml.sax.ContentHandler#characters(char[], int, int)
188         */
189        public void characters (char ch[], int start, int length)   {
190           if(!silent.value)
191                    current.append(ch,start,length);
192        }
193    
194    
195        /**
196         * @return Returns the content.
197         */
198        public String getContent() {
199            return strContent;
200        }
201    
202        /**
203         * @return Returns the title.
204         */
205        public String getTitle() {
206            return title;
207        }
208    
209        /**
210         * @return Returns the charset.
211         */
212        public String getCharset() {
213            return charset;
214        }
215        /**
216         * @return Returns the summary
217         */
218        public String getSummary() {
219            return description;
220            
221        }
222    
223            /**
224             * @return the keywords
225             */
226            public String getKeywords() {
227                    return keywords;
228            }
229    
230            /**
231             * @return if keywords exists
232             */
233            public boolean hasKeywords() {
234                    return !StringUtil.isEmpty(keywords,true);
235            }
236    
237            /**
238             * @return the author
239             */
240            public String getAuthor() {
241                    return author;
242            }
243    
244            /**
245             * @return if author exists
246             */
247            public boolean hasAuthor() {
248                    return !StringUtil.isEmpty(author,true);
249            }
250            
251            public boolean hasCustom1() {
252                    return !StringUtil.isEmpty(custom1,true);
253            }
254            public boolean hasCustom2() {
255                    return !StringUtil.isEmpty(custom2,true);
256            }
257            public boolean hasCustom3() {
258                    return !StringUtil.isEmpty(custom3,true);
259            }
260            public boolean hasCustom4() {
261                    return !StringUtil.isEmpty(custom4,true);
262            }
263    
264            /**
265             * @return the custom1
266             */
267            public String getCustom1() {
268                    return custom1;
269            }
270            /**
271             * @return the custom2
272             */
273            public String getCustom2() {
274                    return custom2;
275            }
276            /**
277             * @return the custom3
278             */
279            public String getCustom3() {
280                    return custom3;
281            }
282            /**
283             * @return the custom4
284             */
285            public String getCustom4() {
286                    return custom4;
287            }
288            
289            
290        
291        /*public static void main(String[] args) throws Exception {
292            HTMLParser parser = new HTMLParser();
293            parser.parse(new File("C:\\projects\\jmuffin\\webroot\\cfmx\\jm\\test\\tags\\_tuv.htm"),null);
294            
295            //print.ln("title:"+parser.getTitle());
296            //print.ln(parser.getContent());
297            
298            parser.parse(new File("C:\\projects\\jmuffin\\webroot\\cfmx\\jm\\test\\tags\\_tuv.htm"),"UTF-8");
299            
300            //print.ln("title:"+parser.getTitle());
301            //print.ln(parser.getContent());
302        }*/
303        
304        
305        private class Silent {
306            Silent parent;
307            boolean value;
308            /**
309             * constructor of the class
310             * @param parent
311             * @param value
312             */
313            public Silent(Silent parent, boolean value) {
314                this.parent = parent;
315                this.value = value;
316            }
317            
318        }
319        
320    }
321    
322    
323    
324    
325    
326    
327    
328