001/**
002 *
003 * Copyright (c) 2014, the Railo Company Ltd. All rights reserved.
004 *
005 * This library is free software; you can redistribute it and/or
006 * modify it under the terms of the GNU Lesser General Public
007 * License as published by the Free Software Foundation; either 
008 * version 2.1 of the License, or (at your option) any later version.
009 * 
010 * This library is distributed in the hope that it will be useful,
011 * but WITHOUT ANY WARRANTY; without even the implied warranty of
012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013 * Lesser General Public License for more details.
014 * 
015 * You should have received a copy of the GNU Lesser General Public 
016 * License along with this library.  If not, see <http://www.gnu.org/licenses/>.
017 * 
018 **/
019package lucee.commons.lang;
020
021import java.net.MalformedURLException;
022import java.net.URL;
023import java.util.ArrayList;
024import java.util.List;
025
026import lucee.commons.io.CharsetUtil;
027import lucee.commons.net.HTTPUtil;
028import lucee.transformer.util.CFMLString;
029
030/**
031 * HTML Util class
032 *
033 */
034public final class HTMLUtil {
035        
036        private final Tag[] tags=new Tag[]{
037                        new Tag("a","href"),
038                        new Tag("link","href"),
039                        new Tag("form","action"),
040                        new Tag("applet","code"),
041                        new Tag("script","src"),
042                        new Tag("body","background"),
043                        new Tag("frame","src"),
044                        new Tag("bgsound","src"),
045                        new Tag("img","src"),
046                        
047                        new Tag("embed",new String[]{"src","pluginspace"}),
048                        new Tag("object",new String[]{"data","classid","codebase","usemap"})
049                        
050        };
051        
052        
053        /**
054         * returns all urls in a html String
055         * @param html HTML String to search urls
056         * @param url Absolute URL path to set
057         * @return urls found in html String
058         */
059        public List getURLS(String html, URL url) {
060                
061            ArrayList urls=new ArrayList();
062                CFMLString cfml=new CFMLString(html,CharsetUtil.UTF8);
063                while(!cfml.isAfterLast()) {
064                        if(cfml.forwardIfCurrent('<')) {
065                                for(int i=0;i<tags.length;i++) {
066                                        if(cfml.forwardIfCurrent(tags[i].tag+" ")) {
067                                                getSingleUrl(urls,cfml,tags[i],url);
068                                        }
069                                }
070                        }
071                        else {
072                                cfml.next();
073                        }
074                        
075                }
076                return urls;
077        }
078        
079        /**
080         * transform a single tag
081         * @param urls all urls founded
082         * @param cfml CFMl String Object containing plain HTML
083         * @param tag current tag totransform
084         * @param url absolute URL to Set at tag attribute
085         */
086        private void getSingleUrl(List urls,CFMLString cfml, Tag tag,URL url) {
087                char quote=0;
088                boolean inside=false;
089                StringBuilder value=new StringBuilder();
090                
091                while(!cfml.isAfterLast()) {
092                        if(inside) {
093                                if(quote!=0 && cfml.forwardIfCurrent(quote)) {
094                                        inside=false;
095                                        
096                                        add(urls,url,value.toString());
097                                }
098                                else if(quote==0 && (cfml.isCurrent(' ')||cfml.isCurrent("/>")||cfml.isCurrent('>')||cfml.isCurrent('\t')||cfml.isCurrent('\n'))) {
099                                        inside=false;
100                                        try {
101                                                urls.add(new URL(url,value.toString()));
102                    } catch (MalformedURLException e) {}
103                                        cfml.next();
104                                } 
105                                else {
106                                        value.append(cfml.getCurrent());
107                                        cfml.next();
108                                }
109                        }
110                        else if(cfml.forwardIfCurrent('>')) {
111                                break;
112                        }
113                        else {
114                                
115                                for(int i=0;i<tag.attributes.length;i++) {
116                                        if(cfml.forwardIfCurrent(tag.attributes[i])) {
117                                                cfml.removeSpace();
118                                                // =
119                                                if(cfml.isCurrent('=')) {
120                                                        inside=true;
121                                                        cfml.next();
122                                                        cfml.removeSpace();
123                                                        
124                                                        quote=cfml.getCurrent();
125                                                        value=new StringBuilder();
126                                                        if(quote!='"' && quote!='\'')quote=0;
127                                                        else {
128                                                                cfml.next();
129                                                        }
130                                                }
131                                        }
132                                }
133                                if(!inside) {
134                                        cfml.next();
135                                }
136                        }
137                }
138        }
139
140    private void add(List list,URL baseURL,String value) {
141                value=value.trim();
142                String lcValue=value.toLowerCase();
143                try {
144                        if(lcValue.startsWith("http://") || lcValue.startsWith("news://") || lcValue.startsWith("goopher://") || lcValue.startsWith("javascript:"))
145                                list.add(HTTPUtil.toURL(value,true));
146                        else {
147                                
148                                
149                                list.add(new URL(baseURL,value.toString()));
150                        }
151                }
152                catch(MalformedURLException mue) {}
153                //print.err(list.get(list.size()-1));
154        }
155
156        private class Tag {
157                private String tag;
158                private String[] attributes;
159                private Tag(String tag,String[] attributes) {
160                        this.tag=tag.toLowerCase();
161                        this.attributes=new String[attributes.length];
162                        for(int i=0;i<attributes.length;i++) {
163                                this.attributes[i]=attributes[i].toLowerCase();
164                        }
165                        
166                }
167                private Tag(String tag,String attribute1) {
168                        this.tag=tag.toLowerCase();
169                        this.attributes=new String[]{attribute1.toLowerCase()};
170                }
171        
172        }
173}