001/**
002 *
003 * Copyright (c) 2014, the Railo Company Ltd. All rights reserved.
004 *
005 * This library is free software; you can redistribute it and/or
006 * modify it under the terms of the GNU Lesser General Public
007 * License as published by the Free Software Foundation; either 
008 * version 2.1 of the License, or (at your option) any later version.
009 * 
010 * This library is distributed in the hope that it will be useful,
011 * but WITHOUT ANY WARRANTY; without even the implied warranty of
012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013 * Lesser General Public License for more details.
014 * 
015 * You should have received a copy of the GNU Lesser General Public 
016 * License along with this library.  If not, see <http://www.gnu.org/licenses/>.
017 * 
018 **/
019package lucee.runtime.search.lucene2.docs;
020
021import java.io.Reader;
022import java.io.StringReader;
023
024import lucee.commons.io.IOUtil;
025import lucee.commons.io.res.Resource;
026import lucee.commons.lang.ExceptionUtil;
027import lucee.commons.lang.StringUtil;
028import lucee.runtime.op.Caster;
029import lucee.runtime.search.lucene2.html.HTMLParser;
030
031import org.apache.lucene.document.DateField;
032import org.apache.lucene.document.Document;
033
034/** A utility for making Lucene Documents for HTML documents. */
035
036public final class HTMLDocument {
037    private static final char FILE_SEPARATOR = System.getProperty("file.separator").charAt(0);
038  
039  public static String uid(Resource f) {
040    return f.getPath().replace(FILE_SEPARATOR, '\u0000') +
041      "\u0000" +
042      DateField.timeToString(f.lastModified());
043  }
044
045  public static String uid2url(String uid) {
046    String url = uid.replace('\u0000', '/');      // replace nulls with slashes
047    return url.substring(0, url.lastIndexOf('/')); // remove date from end
048  }
049  
050  public static Document getDocument(Resource res,String charset)  {
051    Document doc = new Document();
052    doc.add(FieldUtil.Text("uid", uid(res), false));
053    
054    HTMLParser parser = new HTMLParser();
055    try {
056        parser.parse(res,charset);
057    } 
058    catch (Throwable t) {
059                ExceptionUtil.rethrowIfNecessary(t);
060        return doc;
061    }
062    addContent(doc,parser);
063    return doc;
064  }
065
066  public static Document getDocument(StringBuffer content, Reader reader) {
067      Document doc = new Document();
068      
069      HTMLParser parser = new HTMLParser();
070      try {
071          String str = IOUtil.toString(reader);
072          if(content!=null)content.append(str);
073          doc.add(FieldUtil.UnIndexed("size", Caster.toString(str.length())));
074          StringReader sr = new StringReader(str);
075          parser.parse(sr);
076      } 
077      catch (Throwable t) {
078                        ExceptionUtil.rethrowIfNecessary(t);
079          //t.printStackTrace();
080          return doc;
081      }
082      
083      addContent(doc, parser);
084      return doc;
085  }
086  
087        private static void addContent(Document doc, HTMLParser parser) {
088            
089                FieldUtil.setMimeType(doc,"text/html");
090            //doc.add(FieldUtil.UnIndexed("mime-type", "text/html"));
091            
092            String content = parser.getContent();
093
094            FieldUtil.setTitle(doc,parser.getTitle());
095            
096            String summary = parser.getSummary();
097            if(StringUtil.isEmpty(summary)){
098                summary=(content.length()<=200)? content:content.substring(0,200);
099                    FieldUtil.setSummary(doc,summary,false);
100            }
101            else{
102                FieldUtil.setSummary(doc,summary,true);
103            }
104            FieldUtil.setRaw(doc,content);
105            FieldUtil.setContent(doc,content);
106            
107            //doc.add(FieldUtil.UnIndexed("charset", StringUtil.valueOf(parser.getCharset())));
108            
109            if(parser.hasKeywords()) {
110                FieldUtil.setKeywords(doc,parser.getKeywords());
111            }
112            
113
114            if(parser.hasAuthor()){
115                FieldUtil.setAuthor(doc,parser.getAuthor());
116            }
117            if(parser.hasCustom1()){
118                FieldUtil.setCustom(doc,parser.getCustom1(),1);
119            }
120            if(parser.hasCustom2()){
121                FieldUtil.setCustom(doc,parser.getCustom2(),2);
122            }
123            if(parser.hasCustom3()){
124                FieldUtil.setCustom(doc,parser.getCustom3(),3);
125            }
126            if(parser.hasCustom4()){
127                FieldUtil.setCustom(doc,parser.getCustom4(),4);
128            }
129            
130            
131    
132}
133
134  private HTMLDocument() {}
135}
136