001    package railo.runtime.search.lucene2.docs;
002    
003    import java.io.Reader;
004    import java.io.StringReader;
005    
006    import org.apache.lucene.document.DateField;
007    import org.apache.lucene.document.Document;
008    
009    import railo.commons.io.IOUtil;
010    import railo.commons.io.res.Resource;
011    import railo.commons.lang.StringUtil;
012    import railo.runtime.op.Caster;
013    import railo.runtime.search.lucene2.html.HTMLParser;
014    
015    /** A utility for making Lucene Documents for HTML documents. */
016    
017    public final class HTMLDocument {
018        private static final char FILE_SEPARATOR = System.getProperty("file.separator").charAt(0);
019      
020      public static String uid(Resource f) {
021        return f.getPath().replace(FILE_SEPARATOR, '\u0000') +
022          "\u0000" +
023          DateField.timeToString(f.lastModified());
024      }
025    
026      public static String uid2url(String uid) {
027        String url = uid.replace('\u0000', '/');      // replace nulls with slashes
028        return url.substring(0, url.lastIndexOf('/')); // remove date from end
029      }
030      
031      public static Document getDocument(Resource res,String charset)  {
032        Document doc = new Document();
033        doc.add(FieldUtil.Text("uid", uid(res), false));
034        
035        HTMLParser parser = new HTMLParser();
036        try {
037            parser.parse(res,charset);
038        } 
039        catch (Throwable t) {
040            return doc;
041        }
042        addContent(doc,parser);
043        return doc;
044      }
045    
046      public static Document getDocument(StringBuffer content, Reader reader) {
047          Document doc = new Document();
048          
049          HTMLParser parser = new HTMLParser();
050          try {
051              String str = IOUtil.toString(reader);
052              if(content!=null)content.append(str);
053              doc.add(FieldUtil.UnIndexed("size", Caster.toString(str.length())));
054              StringReader sr = new StringReader(str);
055              parser.parse(sr);
056          } 
057          catch (Throwable t) {
058              //t.printStackTrace();
059              return doc;
060          }
061          
062          addContent(doc, parser);
063          return doc;
064      }
065      
066            private static void addContent(Document doc, HTMLParser parser) {
067                
068                    FieldUtil.setMimeType(doc,"text/html");
069                //doc.add(FieldUtil.UnIndexed("mime-type", "text/html"));
070                
071                String content = parser.getContent();
072    
073                FieldUtil.setTitle(doc,parser.getTitle());
074                
075                String summary = parser.getSummary();
076                if(StringUtil.isEmpty(summary)){
077                    summary=(content.length()<=200)? content:content.substring(0,200);
078                        FieldUtil.setSummary(doc,summary,false);
079                }
080                else{
081                    FieldUtil.setSummary(doc,summary,true);
082                }
083                FieldUtil.setRaw(doc,content);
084                FieldUtil.setContent(doc,content);
085                
086                //doc.add(FieldUtil.UnIndexed("charset", StringUtil.valueOf(parser.getCharset())));
087                
088                if(parser.hasKeywords()) {
089                    FieldUtil.setKeywords(doc,parser.getKeywords());
090                }
091                
092    
093                if(parser.hasAuthor()){
094                    FieldUtil.setAuthor(doc,parser.getAuthor());
095                }
096                if(parser.hasCustom1()){
097                    FieldUtil.setCustom(doc,parser.getCustom1(),1);
098                }
099                if(parser.hasCustom2()){
100                    FieldUtil.setCustom(doc,parser.getCustom2(),2);
101                }
102                if(parser.hasCustom3()){
103                    FieldUtil.setCustom(doc,parser.getCustom3(),3);
104                }
105                if(parser.hasCustom4()){
106                    FieldUtil.setCustom(doc,parser.getCustom4(),4);
107                }
108                
109                
110        
111    }
112    
113      private HTMLDocument() {}
114    }
115