/**
 *
 * Copyright (c) 2014, the Railo Company Ltd. All rights reserved.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package lucee.runtime.search.lucene2.docs;

import java.io.Reader;
import java.io.StringReader;

import lucee.commons.io.IOUtil;
import lucee.commons.io.res.Resource;
import lucee.commons.lang.ExceptionUtil;
import lucee.commons.lang.StringUtil;
import lucee.runtime.op.Caster;
import lucee.runtime.search.lucene2.html.HTMLParser;

import org.apache.lucene.document.DateField;
import org.apache.lucene.document.Document;

/**
 * A utility for making Lucene Documents for HTML documents.
 * All members are static; the class cannot be instantiated.
 */
public final class HTMLDocument {

    private static final char FILE_SEPARATOR = System.getProperty("file.separator").charAt(0);

    /** Separator used inside a uid in place of the path separator (the NUL character). */
    private static final char UID_SEPARATOR = '\u0000';

    /** Maximum length, in chars, of a summary derived from the page content. */
    private static final int SUMMARY_MAX_LENGTH = 200;

    private HTMLDocument() {}

    /**
     * Builds the uid of a resource: its path with every file separator replaced
     * by a NUL character, followed by one more NUL character and the
     * last-modified time encoded with {@link DateField#timeToString(long)}.
     *
     * @param f resource to build the uid for
     * @return the uid string
     */
    public static String uid(Resource f) {
        String path = f.getPath().replace(FILE_SEPARATOR, UID_SEPARATOR);
        return path + "\u0000" + DateField.timeToString(f.lastModified());
    }

    /**
     * Converts a uid back to a slash-separated path by turning every NUL
     * character into a slash and dropping the trailing date segment.
     *
     * @param uid uid as produced by {@link #uid(Resource)}
     * @return the path part of the uid; when the uid contains no separator at
     *         all there is no date segment to strip and the converted string is
     *         returned unchanged
     */
    public static String uid2url(String uid) {
        String url = uid.replace(UID_SEPARATOR, '/');
        int dateStart = url.lastIndexOf('/');
        if (dateStart < 0) {
            // No separator: substring(0, -1) would throw, so return the value as is.
            return url;
        }
        return url.substring(0, dateStart);
    }

    /**
     * Creates a document for an HTML resource. The document always carries the
     * "uid" field; the parsed content fields are added only when parsing
     * succeeds.
     *
     * @param res     HTML resource to index
     * @param charset charset handed to the parser
     * @return the document; it holds only the uid when parsing failed
     */
    public static Document getDocument(Resource res, String charset) {
        Document doc = new Document();
        doc.add(FieldUtil.Text("uid", uid(res), false));

        HTMLParser parser = new HTMLParser();
        try {
            parser.parse(res, charset);
        }
        catch (Throwable t) {
            // Best effort: unless rethrowIfNecessary propagates the throwable,
            // the failure is ignored and the partially filled document is returned.
            ExceptionUtil.rethrowIfNecessary(t);
            return doc;
        }
        addContent(doc, parser);
        return doc;
    }

    /**
     * Creates a document from HTML read from a reader. The complete text is
     * read first, its length is stored in the "size" field, and the text is
     * then parsed.
     *
     * @param content optional buffer that receives the raw text read from
     *                {@code reader}; may be {@code null}
     * @param reader  source of the HTML text
     * @return the document; the parsed content fields are missing when reading
     *         or parsing failed
     */
    public static Document getDocument(StringBuffer content, Reader reader) {
        Document doc = new Document();

        HTMLParser parser = new HTMLParser();
        try {
            String str = IOUtil.toString(reader);
            if (content != null) {
                content.append(str);
            }
            doc.add(FieldUtil.UnIndexed("size", Caster.toString(str.length())));
            parser.parse(new StringReader(str));
        }
        catch (Throwable t) {
            // Best effort: see getDocument(Resource, String).
            ExceptionUtil.rethrowIfNecessary(t);
            return doc;
        }

        addContent(doc, parser);
        return doc;
    }

    /**
     * Copies the values extracted by the parser into the document: mime type,
     * title, summary, raw content, content and, when present, keywords, author
     * and the four custom values.
     *
     * @param doc    document to fill
     * @param parser parser that has already parsed the HTML
     */
    private static void addContent(Document doc, HTMLParser parser) {
        FieldUtil.setMimeType(doc, "text/html");

        String content = parser.getContent();

        FieldUtil.setTitle(doc, parser.getTitle());

        String summary = parser.getSummary();
        if (StringUtil.isEmpty(summary)) {
            // No summary in the page: fall back to the beginning of the content.
            FieldUtil.setSummary(doc, leadingText(content, SUMMARY_MAX_LENGTH), false);
        }
        else {
            FieldUtil.setSummary(doc, summary, true);
        }
        FieldUtil.setRaw(doc, content);
        FieldUtil.setContent(doc, content);

        if (parser.hasKeywords()) {
            FieldUtil.setKeywords(doc, parser.getKeywords());
        }
        if (parser.hasAuthor()) {
            FieldUtil.setAuthor(doc, parser.getAuthor());
        }
        if (parser.hasCustom1()) {
            FieldUtil.setCustom(doc, parser.getCustom1(), 1);
        }
        if (parser.hasCustom2()) {
            FieldUtil.setCustom(doc, parser.getCustom2(), 2);
        }
        if (parser.hasCustom3()) {
            FieldUtil.setCustom(doc, parser.getCustom3(), 3);
        }
        if (parser.hasCustom4()) {
            FieldUtil.setCustom(doc, parser.getCustom4(), 4);
        }
    }

    /**
     * Returns at most {@code max} leading chars of {@code text} without
     * cutting a UTF-16 surrogate pair in half.
     *
     * @param text text to shorten; must not be {@code null}
     * @param max  maximum number of chars to keep; must be positive
     * @return {@code text} itself when it is short enough, otherwise its prefix
     */
    private static String leadingText(String text, int max) {
        if (text.length() <= max) {
            return text;
        }
        int end = max;
        // text.length() > max, so charAt(end) is in range.
        if (Character.isHighSurrogate(text.charAt(end - 1)) && Character.isLowSurrogate(text.charAt(end))) {
            // Back off one char so the result does not end in a lone high surrogate.
            end--;
        }
        return text.substring(0, end);
    }
}