source: greenstone3/trunk/src/java/org/greenstone/gsdl3/service/GS2LuceneRetrieve.java@ 20817

Last change on this file since 20817 was 20817, checked in by kjdon, 15 years ago

We no longer use sequential ids for sections, so we can use doc_id to get the section.

  • Property svn:keywords set to Author Date Id Revision
File size: 5.4 KB
Line 
1/*
2 * GS2LuceneRetrieve.java
3 * Copyright (C) 2005 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.service;
20
21// Greenstone classes
22import org.greenstone.gsdl3.core.GSException;
23import org.greenstone.gsdl3.util.GSFile;
24import org.greenstone.gsdl3.util.GSXML;
25import org.greenstone.gsdl3.util.DBInfo;
26import org.greenstone.gsdl3.util.GSHTML;
27import org.greenstone.gsdl3.util.OID;
28// XML classes
29import org.w3c.dom.Document;
30import org.w3c.dom.Element;
31import org.w3c.dom.Text;
32
33// General Java classes
34import java.io.File;
35
36import org.apache.log4j.Logger;
37
38/** Retrieve documents from a gs2 lucene collection. Note that this doesn't
39 actually use lucene, as the documents are stored in XML files */
40public class GS2LuceneRetrieve
41 extends AbstractGS2DocumentRetrieve
42{
43
44 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.GS2LuceneRetrieve.class.getName());
45
46
47 protected static final String DOC_LEVEL="Doc";
48 protected static final String SEC_LEVEL="Sec";
49 protected static final String ID_ATT = "gs2:docOID";
50
51 // Parameters used
52 private static final String LEVEL_PARAM = "level";
53
54 // Elements used in the config file that are specific to this class
55 private static final String DEFAULT_LEVEL_ELEM = "defaultLevel";
56
57 private String default_level = null;
58 private String text_dir = null;
59
60 private boolean text_available = true;
61
62 public GS2LuceneRetrieve() {
63 }
64
65 public void cleanUp() {
66 super.cleanUp();
67 }
68
69 /** configure this service */
70 public boolean configure(Element info, Element extra_info)
71 {
72 if (!super.configure(info, extra_info)){
73 return false;
74 }
75
76 // Do specific configuration
77 logger.info("Configuring GS2LuceneRetrieve...");
78
79 text_dir = GSFile.collectionIndexDir(this.site_home, this.cluster_name) + File.separatorChar+"text"+File.separatorChar;
80 if (!(new File(text_dir).isDirectory())) {
81 logger.error("Text directory "+text_dir+" does not exist, will be unable to retrieve text for "+cluster_name);
82 text_available = false;
83 return true; // return true so that we still get the other services for the collection
84 }
85 // Get the default level out of <defaultLevel> (buildConfig.xml)
86 Element def = (Element) GSXML.getChildByTagName(info, DEFAULT_LEVEL_ELEM);
87 if (def != null) {
88 this.default_level = def.getAttribute(GSXML.SHORTNAME_ATT);
89 }
90 if (this.default_level == null || this.default_level.equals("")) {
91 logger.error("Default level not specified for "+this.cluster_name+", assuming "+DOC_LEVEL);
92 this.default_level = DOC_LEVEL;
93 }
94
95 return true;
96
97 }
98
99 /** returns the content of a node
100 * should return a nodeContent element:
101 * <nodeContent>text content or other elements</nodeContent>
102 */
103 protected Element getNodeContent(String doc_id, String lang) throws GSException {
104 String [] args = new String[1];
105 args[0] = doc_id;
106 String doc_content = getTextString("TextRetrievalError", lang, args);
107 try {
108 if (!text_available) {
109 throw new Exception("No text directory available");
110 }
111
112 DBInfo info=this.coll_db.getInfo(OID.getTop(doc_id));
113 if (info == null) {
114 throw new Exception("Couldn't get database entry for "+OID.getTop(doc_id));
115 }
116
117 String archivedir=info.getInfo("archivedir");
118 File doc_xml_file = new File(text_dir+archivedir+File.separatorChar+"doc.xml");
119 if (!doc_xml_file.isFile()) {
120 throw new Exception("Doc XML file "+doc_xml_file.getPath()+" does not exist");
121 }
122 Document doc_xml_doc = this.converter.getDOM(doc_xml_file, "utf-8");
123 if (doc_xml_doc == null) {
124 throw new Exception("Couldn't parse file "+doc_xml_file.getPath());
125 }
126 Element full_document = doc_xml_doc.getDocumentElement();
127 if (full_document == null) {
128 throw new Exception("Couldn't parse file "+doc_xml_file.getPath());
129 }
130 Element current_section = null;
131 if (default_level.equals(DOC_LEVEL)) {
132 current_section = full_document;
133 } else {
134 current_section = GSXML.getNamedElement(full_document, SEC_LEVEL, ID_ATT, doc_id);
135 }
136 if (current_section == null) {
137 throw new Exception("Couldn't find section "+ doc_id+" in file "+doc_xml_file.getPath());
138 }
139 doc_content = GSXML.getNodeText(current_section);
140 if (doc_content == null) {
141 doc_content = "";
142 } else {
143 doc_content = resolveTextMacros(doc_content, doc_id, lang);
144 }
145 } catch (Exception e) {
146 logger.error("Error trying to get document text for "+doc_id+" in collection "+this.cluster_name+": "+e);
147 }
148
149 Element content_node = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
150 Text t = this.doc.createTextNode(doc_content);
151 content_node.appendChild(t);
152 return content_node;
153 }
154}
Note: See TracBrowser for help on using the repository browser.