source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/AbstractGS2DocumentRetrieve.java@ 24254

Last change on this file since 24254 was 24254, checked in by ak19, 13 years ago

Commits for ticket 770, concerning the display of multiple values for a metadata field (like dc.Title) when documents are classified by that metadata. Previously, when the user browsed by dc.Title, a doc was listed once for each dc.Title assigned to it, but every listing showed the same (first retrieved) dc.Title. Now the doc is listed once for each dc.Title assigned to it, with a different dc.Title value shown each time.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.2 KB
Line 
1/*
2 * AbstractGS2DocumentRetrieve.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.service;
20
21// Greenstone classes
22import org.greenstone.gsdl3.core.GSException;
23import org.greenstone.gsdl3.util.GSXML;
24import org.greenstone.gsdl3.util.GSFile;
25import org.greenstone.gsdl3.util.OID;
26import org.greenstone.gsdl3.util.MacroResolver;
27import org.greenstone.gsdl3.util.GS2MacroResolver;
28import org.greenstone.gsdl3.util.GSConstants;
29import org.greenstone.gsdl3.util.SimpleCollectionDatabase;
30import org.greenstone.gsdl3.util.DBInfo;
31// XML classes
32import org.w3c.dom.Document;
33import org.w3c.dom.Element;
34import org.w3c.dom.NodeList;
35
36// General Java classes
37import java.io.File;
38import java.util.StringTokenizer;
39import java.util.Vector;
40import java.util.Set;
41import java.util.Iterator;
42import java.util.ArrayList;
43import java.util.regex.Matcher;
44import java.util.regex.Pattern;
45
46import org.apache.log4j.*;
47
48// Apache Commons
49import org.apache.commons.lang3.*;
50
51/** Implements the generic retrieval and classifier services for GS2
52 * collections.
53 *
54 * @author Katherine Don
55 * @author Michael Dewsnip
56 */
57
58public abstract class AbstractGS2DocumentRetrieve
59 extends AbstractDocumentRetrieve {
60
61 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.AbstractGS2DocumentRetrieve.class.getName());
62
63 // protected static final String EXTLINK_PARAM = "ext"; here or in base??
64 protected String index_stem = null;
65
66 protected SimpleCollectionDatabase coll_db = null;
67
68
69 /** constructor */
70 protected AbstractGS2DocumentRetrieve()
71 {
72 this.macro_resolver = new GS2MacroResolver();
73 }
74
75 public void cleanUp() {
76 super.cleanUp();
77 this.coll_db.closeDatabase();
78 }
79 /** configure this service */
80 public boolean configure(Element info, Element extra_info)
81 {
82 if (!super.configure(info, extra_info)){
83 return false;
84 }
85
86 logger.info("Configuring AbstractGS2DocumentRetrieve...");
87 //this.config_info = info;
88
89 // the index stem is either specified in the config file or is the collection name
90 Element index_stem_elem = (Element) GSXML.getChildByTagName(info, GSXML.INDEX_STEM_ELEM);
91 if (index_stem_elem != null) {
92 this.index_stem = index_stem_elem.getAttribute(GSXML.NAME_ATT);
93 }
94 if (this.index_stem == null || this.index_stem.equals("")) {
95 logger.error("AbstractGS2DocumentRetrieve.configure(): indexStem element not found, stem will default to collection name");
96 this.index_stem = this.cluster_name;
97 }
98
99 // find out what kind of database we have
100 Element database_type_elem = (Element) GSXML.getChildByTagName(info, GSXML.DATABASE_TYPE_ELEM);
101 String database_type = null;
102 if (database_type_elem != null) {
103 database_type = database_type_elem.getAttribute(GSXML.NAME_ATT);
104 }
105 if (database_type == null || database_type.equals("")) {
106 database_type = "gdbm"; // the default
107 }
108 coll_db = new SimpleCollectionDatabase(database_type);
109 if (!coll_db.databaseOK()) {
110 logger.error("Couldn't create the collection database of type "+database_type);
111 return false;
112 }
113
114 // Open database for querying
115 String coll_db_file = GSFile.collectionDatabaseFile(this.site_home, this.cluster_name, this.index_stem, database_type);
116 if (!this.coll_db.openDatabase(coll_db_file, SimpleCollectionDatabase.READ)) {
117 logger.error("Could not open collection database!");
118 return false;
119 }
120
121 // we need to set the database for our GS2 macro resolver
122 GS2MacroResolver gs2_macro_resolver = (GS2MacroResolver)this.macro_resolver;
123 gs2_macro_resolver.setDB(this.coll_db);
124
125 return true;
126 }
127
128 /** if id ends in .fc, .pc etc, then translate it to the correct id */
129 protected String translateId(String node_id) {
130 return OID.translateOID(this.coll_db, node_id); //return this.coll_db.translateOID(node_id);
131 }
132
133 /** if an id is not a greenstone id (an external id) then translate
134 it to a greenstone one*/
135 protected String translateExternalId(String node_id){
136 return this.coll_db.externalId2OID(node_id);
137 }
138
139 /** returns the id of the root node of the document containing node node_id. . may be the same as node_id */
140 protected String getRootId(String node_id) {
141 return OID.getTop(node_id);
142 }
143 /** returns a list of the child ids in order, null if no children */
144 protected ArrayList getChildrenIds(String node_id) {
145 DBInfo info = this.coll_db.getInfo(node_id);
146 if (info == null) {
147 return null;
148 }
149
150 String contains = info.getInfo("contains");
151 if (contains.equals("")) {
152 return null;
153 }
154 ArrayList children = new ArrayList();
155 StringTokenizer st = new StringTokenizer(contains, ";");
156 while (st.hasMoreTokens()) {
157 String child_id = StringUtils.replace(st.nextToken(), "\"", node_id);
158 children.add(child_id);
159 }
160 return children;
161
162 }
163 /** returns the node id of the parent node, null if no parent */
164 protected String getParentId(String node_id){
165 String parent = OID.getParent(node_id);
166 if (parent.equals(node_id)) {
167 return null;
168 }
169 return parent;
170 }
171
172 /** get the metadata for the classifier node node_id
173 * returns a metadataList element:
174 * <metadataList><metadata name="xxx">value</metadata></metadataList>
175 */
176 // assumes only one value per metadata
177 protected Element getMetadataList(String node_id, boolean all_metadata,
178 ArrayList metadata_names)
179 throws GSException {
180 Element metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
181 DBInfo info = this.coll_db.getInfo(node_id);
182 if (info == null) {
183 return null;
184 }
185 String lang = "en"; // why do we need this??
186 if (all_metadata) {
187 // return everything out of the database
188 Set keys = info.getKeys();
189 Iterator it = keys.iterator();
190 while(it.hasNext()) {
191 String key = (String)it.next();
192 //String value = info.getInfo(key);
193 Vector values = info.getMultiInfo(key);
194 for(int i=0; i<values.size(); i++) {
195 GSXML.addMetadata(this.doc, metadata_list, key, this.macro_resolver.resolve((String)values.elementAt(i), lang, MacroResolver.SCOPE_META, node_id));
196 }
197 }
198
199 } else {
200 // prepare regex to work with mdoffset: looking for <offset\d*_>
201 Pattern pattern = Pattern.compile("offset[0-9]*" + GSConstants.META_RELATION_SEP);
202
203 for (int i=0; i<metadata_names.size(); i++) {
204 String meta_name = (String) metadata_names.get(i);
205 String value = getMetadata(node_id, info, meta_name, lang);
206
207 // Remove the occurrence (if any) in this metaname of the mdoffset number in the pattern <offset\d*_>
208 // Leaving string "offset" in at this point: it will be handled in config_format.xsl's gsf:metadata template match
209 Matcher matcher = pattern.matcher(meta_name);
210 meta_name = matcher.replaceFirst("offset" + GSConstants.META_RELATION_SEP);
211 //replaceFirst(""); // if removing the occurrence (if any) of entire pattern <offset\d*_> in input
212
213 GSXML.addMetadata(this.doc, metadata_list, meta_name, value);
214 }
215 }
216 return metadata_list;
217 }
218
219 /** returns the structural information asked for.
220 * info_type may be one of
221 * INFO_NUM_SIBS, INFO_NUM_CHILDREN, INFO_SIB_POS
222 */
223 protected String getStructureInfo(String doc_id, String info_type) {
224 String value="";
225 if (info_type.equals(INFO_NUM_SIBS)) {
226 String parent_id = OID.getParent(doc_id);
227 if (parent_id.equals(doc_id)) {
228 value="0";
229 } else {
230 value = String.valueOf(getNumChildren(parent_id));
231 }
232 return value;
233 }
234
235 if (info_type.equals(INFO_NUM_CHILDREN)) {
236 return String.valueOf(getNumChildren(doc_id));
237 }
238
239
240 if (info_type.equals(INFO_SIB_POS)) {
241 String parent_id = OID.getParent(doc_id);
242 if (parent_id.equals(doc_id)) {
243 return "-1";
244 }
245
246 DBInfo info = this.coll_db.getInfo(parent_id);
247 if (info==null) {
248 return "-1";
249 }
250
251 String contains = info.getInfo("contains");
252 contains = StringUtils.replace(contains, "\"", parent_id);
253 String [] children = contains.split(";");
254 for (int i=0;i<children.length;i++) {
255 String child_id = children[i];
256 if (child_id.equals(doc_id)) {
257 return String.valueOf(i+1); // make it from 1 to length
258
259 }
260 }
261
262 return "-1";
263 } else {
264 return null;
265 }
266
267 }
268
269 protected int getNumChildren(String node_id) {
270 DBInfo info = this.coll_db.getInfo(node_id);
271 if (info == null) {
272 return 0;
273 }
274 String contains = info.getInfo("contains");
275 if (contains.equals("")) {
276 return 0;
277 }
278 String [] children = contains.split(";");
279 return children.length;
280 }
281
282 /** returns the document type of the doc that the specified node
283 belongs to. should be one of
284 GSXML.DOC_TYPE_SIMPLE,
285 GSXML.DOC_TYPE_PAGED,
286 GSXML.DOC_TYPE_HIERARCHY
287 */
288 protected String getDocType(String node_id) {
289 DBInfo info = this.coll_db.getInfo(node_id);
290 if (info == null) {
291 return GSXML.DOC_TYPE_SIMPLE;
292 }
293 String doc_type = info.getInfo("doctype");
294 if (!doc_type.equals("")&&!doc_type.equals("doc")) {
295 return doc_type;
296 }
297
298 String top_id = OID.getTop(node_id);
299 boolean is_top = (top_id.equals(node_id) ? true : false);
300
301 String children = info.getInfo("contains");
302 boolean is_leaf = (children.equals("") ? true : false);
303
304 if (is_top && is_leaf) { // a single section document
305 return GSXML.DOC_TYPE_SIMPLE;
306 }
307
308 // now we just check the top node
309 if (!is_top) { // we need to look at the top info
310 info = this.coll_db.getInfo(top_id);
311 }
312 if (info == null) {
313 return GSXML.DOC_TYPE_HIERARCHY;
314 }
315
316 String childtype = info.getInfo("childtype");
317 if (childtype.equals("Paged")) {
318 return GSXML.DOC_TYPE_PAGED;
319 }
320 return GSXML.DOC_TYPE_HIERARCHY;
321 }
322
323 /** returns the content of a node
324 * should return a nodeContent element:
325 * <nodeContent>text content or other elements</nodeContent>
326 */
327 abstract protected Element getNodeContent(String doc_id, String lang) throws GSException;
328
329 protected String getMetadata(String node_id, DBInfo info,
330 String metadata, String lang) {
331 String multiple = "false"; // multiple can now be "true", "false" or "offset<number>". It's no longer a boolean
332 String relation = "";
333 String separator = ", ";
334 int pos = metadata.indexOf(GSConstants.META_RELATION_SEP);
335 if (pos ==-1) {
336 Vector values = info.getMultiInfo(metadata);
337 if (values !=null){
338 // just a plain meta entry eg dc.Title
339 StringBuffer result = new StringBuffer();
340 boolean first = true;
341 for (int i=0; i<values.size(); i++) {
342 if (first) {
343 first = false;
344 } else {
345 result.append(separator);
346 }
347 result.append(this.macro_resolver.resolve((String)values.elementAt(i), lang, MacroResolver.SCOPE_META, node_id));
348 }
349 return result.toString();
350 }
351 else{
352 String result = info.getInfo(metadata);
353 return this.macro_resolver.resolve(result, lang, MacroResolver.SCOPE_META, node_id);
354 }
355 }
356
357 String temp = metadata.substring(0, pos);
358 metadata = metadata.substring(pos+1);
359 // check for all on the front
360 if (temp.equals("all") || temp.startsWith("offset")) { // multiple can now be "true", "false" or "offset"
361 multiple = temp; // multiple=true;
362 pos = metadata.indexOf(GSConstants.META_RELATION_SEP);
363 if (pos ==-1) {
364 temp = "";
365 } else {
366 temp = metadata.substring(0, pos);
367 metadata = metadata.substring(pos+1);
368 }
369 }
370
371 // now check for relational info
372 if (temp.equals("parent") || temp.equals("root") || temp.equals( "ancestors")) { // "current" "siblings" "children" "descendants"
373 relation = temp;
374 pos = metadata.indexOf(GSConstants.META_RELATION_SEP);
375 if (pos == -1) {
376 temp = "";
377 } else {
378 temp = metadata.substring(0, pos);
379 metadata = metadata.substring(pos+1);
380 }
381 }
382
383 // now look for separator info
384 if (temp.startsWith(GSConstants.META_SEPARATOR_SEP) && temp.endsWith(GSConstants.META_SEPARATOR_SEP)) {
385 separator = temp.substring(1, temp.length()-1);
386
387 }
388
389 String relation_id = node_id;
390 if (relation.equals("parent") || relation.equals("ancestors")) {
391 relation_id = OID.getParent(node_id);
392 // parent or ancestor does not include self
393 if (relation_id.equals(node_id)){
394 return "";
395 }
396 } else if (relation.equals("root")) {
397 relation_id = OID.getTop(node_id);
398 }
399
400 // now we either have a single node, or we have ancestors
401 DBInfo relation_info;
402 if (relation_id.equals(node_id)) {
403 relation_info = info;
404 } else {
405 relation_info = this.coll_db.getInfo(relation_id);
406 }
407 if (relation_info == null) {
408 return "";
409 }
410
411 StringBuffer result = new StringBuffer();
412
413 if (multiple.equals("false")) {
414 result.append(this.macro_resolver.resolve(relation_info.getInfo(metadata), lang, MacroResolver.SCOPE_META, relation_id));
415 } else if(multiple.startsWith("offset")) { // multiple = offset
416 String offset = multiple.substring("offset".length(), multiple.length());
417 int offsetVal = offset.equals("") ? 0 : Integer.parseInt(offset);
418 String value = relation_info.getInfoOffset(metadata, offsetVal); // what if this metadata is not the one we need to get the offset for? MDTYPE!
419 // at the moment, do we assume the user will specify retrieving the offset only for such metadata as has an offset?
420 // At least, getInfoOffset will return the firstelement if the offset exceeds the bounds of the values for this metadata key
421
422 result.append(this.macro_resolver.resolve(value, lang, MacroResolver.SCOPE_META, relation_id));
423
424 } else { // multiple = true, we have multiple meta
425 Vector values = relation_info.getMultiInfo(metadata);
426 if (values != null) {
427 boolean first = true;
428 for (int i=0; i<values.size(); i++) {
429 if (first) {
430 first = false;
431 } else {
432 result.append(separator);
433 }
434 result.append(this.macro_resolver.resolve((String)values.elementAt(i), lang, MacroResolver.SCOPE_META, relation_id));
435 }
436 }
437 logger.info(result);
438 }
439 // if not ancestors, then this is all we do
440 if (!relation.equals("ancestors")) {
441 return result.toString();
442 }
443
444 // now do the ancestors
445 String current_id = relation_id;
446 relation_id = OID.getParent(current_id);
447 while (!relation_id.equals(current_id)) {
448 relation_info = this.coll_db.getInfo(relation_id);
449 if (relation_info == null) return result.toString();
450 if (multiple.equals("false")) { //if (!multiple)
451 result.insert(0, separator);
452 result.insert(0, this.macro_resolver.resolve(relation_info.getInfo(metadata), lang, MacroResolver.SCOPE_META, relation_id));
453 } else {
454 Vector values = relation_info.getMultiInfo(metadata);
455 if (values != null) {
456 for (int i=values.size()-1; i>=0; i--) {
457 result.insert(0, separator);
458 result.insert(0, this.macro_resolver.resolve((String)values.elementAt(i), lang, MacroResolver.SCOPE_META, relation_id));
459 }
460 }
461
462 }
463 current_id = relation_id;
464 relation_id = OID.getParent(current_id);
465 }
466 return result.toString();
467 }
468
469
470 /** needs to get info from collection database - if the calling code gets it already it may pay to pass it in instead */
471 protected String resolveTextMacros(String doc_content, String doc_id, String lang)
472 {
473 // resolve any collection specific macros
474 doc_content = macro_resolver.resolve(doc_content, lang, MacroResolver.SCOPE_TEXT, doc_id);
475 return doc_content;
476 }
477
478 protected Element getInfo(String doc_id, String info_type) {
479
480 String value="";
481 if (info_type.equals(INFO_NUM_SIBS)) {
482 String parent_id = OID.getParent(doc_id);
483 if (parent_id.equals(doc_id)) {
484 value="0";
485 } else {
486 value = String.valueOf(getNumChildren(parent_id));
487 }
488 } else if (info_type.equals(INFO_NUM_CHILDREN)) {
489 value = String.valueOf(getNumChildren(doc_id));
490 } else if (info_type.equals(INFO_SIB_POS)) {
491 String parent_id = OID.getParent(doc_id);
492 if (parent_id.equals(doc_id)) {
493 value="-1";
494 } else {
495 DBInfo info = this.coll_db.getInfo(parent_id);
496 if (info==null) {
497 value ="-1";
498 } else {
499 String contains = info.getInfo("contains");
500 contains = StringUtils.replace(contains, "\"", parent_id);
501 String [] children = contains.split(";");
502 for (int i=0;i<children.length;i++) {
503 String child_id = children[i];
504 if (child_id.equals(doc_id)) {
505 value = String.valueOf(i+1); // make it from 1 to length
506 break;
507 }
508 }
509 }
510 }
511 } else {
512 return null;
513 }
514 Element info_elem = this.doc.createElement("info");
515 info_elem.setAttribute(GSXML.NAME_ATT, info_type);
516 info_elem.setAttribute(GSXML.VALUE_ATT, value);
517 return info_elem;
518 }
519
520 protected String getHrefOID(String href_url){
521 return this.coll_db.docnum2OID(href_url);
522 }
523
524}
Note: See TracBrowser for help on using the repository browser.