/* * DocumentAction.java * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ package org.greenstone.gsdl3.action; // Greenstone classes import org.greenstone.gsdl3.core.ModuleInterface; import org.greenstone.gsdl3.service.AbstractDocumentRetrieve; import org.greenstone.gsdl3.service.DocXMLUtil; import org.greenstone.gsdl3.util.*; import org.greenstone.util.GlobalProperties; // XML classes import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.Text; import org.w3c.dom.NodeList; // General Java classes import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.io.File; import java.io.Serializable; import org.apache.log4j.*; /** Action class for retrieving Documents via the message router */ public class DocumentAction extends Action { static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName()); // this is used to specify that the sibling nodes of a selected one should be obtained public static final String SIBLING_ARG = "sib"; public static final String GOTO_PAGE_ARG = "gp"; public static final String ENRICH_DOC_ARG = "end"; public static final String EXPAND_DOCUMENT_ARG = "ed"; public static final String EXPAND_CONTENTS_ARG = 
"ec";
    public static final String REALISTIC_BOOK_ARG = "book";
    public static final String NO_TEXT_ARG = "noText";
    public static final String DOC_EDIT_ARG = "docEdit";

    /**
     * if this is set to true, when a document is displayed, any annotation type
     * services (enrich) will be offered to the user as well
     */
    protected boolean provide_annotations = false;

    /** Switched on by the "highlightQueryTerms" config parameter (see configure()). */
    protected boolean highlight_query_terms = false;

    /**
     * Runs the superclass configuration, then reads two optional switches from
     * config_params. A flag is only ever switched on here: any value other than
     * the exact string "true" (including a missing entry) leaves it untouched.
     *
     * @return always true
     */
    public boolean configure() {
        super.configure();

        // constant-first equals() also covers the "parameter not set" (null) case
        if ("true".equals((String) config_params.get("highlightQueryTerms"))) {
            highlight_query_terms = true;
        }
        if ("true".equals((String) config_params.get("displayAnnotationService"))) {
            provide_annotations = true;
        }
        return true;
    }

    public Node process(Node message_node) {
        // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
        Element message = GSXML.nodeToElement(message_node);
        Document doc = XMLConverter.newDOM();

        // the response
        Element result = doc.createElement(GSXML.MESSAGE_ELEM);
        Element page_response = doc.createElement(GSXML.RESPONSE_ELEM);
        result.appendChild(page_response);

        // get the request - assume only one
        Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
        Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
        HashMap params = GSXML.extractParams(cgi_paramList, false);

        // just in case there are some that need to get passed to the services
        // why do we use s0 here and s1 in other places???
HashMap service_params = (HashMap) params.get("s0"); String collection = (String) params.get(GSParams.COLLECTION); String document_id = (String) params.get(GSParams.DOCUMENT); if (document_id != null && document_id.equals("")) { document_id = null; } String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list if (href != null && href.equals("")) { href = null; } String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list if (document_id == null && href == null) { logger.error("no document specified!"); return result; } if (rl != null && rl.equals("0")) { // this is a true external link, we should have been directed to a different page or action logger.error("rl value was 0, shouldn't get here"); return result; } String doc_id_modifier = ""; String sibling_num = (String) params.get(GOTO_PAGE_ARG); if (sibling_num != null && !sibling_num.equals("")) { // we have to modify the doc name doc_id_modifier = "." + sibling_num + ".ss"; } UserContext userContext = new UserContext(request); //append site metadata addSiteMetadata(page_response, userContext); addInterfaceOptions(page_response); // get the additional data needed for the page getBackgroundData(page_response, collection, userContext); // create a basic doc list containing the current node // we will use this to query whether the id is valid, and to get document type Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER); Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM); basic_doc_list.appendChild(current_doc); if (document_id != null) { current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier); } else { current_doc.setAttribute(GSXML.HREF_ID_ATT, href); // do we need this?? current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier); } // lets do a quick check here for valid doc id. 
if (document_id != null) { boolean is_valid = checkValidOID(basic_doc_list, collection, userContext, page_response ); if (!is_valid) { GSXML.addError(page_response, "Invalid doc id ("+document_id+")", GSXML.ERROR_TYPE_INVALID_ID); return result; } } Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM); if (format_elem != null) { // lets look for param defaults set in config file NodeList param_defaults = format_elem.getElementsByTagName(GSXML.PARAM_DEFAULT_ELEM); for (int i=0; i meta_names = new HashSet(); meta_names.add("Title"); // the default if (format_elem != null) { getRequiredMetadataNames(format_elem, meta_names); } Element extraMetaListElem = (Element) GSXML.getChildByTagName(request, GSXML.EXTRA_METADATA + GSXML.LIST_MODIFIER); if (extraMetaListElem != null) { NodeList extraMetaList = extraMetaListElem.getElementsByTagName(GSXML.EXTRA_METADATA); for (int i = 0; i < extraMetaList.getLength(); i++) { meta_names.add(((Element) extraMetaList.item(i)).getAttribute(GSXML.NAME_ATT)); } } Element dm_param_list = createMetadataParamList(doc,meta_names); if (service_params != null) { GSXML.addParametersToList(dm_param_list, service_params); } dm_request.appendChild(dm_param_list); // create the doc node list for the metadata request Element dm_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER); dm_request.appendChild(dm_doc_list); // Add each node from the structure response into the metadata request NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM); for (int i = 0; i < doc_nodes.getLength(); i++) { Element doc_node = (Element) doc_nodes.item(i); String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT); // Add the documentNode to the list Element dm_doc_node = doc.createElement(GSXML.DOC_NODE_ELEM); if (needSectionContent(params)) { if (doc_node_id.equals(document_id)) { dm_doc_list.appendChild(dm_doc_node); } } else { dm_doc_list.appendChild(dm_doc_node); } 
//dm_doc_list.appendChild(dm_doc_node); dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id); dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT)); if (document_id == null){ dm_doc_node.setAttribute(GSXML.HREF_ID_ATT, href ); } } // we also want a metadata request to the top level document to get // assocfilepath - this could be cached too Element doc_meta_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext); dm_message.appendChild(doc_meta_request); Element doc_meta_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); if (service_params != null) { GSXML.addParametersToList(doc_meta_param_list, service_params); } doc_meta_request.appendChild(doc_meta_param_list); Element doc_param = doc.createElement(GSXML.PARAM_ELEM); doc_meta_param_list.appendChild(doc_param); doc_param.setAttribute(GSXML.NAME_ATT, "metadata"); doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath"); // create the doc node list for the metadata request Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER); doc_meta_request.appendChild(doc_list); Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM); // the node we want is the root document node if (document_id != null) { doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt"); } /*else { doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt"); // can we assume that href is always a top level doc?? 
//doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt"); //doc_node.setAttribute("externalURL", has_rl); }*/ doc_list.appendChild(doc_node); Element dm_response_message = (Element) this.mr.process(dm_message); if (processErrorElements(dm_response_message, page_response)) { return result; } String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER); Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path); // Merge the metadata with the structure information NodeList dm_response_docs = dm_response_doc_list.getChildNodes(); for (int i = 0; i < doc_nodes.getLength(); i++) { Node dcNode; String node_idd = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT); if (node_idd.isEmpty()) { String href_id_att = ((Element)doc_nodes.item(i)).getAttribute(GSXML.HREF_ID_ATT); dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.HREF_ID_ATT, href_id_att); } else { dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_idd); } GSXML.mergeMetadataLists(doc_nodes.item(i), dcNode); } // get the top level doc metadata out Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1); Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode"); GSXML.mergeMetadataLists(the_document, top_doc_node); // if we are highlighting query terms, then we also get them highlighted in the metadata HashSet query_term_variants = null; ArrayList>> phrase_query_term_variants_hierarchy = null; boolean do_highlight_query_terms = highlight_query_terms; int query_terms_status = 0; if (highlight_query_terms) { // lets get the query term equivalents query_term_variants = new HashSet(); phrase_query_term_variants_hierarchy = new ArrayList>>(); if ((query_terms_status = getQueryTermVariants(request, query_term_variants, phrase_query_term_variants_hierarchy)) ==0) { do_highlight_query_terms = 
false; // we couldn't get the terms } } // lets try marking up the metadata with search terms // if the search service doesn't send back then we haven't got the term variants. We lower case everything and do case insensitive matching boolean highlight_case_insensitive = false; if (query_terms_status == NO_EQUIV_QUERY_TERMS) { highlight_case_insensitive = true; } if (do_highlight_query_terms) { highlightQueryTermsDOM(doc, the_document, "metadata", query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive); } // do we want doc text content? If not, we are done. if (!get_text) { // don't get text return result; } // Build a request to obtain some document content Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM); to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_CONTENT_RETRIEVE_SERVICE); Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext); dc_message.appendChild(dc_request); // Create a parameter list to specify the request parameters - empty for now Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); if (service_params != null) { GSXML.addParametersToList(dc_param_list, service_params); } dc_request.appendChild(dc_param_list); // get the content // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request. if (expand_document) { dc_request.appendChild(dm_doc_list); } else { dc_request.appendChild(basic_doc_list); } Element dc_response_message = (Element) this.mr.process(dc_message); if (processErrorElements(dc_response_message, page_response)) { return result; } Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path); boolean get_marked_up_doc_from_query = false; if (do_highlight_query_terms && query_terms_status == NO_EQUIV_QUERY_TERMS) { get_marked_up_doc_from_query = true; // we try to. 
solr we can, lucene we can't } if (expand_document) { // Merge the content with the structure information NodeList dc_response_docs = dc_response_doc_list.getChildNodes(); for (int i = 0; i < doc_nodes.getLength(); i++) { String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT); Node docNode = GSXML.getNamedElement(dc_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_id); Node content = GSXML.getChildByTagName(docNode, GSXML.NODE_CONTENT_ELEM); if (content != null) { if (do_highlight_query_terms) { if (get_marked_up_doc_from_query) { Element new_content = retrieveHighlightedContent(request, node_id); if (new_content == null) { // we didn't get any text back from the request. assume we won't be able to get it next time either (eg lucene) get_marked_up_doc_from_query = false; content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive); } else { content= new_content; } } else { content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive); } } doc_nodes.item(i).appendChild(doc.importNode(content, true)); } } if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) { Element dummy_node = (Element) doc_nodes.item(0); the_document.removeChild(dummy_node); the_document.setAttribute(GSXML.NODE_ID_ATT, dummy_node.getAttribute(GSXML.NODE_ID_ATT)); NodeList dummy_children = dummy_node.getChildNodes(); for (int i = dummy_children.getLength() - 1; i >= 0; i--) { // special case as we don't want more than one metadata list if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER)) { GSXML.mergeMetadataFromList(the_document, dummy_children.item(i)); } else { the_document.appendChild(dummy_children.item(i)); } } } } else { Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM); Element 
dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM); if (dc_response_doc_content == null) { // no content to add if (dc_response_doc.getAttribute("external").equals("true")) { String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT); the_document.setAttribute("selectedNode", href_id); the_document.setAttribute("external", href_id); } return result; } if (do_highlight_query_terms) { dc_response_doc.removeChild(dc_response_doc_content); if (get_marked_up_doc_from_query) { Element new_content = retrieveHighlightedContent(request, null); if (new_content == null) { get_marked_up_doc_from_query = false; dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive); } else { dc_response_doc_content = new_content; } } else { dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive); } dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true)); } if (provide_annotations) { String service_selected = (String) params.get(ENRICH_DOC_ARG); if (service_selected != null && service_selected.equals("1")) { // now we can modifiy the response doc if needed String enrich_service = (String) params.get(GSParams.SERVICE); // send a message to the service Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM); Element enrich_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext); enrich_message.appendChild(enrich_request); // check for parameters HashMap e_service_params = (HashMap) params.get("s1"); if (e_service_params != null) { Element enrich_pl = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); GSXML.addParametersToList(enrich_pl, e_service_params); 
enrich_request.appendChild(enrich_pl); } Element e_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER); enrich_request.appendChild(e_doc_list); e_doc_list.appendChild(doc.importNode(dc_response_doc, true)); Node enrich_response = this.mr.process(enrich_message); String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM }; path = GSPath.createPath(links); dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path); } } // if provide_annotations // use the returned id rather than the sent one cos there may have // been modifiers such as .pr that are removed. String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT); the_document.setAttribute("selectedNode", modified_doc_id); if (has_dummy) { // change the id if necessary and add the content Element dummy_node = (Element) doc_nodes.item(0); dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id); dummy_node.appendChild(doc.importNode(dc_response_doc_content, true)); // hack for simple type if (document_type.equals(GSXML.DOC_TYPE_SIMPLE)) { // we dont want the internal docNode, just want the content and metadata in the document // rethink this!! 
the_document.removeChild(dummy_node);
                    NodeList dummy_children = dummy_node.getChildNodes();
                    for (int i = dummy_children.getLength() - 1; i >= 0; i--) {
                        // special case as we don't want more than one metadata list
                        if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER)) {
                            GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
                        } else {
                            the_document.appendChild(dummy_children.item(i));
                        }
                    }
                }

                the_document.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
            } else {
                // Merge the document content with the metadata and structure information
                for (int i = 0; i < doc_nodes.getLength(); i++) {
                    Node dn = doc_nodes.item(i);
                    String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
                    if (dn_id.equals(modified_doc_id)) {
                        dn.appendChild(doc.importNode(dc_response_doc_content, true));
                        break;
                    }
                }
            }
        }
        //logger.debug("(DocumentAction) Page:\n" + GSXML.xmlNodeToString(result));
        return result;
    }

    /**
     * Sends the given document node list to the collection's
     * VALIDATE_DOCUMENT_ID_SERVICE and reports whether the first returned
     * document node carries valid="true".
     *
     * Note that basic_doc_list is appended to the outgoing request, so it is
     * re-parented by this call.
     *
     * @param basic_doc_list document node list holding the node to validate
     * @param collection     collection whose validation service is addressed
     * @param userContext    context used to build the request
     * @param page_response  passed to processErrorElements together with the service response
     * @return true only if the service answered without errors and marked the
     *         node as valid; false on errors, on a missing document node in
     *         the response, or on any other "valid" value
     */
    protected boolean checkValidOID(Element basic_doc_list, String collection, UserContext userContext, Element page_response) {
        Document owner = basic_doc_list.getOwnerDocument();

        // build <message><request to="collection/ValidateService"> docNodeList </request></message>
        Element validate_message = owner.createElement(GSXML.MESSAGE_ELEM);
        String service_path = GSPath.appendLink(collection, AbstractDocumentRetrieve.VALIDATE_DOCUMENT_ID_SERVICE);
        Element validate_request = GSXML.createBasicRequest(owner, GSXML.REQUEST_TYPE_PROCESS, service_path, userContext);
        validate_message.appendChild(validate_request);
        validate_request.appendChild(basic_doc_list);

        Element validate_response = (Element) this.mr.process(validate_message);
        if (processErrorElements(validate_response, page_response)) {
            return false;
        }

        // response/documentNodeList/documentNode
        String doc_node_path = GSPath.createPath(new String[] { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM });
        Element info_elem = (Element) GSXML.getNodeByPath(validate_response, doc_node_path);

        return info_elem != null && info_elem.getAttribute("valid").equals("true");
    }

    protected Element
getFormattedArchiveDoc(Document doc, String collection, String document_id, String document_type, Element result, Element page_response, UserContext userContext ) { // call get archive doc Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM); String to = DocXMLUtil.DOC_XML_GET_SECTION_SERVICE; Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext); dx_message.appendChild(dx_request); Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM); dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id); dx_section.setAttribute(GSXML.COLLECTION_ATT, collection); dx_request.appendChild(dx_section); Element dx_response_message = (Element) this.mr.process(dx_message); if (processErrorElements(dx_response_message, page_response)) { return result; } // get the section out String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM); Element section = (Element) GSXML.getNodeByPath(dx_response_message, path); if (section == null) { logger.error("no archive doc returned for "+document_id); return result; } // convert the archive format into the internal format that the page response requires // work out doctype // NOTE: this will be coming from collection database in index // the archive file doesn't store this. So we have to assume // that the doc type will not be changing with any // modifications happening to archives. // if doc type is null, then we need to work it out. 
// create a basic doc list containing the current node if (document_type == null) { Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER); Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM); basic_doc_list.appendChild(current_doc); current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id); basic_doc_list.appendChild(current_doc); document_type = getDocumentType(basic_doc_list, collection, userContext, page_response); } if (document_type == null) { logger.debug("@@@ doctype is null, setting to simple"); document_type = GSXML.DOC_TYPE_SIMPLE; } Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM); doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type); page_response.appendChild(doc_elem); Element transformed_section = transformArchiveToDocument(section); if (document_type == GSXML.DOC_TYPE_SIMPLE) { // simple doc, only returning a single document node, which is the top level section. doc_elem.setAttribute(GSXML.NODE_ID_ATT, document_id); GSXML.mergeElements(doc_elem, transformed_section); return result; } // multi sectioned document. transformed_section.setAttribute(GSXML.NODE_ID_ATT, document_id); // In docEdit mode, we obtain the text from archives, from doc.xml // Now the transformation has replaced
// with
        // NOTE(review): the element names in the sentence above/below appear to have been lost from this copy of the file.
        // Need to add nodeID, nodeType and docType attributes to each docNode
        // as doc.xml doesn't store that.
        insertDocNodeAttributes(transformed_section, document_type, null);
        doc_elem.appendChild(doc.importNode(transformed_section, true));
        logger.debug("dx result = "+XMLConverter.getPrettyString(result));
        return result;
    }

    // NOTE(review): the remainder of needSectionContent (from the iltPrefix string literal onwards)
    // and the start of the comment for insertDocNodeAttributes are missing from this copy of the
    // file; restore them from version control before building. Left exactly as found.
    private boolean needSectionContent(HashMap params) {
        String document_id = (String) params.get(GSParams.DOCUMENT);
        String ilt = (String) params.get(GSParams.INLINE_TEMPLATE);
        String iltPrefix = "., where the first parent-id is rootNode id.
    // The nodeType is root if rootNode, internal if there are children and leaf if no children
    //
    // Recursively stamps nodeID, docType and nodeType attributes onto docNode and
    // all of its descendant docNode children.
    //   docNode       - the node to annotate
    //   document_type - value written to the docType attribute of every node visited
    //   id            - nodeID to assign to docNode; pass null for the root, whose
    //                   existing nodeID attribute is then used as the base id
    // Children are numbered from 1 in document order, giving ids of the form id.1, id.2, ...
    protected void insertDocNodeAttributes(Element docNode, String document_type, String id) {

        boolean isRoot = false;
        if(id == null) { // rootNode, get the root nodeID to work with recursively
            id = docNode.getAttribute(GSXML.NODE_ID_ATT);
            isRoot = true;
        } else { // for all but the root node, need to still set the nodeID
            docNode.setAttribute(GSXML.NODE_ID_ATT, id);
        }

        docNode.setAttribute(GSXML.DOC_TYPE_ATT, document_type);

        NodeList docNodes = GSXML.getChildrenByTagName(docNode, GSXML.DOC_NODE_ELEM);
        if(docNodes.getLength() > 0) {
            docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_INTERNAL);
            for(int i = 0; i < docNodes.getLength(); i++) {
                Element childDocNode = (Element)docNodes.item(i);

                // work out the child docNode's nodeID based on current id
                String nodeID = id + "." + (i+1);
                insertDocNodeAttributes(childDocNode, document_type, nodeID); //recursion step
            }
        } else {
            docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_LEAF);
        }

        // rootNode's nodeType is a special case: it's "root", not "leaf" or "internal"
        // (set last so that it overrides whichever of the two was written above)
        if(isRoot) docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_ROOT);

    }

    /**
     * run the XSLT transform which converts from doc.xml format to our internal
     * document format
     *
     * The stylesheet archive2document.xsl is located through GSFile.stylesheetFile
     * using the site name, interface name and base interfaces held in
     * config_params. If the stylesheet cannot be found or loaded, an error is
     * logged and the input section is returned untransformed.
     *
     * @param section the archive section element to convert
     * @return the document element of the transform result when the result is a
     *         Document node, otherwise the result cast to Element; or the
     *         original section if no stylesheet was available
     */
    protected Element transformArchiveToDocument(Element section) {

        String stylesheet_filename = GSFile.stylesheetFile(GlobalProperties.getGSDL3Home(), (String) this.config_params.get(GSConstants.SITE_NAME), "", (String) this.config_params.get(GSConstants.INTERFACE_NAME), (ArrayList) this.config_params.get(GSConstants.BASE_INTERFACES), "archive2document.xsl");
        if (stylesheet_filename == null) {
            logger.error("Couldn't find stylesheet archive2document.xsl");
            return section;
        }

        Document stylesheet_doc = XMLConverter.getDOM(new File(stylesheet_filename));
        if (stylesheet_doc == null) {
            logger.error("Couldn't load in stylesheet "+stylesheet_filename);
            return section;
        }

        // the transformer is handed a document, so wrap a copy of the section in a fresh DOM
        Document section_doc = XMLConverter.newDOM();
        section_doc.appendChild(section_doc.importNode(section, true));
        // NOTE(review): result is dereferenced below without a null check - confirm transform() never returns null
        Node result = this.transformer.transform(stylesheet_doc, section_doc);
        logger.debug("transform result = "+XMLConverter.getPrettyString(result));

        Element new_element;
        if (result.getNodeType() == Node.DOCUMENT_NODE) {
            new_element = ((Document) result).getDocumentElement();
        } else {
            new_element = (Element) result;
        }

        return new_element;
    }

    // Status codes returned by getQueryTermVariants():
    // no usable term information could be obtained
    protected final int NO_QUERY_TERMS = 0;
    // terms were returned but without equivalent-term lists (callers then match case-insensitively)
    protected final int NO_EQUIV_QUERY_TERMS = 1;
    // terms were returned together with their equivalent-term lists
    protected final int EQUIV_QUERY_TERMS = 2;

    /**
     * this involves a bit of a hack to get the equivalent query terms - has to
     * requery the query service - uses the last selected service name. (if it
     * ends in query).
*/
    // Re-runs the previous query (parameter group "p" names the service, group
    // "s1" holds its parameters) and collects the terms to highlight.
    //   request                               - the page request whose parameter list is re-read
    //   query_term_variants                   - filled with every term variant found
    //   phrase_query_term_variants_hierarchy  - filled (only when the performed query
    //                                           contains a double quote) with one list per
    //                                           phrase / stand-alone term, each holding the
    //                                           variant set of every term in it
    // Returns NO_QUERY_TERMS when there was no previous *Query service or no term
    // list in the response, NO_EQUIV_QUERY_TERMS when terms came back without
    // equivTermList elements (variants are then stored lower-cased), and
    // EQUIV_QUERY_TERMS otherwise.
    protected int getQueryTermVariants(Element request, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy) {
        Document doc = XMLConverter.newDOM();

        // do the query again to get term info
        Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
        HashMap params = GSXML.extractParams(cgi_param_list, false);

        HashMap previous_params = (HashMap) params.get("p");
        if (previous_params == null) {
            return NO_QUERY_TERMS;
        }
        String service_name = (String) previous_params.get(GSParams.SERVICE);
        if (service_name == null || !service_name.endsWith("Query")) {
            // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
            logger.debug("invalid service "+service_name+", not doing highlighting");
            return NO_QUERY_TERMS;
        }
        String collection = (String) params.get(GSParams.COLLECTION);
        UserContext userContext = new UserContext(request);
        String to = GSPath.appendLink(collection, service_name);

        Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
        Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
        mr_query_message.appendChild(mr_query_request);

        // paramList
        HashMap service_params = (HashMap) params.get("s1");
        Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
        // the "s1" group may be absent; guard it the same way process() guards its service params
        if (service_params != null) {
            GSXML.addParametersToList(query_param_list, service_params);
        }
        mr_query_request.appendChild(query_param_list);

        // do the query
        Element mr_query_response = (Element) this.mr.process(mr_query_message);

        // find the term lists
        String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
        Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
        if (query_term_list_element == null) {
            // no term info
            return NO_QUERY_TERMS;
        }

        int result_code = NO_EQUIV_QUERY_TERMS;
        NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
        if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0) {
            // if we have no equivalent terms, just add the current terms lower cased and we do case insensitive matching later on
            NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
            if (terms_nodelist != null && terms_nodelist.getLength() > 0) {
                for (int i = 0; i < terms_nodelist.getLength(); i++) {
                    String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
                    query_term_variants.add(termValue.toLowerCase());
                }
            }
        } else {
            result_code = EQUIV_QUERY_TERMS;

            for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++) {
                Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
                String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
                for (int j = 0; j < equivalent_terms.length; j++) {
                    query_term_variants.add(equivalent_terms[j]);
                }
            }
        }

        String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
        Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
        // NOTE(review): metadata_list / query_element are not null-checked - confirm the GSXML helpers tolerate null
        Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
        // the trailing space guarantees the scanner below sees the end of the last term
        String performed_query = GSXML.getNodeText(query_element) + " ";
        logger.debug("performed query="+performed_query);

        boolean has_phrases = false; // if there are no phrases, we don't bother making the phrase variants structure
        if (performed_query.contains("\"")) {
            has_phrases = true;
        }

        ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
        int term_start = 0;
        boolean in_term = false;
        boolean in_phrase = false;
        for (int i = 0; i < performed_query.length(); i++) {
            char character = performed_query.charAt(i);
            boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);

            // Has a query term just started?
            if (in_term == false && is_character_letter_or_digit == true) {
                in_term = true;
                term_start = i;
            }
            // Or has a term just finished?
            else if (in_term == true && is_character_letter_or_digit == false) {
                in_term = false;
                String term = performed_query.substring(term_start, i);

                if (has_phrases) {
                    // do the phrase bit
                    HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
                    if (result_code == EQUIV_QUERY_TERMS) {
                        Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
                        if (term_element != null) { // might be null for eg TX in [snails]:TX
                            NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
                            // both conditions must hold; with "||" a null list would be dereferenced
                            if (term_equivalent_terms_nodelist != null && term_equivalent_terms_nodelist.getLength() != 0) {
                                for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++) {
                                    Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
                                    String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
                                    for (int k = 0; k < term_equivalent_terms.length; k++) {
                                        phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
                                    }
                                }
                            }
                        }
                    } else { // result_code != EQUIV_QUERY_TERMS
                        // we don;t have equivalent term list, so just add the lower cased version in, and we do case-insensitive matching later on
                        if (query_term_variants.contains(term.toLowerCase()) || containsSubString(query_term_variants, term)) {
                            // this handles the case where the user has searched for snails, but term list returns 'snail'
                            phrase_query_p_term_x_variants.add(term.toLowerCase());
                        }
                    }
                    if (phrase_query_p_term_x_variants.size()>0) {
                        // we have found a valid term
                        phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
                        if (in_phrase == false) {
                            phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
                            phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
                        }
                    }
                } // end if has_phrases
                else {
                    // no phrases so we don't have to do the phrasey stuff. but
                    // we need to check the term against the query term list - if its not in there, check whether its the root of a term.
                    // we want to handle the case where user has queried "snails", the term list returned only has snail, and therefore snails doesn't get highlighted.
                    // but dont want to include eg TX
                    if (result_code == NO_EQUIV_QUERY_TERMS) {
                        if (containsSubString(query_term_variants, term)) {
                            query_term_variants.add(term.toLowerCase());
                        }
                    }
                }
            } // end of in_term...

            // Watch for phrases (surrounded by quotes)
            if (character == '\"') {
                // Has a phrase just started?
                if (in_phrase == false) {
                    in_phrase = true;
                }
                // Or has a phrase just finished?
                else if (in_phrase == true) {
                    in_phrase = false;
                    phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
                }

                phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
            } // if char == "
        } // for each char in performed query

        return result_code;
    }

    /**
     * Decides whether a query term that is not itself in the variant set should
     * still be highlighted because one of the known variants is a prefix of it
     * (e.g. the user searched for "snails" but the term list only has "snail").
     *
     * The variant set is expected to hold lower-cased terms (this is how
     * getQueryTermVariants fills it when no equivalent-term lists are
     * available), so the comparison is done on the lower-cased term.
     *
     * @param query_term_variants the known (lower-cased) term variants
     * @param term                the term as it appeared in the performed query
     * @return true if some non-empty variant is a proper prefix match for the
     *         term; false if the term is already in the set, looks like a
     *         field name, or matches no variant
     */
    protected boolean containsSubString(HashSet<String> query_term_variants, String term) {
        String lc_term = term.toLowerCase();
        if (query_term_variants.contains(lc_term)) {
            return false; // or true??
        }
        // hack to filter out TX, TI field names
        if (term.matches("[A-Z][A-Z][A-Z]?")) {
            return false;
        }
        for (String variant : query_term_variants) {
            // an empty variant would be a prefix of every term, so ignore it
            if (!variant.isEmpty() && lc_term.startsWith(variant)) {
                return true;
            }
        }
        return false;
    }

    /** retrieve the marked up highlighted section - only works for solr collection */
    protected Element retrieveHighlightedContent(Element request, String node_id) {
        Document doc = XMLConverter.newDOM();

        // do the query again to get term info
        Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
        HashMap params = GSXML.extractParams(cgi_param_list, false);

        HashMap previous_params = (HashMap) params.get("p");
        if (previous_params == null) {
            return null;
        }
        String service_name = (String) previous_params.get(GSParams.SERVICE);
        if (service_name == null || !service_name.endsWith("Query")) {
            // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
            logger.debug("HL invalid service, not doing highlighting");
            return null;
        }
        String collection = (String) params.get(GSParams.COLLECTION);
        UserContext userContext = new UserContext(request);
        String to = GSPath.appendLink(collection, service_name);

        Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
        Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
        mr_query_message.appendChild(mr_query_request);

        // paramList
        HashMap service_params = (HashMap) params.get("s1");
        // hack in case the user searched on eg titles, but we want highlighting in the text
        service_params.put("index", "TX");
        Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
        GSXML.addParametersToList(query_param_list, service_params);
        if (node_id != null) {
            GSXML.addParameterToList(query_param_list, "hldocOID", node_id);
        } else {
            GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
        }
mr_query_request.appendChild(query_param_list); // do the query Element mr_query_response = (Element) this.mr.process(mr_query_message); String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM); Element highlighted_node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode); if (highlighted_node == null) { return null; } // For SOLR, the highlighted node will be a nodeContent element, which is the hldocOID section content, with search terms marked up. //We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements // Build a request to process highlighted text Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM); to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_CONTENT_RETRIEVE_SERVICE); Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext); hl_message.appendChild(dc_request); // Create a parameter list to specify the request parameters - empty for now Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); dc_request.appendChild(dc_param_list); // get the content Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER); dc_request.appendChild(doc_list); Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM); doc_list.appendChild(current_doc); current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT)); //Append highlighted content to request for processing dc_request.appendChild(doc.importNode(highlighted_node, true)); Element hl_response_message = (Element) this.mr.process(hl_message); //Get results NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM); Element content = (Element) contentList.item(0); return content; } /** * Highlights query terms in specified elements (whose name is in element_names) text inside top_level_elem */ protected boolean 
highlightQueryTermsDOM(Document doc, Element top_level_elem, String element_name, HashSet query_term_variants, ArrayList>> phrase_query_term_variants_hierarchy, boolean case_insensitive) { NodeList named_elems = top_level_elem.getElementsByTagName(element_name); for (int j=named_elems.getLength()-1; j>=0; j--) { Element this_elem = (Element)named_elems.item(j); Element replacement_elem = highlightQueryTermsElementText(doc, this_elem, query_term_variants, phrase_query_term_variants_hierarchy, case_insensitive); this_elem.getParentNode().replaceChild(replacement_elem, this_elem); } return true; } /** * Highlights query terms in the text content of an element. */ private Element highlightQueryTermsElementText(Document doc, Element original_element, HashSet query_term_variants, ArrayList>> phrase_query_term_variants_hierarchy, boolean case_insensitive) { String content = GSXML.getNodeText(original_element); // Convert the content string to an array of characters for speed char[] content_characters = new char[content.length()]; content.getChars(0, content.length(), content_characters, 0); // Now skim through the content, identifying word matches ArrayList word_matches = new ArrayList(); int word_start = 0; boolean in_word = false; boolean preceding_word_matched = false; boolean inTag = false; for (int i = 0; i < content_characters.length; i++) { //We don't want to find words inside HTML tags if (content_characters[i] == '<') { // are we currently in a word? if (in_word) { in_word = false; String word = new String(content_characters, word_start, (i - word_start)); if (case_insensitive) { word = word.toLowerCase(); } if (query_term_variants.contains(word)) { // We have found a matching word, so remember its location word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched)); // should preceding word matched be set to true/false here?? 
preceding_word_matched = true; } else { preceding_word_matched = false; } } inTag = true; continue; } else if (inTag && content_characters[i] == '>') { inTag = false; continue; } else if (inTag) { continue; } boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]); // Has a word just started? if (in_word == false && is_character_letter_or_digit == true) { in_word = true; word_start = i; } // Or has a word just finished? else if (in_word == true && is_character_letter_or_digit == false) { in_word = false; // Check if the word matches any of the query term equivalents String word = new String(content_characters, word_start, (i - word_start)); if (case_insensitive) { word = word.toLowerCase(); } if (query_term_variants.contains(word)) { // We have found a matching word, so remember its location word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched)); preceding_word_matched = true; } else { preceding_word_matched = false; } } } // Don't forget the last word... 
if (in_word == true) { // Check if the word matches any of the query term equivalents String word = new String(content_characters, word_start, (content_characters.length - word_start)); if (case_insensitive) { word = word.toLowerCase(); } if (query_term_variants.contains(word)) { // We have found a matching word, so remember its location word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched)); } } if (word_matches.size() == 0) { // just return a copy of the original element return (Element)doc.importNode(original_element, true); } ArrayList highlight_start_positions = new ArrayList(); ArrayList highlight_end_positions = new ArrayList(); if (phrase_query_term_variants_hierarchy.size() ==0) { for (int i = 0; i < word_matches.size(); i++) { highlight_start_positions.add(Integer.valueOf(word_matches.get(i).start_position)); highlight_end_positions.add(Integer.valueOf(word_matches.get(i).end_position)); } } else { // Deal with phrases now ArrayList partial_phrase_matches = new ArrayList(); for (int i = 0; i < word_matches.size(); i++) { WordMatch word_match = word_matches.get(i); // See if any partial phrase matches are extended by this word if (word_match.preceding_word_matched) { for (int j = partial_phrase_matches.size() - 1; j >= 0; j--) { PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j); ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number); HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched); if (phrase_query_p_term_x_variants.contains(word_match.word)) { partial_phrase_match.num_words_matched++; // Has a complete phrase match occurred? 
if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size()) { // Check for overlaps by looking at the previous highlight range if (!highlight_end_positions.isEmpty()) { int last_highlight_index = highlight_end_positions.size() - 1; int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue(); if (last_highlight_end > partial_phrase_match.start_position) { // There is an overlap, so remove the previous phrase match int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue(); highlight_end_positions.remove(last_highlight_index); partial_phrase_match.start_position = last_highlight_start; } } highlight_start_positions.add(Integer.valueOf(partial_phrase_match.start_position)); highlight_end_positions.add(Integer.valueOf(word_match.end_position)); } // No, but add the partial match back into the list for next time else { partial_phrase_matches.add(partial_phrase_match); } } } } else { partial_phrase_matches.clear(); } // See if this word is at the start of any of the phrases for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++) { ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p); if (phrase_query_p_term_variants_list.size()>0) { HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0); if (phrase_query_p_term_1_variants.contains(word_match.word)) { // If this phrase is just one word long, we have a complete match if (phrase_query_p_term_variants_list.size() == 1) { highlight_start_positions.add(Integer.valueOf(word_match.start_position)); highlight_end_positions.add(Integer.valueOf(word_match.end_position)); } // Otherwise we have the start of a potential phrase match else { partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p)); } } } } } } // Now add the annotation tags into the document at the correct points Element content_element = 
(Element)doc.importNode(original_element, false); // just copy the element plus any attributes, but not any children. int last_wrote = 0; for (int i = 0; i < highlight_start_positions.size(); i++) { int highlight_start = highlight_start_positions.get(i).intValue(); int highlight_end = highlight_end_positions.get(i).intValue(); // Print anything before the highlight range if (last_wrote < highlight_start) { String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote)); content_element.appendChild(doc.createTextNode(preceding_text)); } // Print the highlight text, annotated if (highlight_end > last_wrote) { String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start)); Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text); annotation_element.setAttribute("type", "query_term"); content_element.appendChild(annotation_element); last_wrote = highlight_end; } } // Finish off any unwritten text if (last_wrote < content_characters.length) { String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote)); content_element.appendChild(doc.createTextNode(remaining_text)); } return content_element; } static private class WordMatch { public String word; public int start_position; public int end_position; public boolean preceding_word_matched; public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched) { this.word = word; this.start_position = start_position; this.end_position = end_position; this.preceding_word_matched = preceding_word_matched; } } static private class PartialPhraseMatch { public int start_position; public int query_phrase_number; public int num_words_matched; public PartialPhraseMatch(int start_position, int query_phrase_number) { this.start_position = start_position; this.query_phrase_number = query_phrase_number; this.num_words_matched = 1; } } }