Ignore:
Timestamp:
2011-06-07T17:07:48+12:00 (13 years ago)
Author:
sjm84
Message:

Fixed search term highlighting in Lucene

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java

    r23628 r24116  
    11/*
    2  *    DocumentAction.java
    3  *    Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
    4  *
    5  *    This program is free software; you can redistribute it and/or modify
    6  *    it under the terms of the GNU General Public License as published by
    7  *    the Free Software Foundation; either version 2 of the License, or
    8  *    (at your option) any later version.
    9  *
    10  *    This program is distributed in the hope that it will be useful,
    11  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
    12  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13  *    GNU General Public License for more details.
    14  *
    15  *    You should have received a copy of the GNU General Public License
    16  *    along with this program; if not, write to the Free Software
    17  *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
    18  */
     2*    DocumentAction.java
     3*    Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
     4*
     5*    This program is free software; you can redistribute it and/or modify
     6*    it under the terms of the GNU General Public License as published by
     7*    the Free Software Foundation; either version 2 of the License, or
     8*    (at your option) any later version.
     9*
     10*    This program is distributed in the hope that it will be useful,
     11*    but WITHOUT ANY WARRANTY; without even the implied warranty of
     12*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     13*    GNU General Public License for more details.
     14*
     15*    You should have received a copy of the GNU General Public License
     16*    along with this program; if not, write to the Free Software
     17*    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
     18*/
    1919package org.greenstone.gsdl3.action;
    2020
     
    3939
    4040/** Action class for retrieving Documents  via the message router
    41  */
     41*/
    4242public class DocumentAction extends Action {
    4343
    44    static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
    45 
    46     // this is used to specify that the sibling nodes of a selected one should be obtained
    47     public static final String SIBLING_ARG = "sib";
    48     public static final String GOTO_PAGE_ARG = "gp";
    49     public static final String ENRICH_DOC_ARG = "end";
    50    
    51     /** if this is set to true, when a document is displayed, any annotation
    52      * type services (enrich) will be offered to the user as well */
    53     protected boolean provide_annotations = false;
    54    
    55     protected boolean highlight_query_terms = false;
    56 
    57     public boolean configure() {
    58     super.configure();
    59     String highlight = (String)config_params.get("highlightQueryTerms");
    60     if (highlight != null && highlight.equals("true")) {
    61         highlight_query_terms = true;
     44    static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
     45
     46    // this is used to specify that the sibling nodes of a selected one should be obtained
     47    public static final String SIBLING_ARG = "sib";
     48    public static final String GOTO_PAGE_ARG = "gp";
     49    public static final String ENRICH_DOC_ARG = "end";
     50   
     51    /** if this is set to true, when a document is displayed, any annotation
     52    * type services (enrich) will be offered to the user as well */
     53    protected boolean provide_annotations = false;
     54   
     55    protected boolean highlight_query_terms = false;
     56
     57    public boolean configure() {
     58        super.configure();
     59        String highlight = (String)config_params.get("highlightQueryTerms");
     60        if (highlight != null && highlight.equals("true")) {
     61            highlight_query_terms = true;
     62        }
     63        String annotate = (String)config_params.get("displayAnnotationService");
     64        if (annotate != null && annotate.equals("true")) {
     65            provide_annotations = true;
     66        }
     67        return true;
    6268    }
    63     String annotate = (String)config_params.get("displayAnnotationService");
    64     if (annotate != null && annotate.equals("true")) {
    65         provide_annotations = true;
    66     }
    67     return true;
    68     }
    69     public Node process (Node message_node)
    70     {
    71     // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
    72    
    73     Element message = this.converter.nodeToElement(message_node);
    74 
    75     // the response
    76     Element result = this.doc.createElement(GSXML.MESSAGE_ELEM);
    77     Element page_response = this.doc.createElement(GSXML.RESPONSE_ELEM);
    78     result.appendChild(page_response);
    79 
    80     // get the request - assume only one
    81     Element request = (Element)GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
    82     Element cgi_paramList = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
    83     HashMap params = GSXML.extractParams(cgi_paramList, false);
    84 
    85     // just in case there are some that need to get passed to the services
    86     HashMap service_params = (HashMap)params.get("s0");
    87 
    88    
    89     String has_rl = null;
    90     String has_href = null;
    91     has_href = (String) params.get("href");//for an external link : get the href URL if it is existing in the params list
    92     has_rl = (String) params.get("rl");//for an external link : get the rl value if it is existing in the params list
    93     String collection = (String) params.get(GSParams.COLLECTION);
    94     String lang = request.getAttribute(GSXML.LANG_ATT);
    95     String uid = request.getAttribute(GSXML.USER_ID_ATT);
    96     String document_name = (String) params.get(GSParams.DOCUMENT);
    97     if ((document_name == null || document_name.equals("")) && (has_href == null || has_href.equals(""))) {
    98         logger.error("no document specified!");
    99         return result;
    100     }
    101     String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
    102     if (document_type == null) {
    103         document_type = "simple";
    104     }
    105     //whether to retrieve siblings or not
    106     boolean get_siblings = false;
    107     String sibs = (String) params.get(SIBLING_ARG);
    108     if (sibs != null && sibs.equals("1")) {
    109         get_siblings = true;
     69    public Node process (Node message_node)
     70    {
     71        // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
     72       
     73        Element message = this.converter.nodeToElement(message_node);
     74
     75        // the response
     76        Element result = this.doc.createElement(GSXML.MESSAGE_ELEM);
     77        Element page_response = this.doc.createElement(GSXML.RESPONSE_ELEM);
     78        result.appendChild(page_response);
     79
     80        // get the request - assume only one
     81        Element request = (Element)GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
     82        Element cgi_paramList = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
     83        HashMap params = GSXML.extractParams(cgi_paramList, false);
     84
     85        // just in case there are some that need to get passed to the services
     86        HashMap service_params = (HashMap)params.get("s0");
     87
     88       
     89        String has_rl = null;
     90        String has_href = null;
     91        has_href = (String) params.get("href");//for an external link : get the href URL if it is existing in the params list
     92        has_rl = (String) params.get("rl");//for an external link : get the rl value if it is existing in the params list
     93        String collection = (String) params.get(GSParams.COLLECTION);
     94        String lang = request.getAttribute(GSXML.LANG_ATT);
     95        String uid = request.getAttribute(GSXML.USER_ID_ATT);
     96        String document_name = (String) params.get(GSParams.DOCUMENT);
     97        if ((document_name == null || document_name.equals("")) && (has_href == null || has_href.equals(""))) {
     98            logger.error("no document specified!");
     99            return result;
     100        }
     101        String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
     102        if (document_type == null) {
     103            document_type = "simple";
     104        }
     105        //whether to retrieve siblings or not
     106        boolean get_siblings = false;
     107        String sibs = (String) params.get(SIBLING_ARG);
     108        if (sibs != null && sibs.equals("1")) {
     109            get_siblings = true;
     110        }
     111       
     112        String sibling_num = (String) params.get(GOTO_PAGE_ARG);
     113        if (sibling_num != null && !sibling_num.equals("")) {
     114            // we have to modify the doc name
     115            document_name = document_name+"."+sibling_num+".ss";
     116        }
     117       
     118        boolean expand_document = false;
     119        String ed_arg = (String) params.get(GSParams.EXPAND_DOCUMENT);
     120        if (ed_arg != null && ed_arg.equals("1")) {
     121            expand_document = true;
     122        }
     123       
     124
     125        boolean expand_contents = false;
     126        if (expand_document) { // we always expand the contents with the text
     127            expand_contents = true;
     128        } else {
     129            String ec_arg = (String) params.get(GSParams.EXPAND_CONTENTS);
     130            if (ec_arg != null && ec_arg.equals("1")) {
     131                expand_contents = true;
     132            }
     133        }
     134
     135        //append site metadata
     136        addSiteMetadata( page_response, lang, uid);
     137
     138        // get the additional data needed for the page
     139        getBackgroundData(page_response, collection, lang, uid);
     140        Element format_elem = (Element)GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
     141       
     142        // the_document is where all the doc info - structure and metadata etc
     143        // is added into, to be returned in the page
     144        Element the_document = this.doc.createElement(GSXML.DOCUMENT_ELEM);
     145        page_response.appendChild(the_document);
     146
     147        // set the doctype from the cgi arg as an attribute
     148        the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
     149
     150        // create a basic doc list containing the current node
     151        Element basic_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
     152        Element current_doc = this.doc.createElement(GSXML.DOC_NODE_ELEM);
     153        basic_doc_list.appendChild(current_doc);
     154        if (document_name.length()!=0){
     155            current_doc.setAttribute(GSXML.NODE_ID_ATT, document_name);
     156        }else if (has_href.length()!=0){
     157            current_doc.setAttribute(GSXML.NODE_ID_ATT, has_href);
     158            current_doc.setAttribute("externalURL", has_rl);
     159        }
     160
     161        // Create a parameter list to specify the required structure information
     162        Element ds_param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
     163       
     164        if (service_params != null) {
     165            GSXML.addParametersToList(this.doc, ds_param_list, service_params);
     166        }
     167
     168        Element ds_param = null;
     169        boolean get_structure = false;
     170        boolean get_structure_info = false;
     171        if (document_type.equals("paged")) {
     172            get_structure_info = true;
     173            // get teh info needed for paged naviagtion
     174            ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
     175            ds_param_list.appendChild(ds_param);
     176            ds_param.setAttribute(GSXML.NAME_ATT, "info");
     177            ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
     178            ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
     179            ds_param_list.appendChild(ds_param);
     180            ds_param.setAttribute(GSXML.NAME_ATT, "info");
     181            ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
     182            ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
     183            ds_param_list.appendChild(ds_param);
     184            ds_param.setAttribute(GSXML.NAME_ATT, "info");
     185            ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
     186           
     187        } else if (document_type.equals("hierarchy")){
     188            get_structure = true;
     189            if (expand_contents) {
     190                ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
     191                ds_param_list.appendChild(ds_param);
     192                ds_param.setAttribute(GSXML.NAME_ATT, "structure");
     193                ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
     194            } else {
     195                // get the info needed for table of contents
     196                ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
     197                ds_param_list.appendChild(ds_param);
     198                ds_param.setAttribute(GSXML.NAME_ATT, "structure");
     199                ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
     200                ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
     201                ds_param_list.appendChild(ds_param);
     202                ds_param.setAttribute(GSXML.NAME_ATT, "structure");
     203                ds_param.setAttribute(GSXML.VALUE_ATT, "children");
     204                if (get_siblings) {
     205                    ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
     206                    ds_param_list.appendChild(ds_param);
     207                    ds_param.setAttribute(GSXML.NAME_ATT, "structure");
     208                    ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
     209                }
     210            }
     211        } else {
     212            // we dont need any structure
     213        }
     214
     215        boolean has_dummy = false;
     216        if (get_structure || get_structure_info) {
     217
     218            // Build a request to obtain the document structure
     219            Element ds_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
     220            String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
     221            Element ds_request = GSXML.createBasicRequest(this.doc,GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
     222            ds_message.appendChild(ds_request);
     223            ds_request.appendChild(ds_param_list);
     224           
     225            // create a doc_node_list and put in the doc_node that we are interested in
     226            ds_request.appendChild(basic_doc_list);
     227           
     228            // Process the document structure retrieve message
     229            Element ds_response_message = (Element) this.mr.process(ds_message);
     230            if (processErrorElements(ds_response_message, page_response)) {
     231                return result;
     232            }
     233
     234            // get the info and print out
     235            String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
     236            path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
     237            path = GSPath.appendLink(path, "nodeStructureInfo");
     238            Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
     239            // get the doc_node bit
     240            if (ds_response_struct_info != null) {
     241                the_document.appendChild(this.doc.importNode(ds_response_struct_info, true));
     242            }
     243            path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
     244            path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
     245            path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
     246            Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
     247           
     248            if (ds_response_structure != null) {
     249                // add the contents of the structure bit into the_document
     250                NodeList structs = ds_response_structure.getChildNodes();
     251                for (int i=0; i<structs.getLength();i++) {
     252                    the_document.appendChild(this.doc.importNode(structs.item(i), true));
     253                }
     254            } else {
     255                // no structure nodes, so put in a dummy doc node
     256                Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
     257                if (document_name.length()!=0){
     258                    doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name);
     259                }else if (has_href.length()!=0){
     260                    doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href);
     261                    doc_node.setAttribute("externalURL", has_rl);
     262                }
     263                the_document.appendChild(doc_node);
     264                has_dummy = true;
     265            }
     266        } else { // a simple type - we dont have a dummy node for simple
     267            // should think about this more
     268            // no structure request, so just put in a dummy doc node
     269            Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
     270            if (document_name.length()!=0){
     271                doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name);
     272            }else if (has_href.length()!=0){
     273                doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href);
     274                doc_node.setAttribute("externalURL", has_rl);
     275            }
     276            the_document.appendChild(doc_node);
     277            has_dummy = true;
     278        }
     279       
     280        // Build a request to obtain some document metadata
     281        Element dm_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
     282        String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve");  // Hard-wired?
     283        Element dm_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
     284        dm_message.appendChild(dm_request);
     285        // Create a parameter list to specify the required metadata information
     286       
     287        HashSet meta_names = new HashSet();
     288        meta_names.add("Title"); // the default
     289        if (format_elem != null) {
     290            extractMetadataNames(format_elem, meta_names);
     291        }
     292       
     293        Element dm_param_list = createMetadataParamList(meta_names);
     294        if (service_params != null) {
     295            GSXML.addParametersToList(this.doc, dm_param_list, service_params);
     296        }
     297       
     298        dm_request.appendChild(dm_param_list);
     299       
     300       
     301        // create the doc node list for the metadata request
     302        Element dm_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
     303        dm_request.appendChild(dm_doc_list);
     304
     305        // Add each node from the structure response into the metadata request
     306        NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
     307        for (int i = 0; i < doc_nodes.getLength(); i++) {
     308            Element doc_node = (Element) doc_nodes.item(i);
     309            String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
     310
     311            // Add the documentNode to the list
     312            Element dm_doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
     313            dm_doc_list.appendChild(dm_doc_node);
     314            dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
     315            dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT,
     316            doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
     317        }
     318
     319        // we also want a metadata request to the top level document to get
     320        // assocfilepath - this could be cached too
     321        Element doc_meta_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
     322        dm_message.appendChild(doc_meta_request);
     323        Element doc_meta_param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
     324        if (service_params != null) {
     325            GSXML.addParametersToList(this.doc, doc_meta_param_list, service_params);
     326        }
     327
     328        doc_meta_request.appendChild(doc_meta_param_list);
     329        Element doc_param = this.doc.createElement(GSXML.PARAM_ELEM);
     330        doc_meta_param_list.appendChild(doc_param);
     331        doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
     332        doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
     333
     334        // create the doc node list for the metadata request
     335        Element doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
     336        doc_meta_request.appendChild(doc_list);
     337
     338        Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
     339        // the node we want is the root document node
     340        if (document_name.length()!=0){
     341            doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name+".rt");
     342        }else if (has_href.length()!=0){
     343            doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href+".rt");
     344            doc_node.setAttribute("externalURL", has_rl);
     345        }
     346        doc_list.appendChild(doc_node);
     347        Element dm_response_message = (Element) this.mr.process(dm_message);
     348        if (processErrorElements(dm_response_message, page_response)) {
     349            return result;
     350        }
     351
     352        String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
     353        Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
     354
     355        // Merge the metadata with the structure information
     356        NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
     357        for (int i = 0; i < doc_nodes.getLength(); i++) {
     358            GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
     359        }
     360        // get the top level doc metadata out
     361        Element doc_meta_response = (Element)dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
     362        Element top_doc_node = (Element)GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
     363        GSXML.mergeMetadataLists(the_document, top_doc_node);
     364       
     365        // Build a request to obtain some document content
     366        Element dc_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
     367        to = GSPath.appendLink(collection, "DocumentContentRetrieve");  // Hard-wired?
     368        Element dc_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
     369        dc_message.appendChild(dc_request);
     370       
     371
     372        // Create a parameter list to specify the request parameters - empty for now
     373        Element dc_param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
     374        if (service_params != null) {
     375            GSXML.addParametersToList(this.doc, dc_param_list, service_params);
     376        }
     377
     378        dc_request.appendChild(dc_param_list);
     379
     380        // get the content
     381        // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
     382        if (expand_document) {
     383            dc_request.appendChild(dm_doc_list);
     384        } else {
     385            dc_request.appendChild(basic_doc_list);
     386        }
     387        logger.debug("request = "+converter.getString(dc_message));
     388        Element dc_response_message = (Element) this.mr.process(dc_message);
     389        if (processErrorElements(dc_response_message, page_response)) {
     390            return result;
     391        }
     392
     393        Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
     394
     395        if (expand_document) {
     396            // Merge the content with the structure information
     397            NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
     398            for (int i = 0; i < doc_nodes.getLength(); i++) {
     399                Node content = GSXML.getChildByTagName((Element)dc_response_docs.item(i), "nodeContent");
     400                if (content != null) {
     401                    if (highlight_query_terms) {
     402                        content = highlightQueryTerms(request, (Element)content);
     403                    }
     404                    doc_nodes.item(i).appendChild(this.doc.importNode(content, true));
     405                }
     406                //GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
     407            }
     408        } else {
     409            //path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
     410            Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
     411            Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
     412            Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
     413           
     414            if (dc_response_doc_content == null) {
     415                // no content to add
     416                if (dc_response_doc_external !=null){
     417                    String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
     418                   
     419                    the_document.setAttribute("selectedNode", modified_doc_id);
     420                    the_document.setAttribute("external", dc_response_doc_external.getAttribute("external_link"));
     421                }
     422                return result;
     423            }
     424            if (highlight_query_terms) {
     425                dc_response_doc.removeChild(dc_response_doc_content);
     426               
     427                dc_response_doc_content = highlightQueryTerms(request, dc_response_doc_content);
     428                dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
     429            }
     430           
     431           
     432            if (provide_annotations) {
     433                String service_selected = (String)params.get(ENRICH_DOC_ARG);
     434                if (service_selected != null && service_selected.equals("1")) {
     435                    // now we can modifiy the response doc if needed
     436                    String enrich_service = (String)params.get(GSParams.SERVICE);
     437                    // send a message to the service
     438                    Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
     439                    Element enrich_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, lang, uid);
     440                    enrich_message.appendChild(enrich_request);
     441                    // check for parameters
     442                    HashMap e_service_params = (HashMap)params.get("s1");
     443                    if (e_service_params != null) {
     444                        Element enrich_pl = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
     445                        GSXML.addParametersToList(this.doc, enrich_pl, e_service_params);
     446                        enrich_request.appendChild(enrich_pl);
     447                    }
     448                    Element e_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
     449                    enrich_request.appendChild(e_doc_list);
     450                    e_doc_list.appendChild(this.doc.importNode(dc_response_doc, true));
     451                   
     452                    Node enrich_response = this.mr.process(enrich_message);
     453                   
     454                    String [] links = {GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM};
     455                    path = GSPath.createPath(links);
     456                    dc_response_doc_content = (Element)GSXML.getNodeByPath(enrich_response, path);
     457                   
     458                }
     459            } // if provide_annotations
     460
     461           
     462            // use the returned id rather than the sent one cos there may have
     463            // been modifiers such as .pr that are removed.
     464            String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
     465            the_document.setAttribute("selectedNode", modified_doc_id);
     466            if (has_dummy) {
     467                // change the id if necessary and add the content
     468                Element dummy_node = (Element)doc_nodes.item(0);
     469               
     470                dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
     471                dummy_node.appendChild(this.doc.importNode(dc_response_doc_content, true));
     472                // hack for simple type
     473                if (document_type.equals("simple")) {
     474                    // we dont want the internal docNode, just want the content and metadata in the document
     475                    // rethink this!!
     476                    the_document.removeChild(dummy_node);
     477
     478                    NodeList dummy_children = dummy_node.getChildNodes();
     479                    //for (int i=0; i<dummy_children.getLength(); i++) {
     480                    for (int i=dummy_children.getLength()-1; i>=0; i--) {
     481                        // special case as we don't want more than one metadata list
     482                        if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER)) {
     483                            GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
     484                        } else {
     485                            the_document.appendChild(dummy_children.item(i));
     486                        }
     487                    }
     488                }
     489            } else {
     490                // Merge the document content with the metadata and structure information
     491                for (int i = 0; i < doc_nodes.getLength(); i++) {
     492                    Node dn = doc_nodes.item(i);
     493                    String dn_id = ((Element)dn).getAttribute(GSXML.NODE_ID_ATT);
     494                    if (dn_id.equals(modified_doc_id)) {
     495                        dn.appendChild(this.doc.importNode(dc_response_doc_content, true));
     496                        break;
     497                    }
     498                }
     499            }
     500        }
     501        logger.debug("(DocumentAction) Page:\n" + this.converter.getPrettyString(result));
     502        return result;
    110503    }
    111504   
    112     String sibling_num = (String) params.get(GOTO_PAGE_ARG);
    113     if (sibling_num != null && !sibling_num.equals("")) {
    114         // we have to modify the doc name
    115         document_name = document_name+"."+sibling_num+".ss";
     505    /** tell the param class what its arguments are
     506    * if an action has its own arguments, this should add them to the params
     507    * object - particularly important for args that should not be saved */
     508    public boolean getActionParameters(GSParams params) {
     509        params.addParameter(GOTO_PAGE_ARG, false);
     510        params.addParameter(ENRICH_DOC_ARG, false);
     511        return true;
    116512    }
    117    
    118     boolean expand_document = false;
    119     String ed_arg = (String) params.get(GSParams.EXPAND_DOCUMENT);
    120     if (ed_arg != null && ed_arg.equals("1")) {
    121         expand_document = true;
     513
     514
     515    /** this method gets the collection description, the format info, the
     516    * list of enrich services, etc - stuff that is needed for the page,
     517    * but is the same whatever the query is - should be cached */
     518    protected  boolean getBackgroundData(Element page_response,
     519    String collection, String lang,
     520    String uid) {
     521
     522        // create a message to process - contains requests for the collection
     523        // description, the format element, the enrich services on offer
     524        // these could all be cached
     525        Element info_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
     526        String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
     527        // the format request - ignore for now, where does this request go to??
     528        Element format_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_FORMAT, path, lang, uid);
     529        info_message.appendChild(format_request);
     530
     531        // the enrich_services request - only do this if provide_annotations is true
     532
     533        if (provide_annotations) {
     534            Element enrich_services_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, "", lang, uid);
     535            enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
     536            info_message.appendChild(enrich_services_request);
     537        }
     538       
     539        Element info_response = (Element)this.mr.process(info_message);
     540
     541        // the collection is the first response
     542        NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
     543        Element format_resp = (Element) responses.item(0);
     544       
     545        Element format_elem = (Element)GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
     546        if (format_elem != null) {
     547            logger.debug("doc action found a format statement");
     548            // set teh format type
     549            format_elem.setAttribute(GSXML.TYPE_ATT, "display"); 
     550            page_response.appendChild(this.doc.importNode(format_elem, true));
     551        }
     552
     553        if (provide_annotations) {
     554            Element services_resp = (Element)responses.item(1);
     555
     556            // a new message for the mr
     557            Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
     558           
     559            NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
     560            boolean service_found = false;
     561            for (int j=0; j<e_services.getLength(); j++) {
     562                if (((Element)e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich")) {
     563                    Element s = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element)e_services.item(j)).getAttribute(GSXML.NAME_ATT), lang, uid);
     564                    enrich_message.appendChild(s);
     565                    service_found = true;
     566                }
     567            }
     568            if (service_found) {
     569                Element enrich_response = (Element)this.mr.process(enrich_message);
     570               
     571                NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
     572                Element service_list = this.doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
     573                for (int i=0; i<e_responses.getLength(); i++) {
     574                    Element e_resp = (Element)e_responses.item(i);
     575                    Element e_service = (Element)this.doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
     576                    e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
     577                    service_list.appendChild(e_service);
     578                }
     579                page_response.appendChild(service_list);
     580            }
     581        } // if provide_annotations
     582        return true;
     583       
    122584    }
    123        
    124 
    125     boolean expand_contents = false;
    126     if (expand_document) { // we always expand the contents with the text
    127         expand_contents = true;
    128     } else {
    129         String ec_arg = (String) params.get(GSParams.EXPAND_CONTENTS);
    130         if (ec_arg != null && ec_arg.equals("1")) {
    131         expand_contents = true;
    132         }
     585
     586    /** this involves a bit of a hack to get the equivalent query terms - has to requery the query service - uses the last selected service name. (if it ends in query). should this action do the query or should it send a message to the query action? but that will involve lots of extra stuff. also doesn't handle phrases properly - just highlights all the terms found in the text.
     587    */
     588    protected Element highlightQueryTerms(Element request, Element dc_response_doc_content) {
     589
     590        // do the query again to get term info
     591        Element cgi_param_list = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
     592        HashMap params = GSXML.extractParams(cgi_param_list, false);
     593       
     594        HashMap previous_params = (HashMap)params.get("p");
     595        if (previous_params == null) {
     596            return dc_response_doc_content;
     597        }
     598        String service_name = (String)previous_params.get(GSParams.SERVICE);
     599        if (service_name == null || !service_name.endsWith("Query")) { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
     600            logger.debug("invalid service, not doing highlighting");
     601            return dc_response_doc_content;
     602        }
     603        String collection = (String)params.get(GSParams.COLLECTION);
     604        String lang = request.getAttribute(GSXML.LANG_ATT);
     605        String uid = request.getAttribute(GSXML.USER_ID_ATT);
     606        String to = GSPath.appendLink(collection, service_name);
     607       
     608        Element mr_query_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
     609        Element mr_query_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
     610        mr_query_message.appendChild(mr_query_request);
     611       
     612        // paramList
     613        HashMap service_params = (HashMap)params.get("s1");
     614       
     615        Element query_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
     616        GSXML.addParametersToList(this.doc, query_param_list, service_params);
     617        mr_query_request.appendChild(query_param_list);
     618
     619        // do the query
     620        Element mr_query_response = (Element)this.mr.process(mr_query_message);
     621       
     622        String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM+GSXML.LIST_MODIFIER);
     623        Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
     624        if (query_term_list_element == null) {
     625            // no term info
     626            logger.error("No query term information.\n");
     627            return dc_response_doc_content;
     628        }
     629
     630        String content = GSXML.getNodeText(dc_response_doc_content);
     631
     632        String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
     633        Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
     634
     635        HashSet query_term_variants = new HashSet();
     636        NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
     637        if(equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
     638        {
     639            NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
     640            if(terms_nodelist != null && terms_nodelist.getLength() > 0)
     641            {
     642                for(int i = 0; i < terms_nodelist.getLength(); i++)
     643                {
     644                    String termValue = ((Element)terms_nodelist.item(i)).getAttribute("name");
     645                    String termValueU = null;
     646                    String termValueL = null;
     647                       
     648                    if(termValue.length() > 1)
     649                    {
     650                        termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
     651                        termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
     652                    }
     653                    else
     654                    {
     655                        termValueU = termValue.substring(0, 1).toUpperCase();
     656                        termValueL = termValue.substring(0, 1).toLowerCase();
     657                    }
     658                   
     659                    query_term_variants.add(termValueU);
     660                    query_term_variants.add(termValueL);
     661                }
     662            }
     663        }
     664        else
     665        {
     666            for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++) {
     667                Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
     668                String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
     669                for (int j = 0; j < equivalent_terms.length; j++) {
     670                    query_term_variants.add(equivalent_terms[j]);
     671                }
     672            }
     673        }
     674
     675        ArrayList phrase_query_term_variants_hierarchy = new ArrayList();
     676
     677        Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
     678        String performed_query = GSXML.getNodeText(query_element) + " ";
     679
     680        ArrayList phrase_query_p_term_variants_list = new ArrayList();
     681        int term_start = 0;
     682        boolean in_term = false;
     683        boolean in_phrase = false;
     684        for (int i = 0; i < performed_query.length(); i++) {
     685            char character = performed_query.charAt(i);
     686            boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
     687
     688            // Has a query term just started?
     689            if (in_term == false && is_character_letter_or_digit == true) {
     690                in_term = true;
     691                term_start = i;
     692            }
     693
     694            // Or has a term just finished?
     695            else if (in_term == true && is_character_letter_or_digit == false) {
     696                in_term = false;
     697                String term = performed_query.substring(term_start, i);
     698               
     699                Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
     700                if (term_element != null) {
     701                   
     702                    HashSet phrase_query_p_term_x_variants = new HashSet();
     703                   
     704                    NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
     705                    if(term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0)
     706                    {
     707                        String termValueU = null;
     708                        String termValueL = null;
     709                       
     710                        if(term.length() > 1)
     711                        {
     712                            termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
     713                            termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
     714                        }
     715                        else
     716                        {
     717                            termValueU = term.substring(0, 1).toUpperCase();
     718                            termValueL = term.substring(0, 1).toLowerCase();
     719                        }
     720                       
     721                        phrase_query_p_term_x_variants.add(termValueU);
     722                        phrase_query_p_term_x_variants.add(termValueL);
     723                    }
     724                    else
     725                    {
     726                        for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++) {
     727                            Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
     728                            String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
     729                            for (int k = 0; k < term_equivalent_terms.length; k++) {
     730                                phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
     731                            }
     732                        }
     733                    }
     734                    phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
     735                   
     736                    if (in_phrase == false) {
     737                        phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
     738                        phrase_query_p_term_variants_list = new ArrayList();
     739                    }
     740                }
     741            }
     742            // Watch for phrases (surrounded by quotes)
     743            if (character == '\"') {
     744                // Has a phrase just started?
     745                if (in_phrase == false) {
     746                    in_phrase = true;
     747                }
     748                // Or has a phrase just finished?
     749                else if (in_phrase == true) {
     750                    in_phrase = false;
     751                    phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
     752                }
     753
     754                phrase_query_p_term_variants_list = new ArrayList();
     755            }
     756        }
     757
     758        System.err.println(query_term_variants + " *** " + phrase_query_term_variants_hierarchy);
     759        return highlightQueryTermsInternal(content, query_term_variants, phrase_query_term_variants_hierarchy);
    133760    }
    134761
    135     //append site metadata
    136     addSiteMetadata( page_response, lang, uid);
    137 
    138     // get the additional data needed for the page
    139     getBackgroundData(page_response, collection, lang, uid);
    140     Element format_elem = (Element)GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
    141    
    142     // the_document is where all the doc info - structure and metadata etc
    143     // is added into, to be returned in the page
    144     Element the_document = this.doc.createElement(GSXML.DOCUMENT_ELEM);
    145     page_response.appendChild(the_document);
    146 
    147     // set the doctype from the cgi arg as an attribute
    148     the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
    149 
    150     // create a basic doc list containing the current node
    151     Element basic_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
    152     Element current_doc = this.doc.createElement(GSXML.DOC_NODE_ELEM);
    153     basic_doc_list.appendChild(current_doc);
    154     if (document_name.length()!=0){
    155         current_doc.setAttribute(GSXML.NODE_ID_ATT, document_name);
    156     }else if (has_href.length()!=0){
    157         current_doc.setAttribute(GSXML.NODE_ID_ATT, has_href);
    158         current_doc.setAttribute("externalURL", has_rl);
     762
     763    /**
     764    * Highlights query terms in a piece of text.
     765    */
     766    private Element highlightQueryTermsInternal(String content, HashSet query_term_variants, ArrayList phrase_query_term_variants_hierarchy)
     767    {
     768        // Convert the content string to an array of characters for speed
     769        char[] content_characters = new char[content.length()];
     770        content.getChars(0, content.length(), content_characters, 0);
     771
     772        // Now skim through the content, identifying word matches
     773        ArrayList word_matches = new ArrayList();
     774        int word_start = 0;
     775        boolean in_word = false;
     776        boolean preceding_word_matched = false;
     777        for (int i = 0; i < content_characters.length; i++) {
     778            boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
     779
     780            // Has a word just started?
     781            if (in_word == false && is_character_letter_or_digit == true) {
     782                in_word = true;
     783                word_start = i;
     784            }
     785
     786            // Or has a word just finished?
     787            else if (in_word == true && is_character_letter_or_digit == false) {
     788                in_word = false;
     789
     790                // Check if the word matches any of the query term equivalents
     791                String word = new String(content_characters, word_start, (i - word_start));
     792                if (query_term_variants.contains(word)) {
     793                    // We have found a matching word, so remember its location
     794                    word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
     795                    preceding_word_matched = true;
     796                }
     797                else {
     798                    preceding_word_matched = false;
     799                }
     800            }
     801        }
     802
     803        // Don't forget the last word...
     804        if (in_word == true) {
     805            // Check if the word matches any of the query term equivalents
     806            String word = new String(content_characters, word_start, (content_characters.length - word_start));
     807            if (query_term_variants.contains(word)) {
     808                // We have found a matching word, so remember its location
     809                word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
     810            }
     811        }
     812
     813        ArrayList highlight_start_positions = new ArrayList();
     814        ArrayList highlight_end_positions = new ArrayList();
     815
     816        // Deal with phrases now
     817        ArrayList partial_phrase_matches = new ArrayList();
     818        for (int i = 0; i < word_matches.size(); i++) {
     819            WordMatch word_match = (WordMatch) word_matches.get(i);
     820
     821            // See if any partial phrase matches are extended by this word
     822            if (word_match.preceding_word_matched) {
     823                for (int j = partial_phrase_matches.size() - 1; j >= 0; j--) {
     824                    PartialPhraseMatch partial_phrase_match = (PartialPhraseMatch) partial_phrase_matches.remove(j);
     825                    ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
     826                    HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
     827                    if (phrase_query_p_term_x_variants.contains(word_match.word)) {
     828                        partial_phrase_match.num_words_matched++;
     829
     830                        // Has a complete phrase match occurred?
     831                        if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size()) {
     832                            // Check for overlaps by looking at the previous highlight range
     833                            if (!highlight_end_positions.isEmpty()) {
     834                                int last_highlight_index = highlight_end_positions.size() - 1;
     835                                int last_highlight_end = ((Integer) highlight_end_positions.get(last_highlight_index)).intValue();
     836                                if (last_highlight_end > partial_phrase_match.start_position) {
     837                                    // There is an overlap, so remove the previous phrase match
     838                                    int last_highlight_start = ((Integer) highlight_start_positions.remove(last_highlight_index)).intValue();
     839                                    highlight_end_positions.remove(last_highlight_index);
     840                                    partial_phrase_match.start_position = last_highlight_start;
     841                                }
     842                            }
     843
     844                            highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
     845                            highlight_end_positions.add(new Integer(word_match.end_position));
     846                        }
     847                        // No, but add the partial match back into the list for next time
     848                        else {
     849                            partial_phrase_matches.add(partial_phrase_match);
     850                        }
     851                    }
     852                }
     853            }
     854            else {
     855                partial_phrase_matches.clear();
     856            }
     857
     858            // See if this word is at the start of any of the phrases
     859            for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++) {
     860                ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(p);
     861                HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
     862                if (phrase_query_p_term_1_variants.contains(word_match.word)) {
     863                    // If this phrase is just one word long, we have a complete match
     864                    if (phrase_query_p_term_variants_list.size() == 1) {
     865                        highlight_start_positions.add(new Integer(word_match.start_position));
     866                        highlight_end_positions.add(new Integer(word_match.end_position));
     867                    }
     868                    // Otherwise we have the start of a potential phrase match
     869                    else {
     870                        partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
     871                    }
     872                }
     873            }
     874        }
     875
     876        // Now add the annotation tags into the document at the correct points
     877        Element content_element = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
     878
     879        int last_wrote = 0;
     880        for (int i = 0; i < highlight_start_positions.size(); i++) {
     881            int highlight_start = ((Integer) highlight_start_positions.get(i)).intValue();
     882            int highlight_end = ((Integer) highlight_end_positions.get(i)).intValue();
     883
     884            // Print anything before the highlight range
     885            if (last_wrote < highlight_start) {
     886                String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
     887                content_element.appendChild(this.doc.createTextNode(preceding_text));
     888            }
     889
     890            // Print the highlight text, annotated
     891            if (highlight_end > last_wrote) {
     892                String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
     893                Element annotation_element = GSXML.createTextElement(this.doc, "annotation", highlight_text);
     894                annotation_element.setAttribute("type", "query_term");
     895                content_element.appendChild(annotation_element);
     896                last_wrote = highlight_end;
     897            }
     898        }
     899
     900        // Finish off any unwritten text
     901        if (last_wrote < content_characters.length) {
     902            String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
     903            content_element.appendChild(this.doc.createTextNode(remaining_text));
     904        }
     905
     906        return content_element;
    159907    }
    160908
    161     // Create a parameter list to specify the required structure information
    162     Element ds_param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
    163    
    164     if (service_params != null) {
    165         GSXML.addParametersToList(this.doc, ds_param_list, service_params);
     909
     910    static private class WordMatch
     911    {
     912        public String word;
     913        public int start_position;
     914        public int end_position;
     915        public boolean preceding_word_matched;
     916
     917        public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
     918        {
     919            this.word = word;
     920            this.start_position = start_position;
     921            this.end_position = end_position;
     922            this.preceding_word_matched = preceding_word_matched;
     923        }
    166924    }
    167925
    168     Element ds_param = null;
    169     boolean get_structure = false;
    170     boolean get_structure_info = false;
    171     if (document_type.equals("paged")) {
    172         get_structure_info = true;
    173         // get teh info needed for paged naviagtion
    174         ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
    175         ds_param_list.appendChild(ds_param);
    176         ds_param.setAttribute(GSXML.NAME_ATT, "info");
    177         ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
    178         ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
    179         ds_param_list.appendChild(ds_param);
    180         ds_param.setAttribute(GSXML.NAME_ATT, "info");
    181         ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
    182         ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
    183         ds_param_list.appendChild(ds_param);
    184         ds_param.setAttribute(GSXML.NAME_ATT, "info");
    185         ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
    186        
    187     } else if (document_type.equals("hierarchy")){
    188         get_structure = true;
    189         if (expand_contents) {
    190         ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
    191         ds_param_list.appendChild(ds_param);
    192         ds_param.setAttribute(GSXML.NAME_ATT, "structure");
    193         ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
    194         } else {
    195         // get the info needed for table of contents
    196         ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
    197         ds_param_list.appendChild(ds_param);
    198         ds_param.setAttribute(GSXML.NAME_ATT, "structure");
    199         ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
    200         ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
    201         ds_param_list.appendChild(ds_param);
    202         ds_param.setAttribute(GSXML.NAME_ATT, "structure");
    203         ds_param.setAttribute(GSXML.VALUE_ATT, "children");
    204         if (get_siblings) {
    205             ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
    206             ds_param_list.appendChild(ds_param);
    207             ds_param.setAttribute(GSXML.NAME_ATT, "structure");
    208             ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
    209         }
    210         }
    211     } else {
    212         // we dont need any structure
     926
     927    static private class PartialPhraseMatch
     928    {
     929        public int start_position;
     930        public int query_phrase_number;
     931        public int num_words_matched;
     932
     933        public PartialPhraseMatch(int start_position, int query_phrase_number)
     934        {
     935            this.start_position = start_position;
     936            this.query_phrase_number = query_phrase_number;
     937            this.num_words_matched = 1;
     938        }
    213939    }
    214 
    215     boolean has_dummy = false;
    216     if (get_structure || get_structure_info) {
    217 
    218         // Build a request to obtain the document structure
    219         Element ds_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
    220         String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
    221         Element ds_request = GSXML.createBasicRequest(this.doc,GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
    222         ds_message.appendChild(ds_request);
    223         ds_request.appendChild(ds_param_list);
    224        
    225         // create a doc_node_list and put in the doc_node that we are interested in
    226         ds_request.appendChild(basic_doc_list);
    227        
    228         // Process the document structure retrieve message
    229         Element ds_response_message = (Element) this.mr.process(ds_message);
    230         if (processErrorElements(ds_response_message, page_response)) {
    231         return result;
    232         }
    233 
    234         // get the info and print out
    235         String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
    236         path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
    237         path = GSPath.appendLink(path, "nodeStructureInfo");
    238         Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
    239         // get the doc_node bit
    240         if (ds_response_struct_info != null) {
    241         the_document.appendChild(this.doc.importNode(ds_response_struct_info, true));
    242         }
    243         path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
    244         path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
    245         path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
    246         Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
    247        
    248         if (ds_response_structure != null) {
    249         // add the contents of the structure bit into the_document
    250         NodeList structs = ds_response_structure.getChildNodes();
    251         for (int i=0; i<structs.getLength();i++) {
    252             the_document.appendChild(this.doc.importNode(structs.item(i), true));
    253         }
    254         } else {
    255         // no structure nodes, so put in a dummy doc node
    256         Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
    257         if (document_name.length()!=0){
    258             doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name);
    259         }else if (has_href.length()!=0){
    260             doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href);
    261             doc_node.setAttribute("externalURL", has_rl);
    262         }
    263         the_document.appendChild(doc_node);
    264         has_dummy = true;
    265         }
    266     } else { // a simple type - we dont have a dummy node for simple
    267         // should think about this more
    268         // no structure request, so just put in a dummy doc node
    269         Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
    270         if (document_name.length()!=0){
    271         doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name);
    272         }else if (has_href.length()!=0){
    273         doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href);
    274         doc_node.setAttribute("externalURL", has_rl);
    275         }
    276         the_document.appendChild(doc_node);
    277         has_dummy = true;
    278     }
    279    
    280     // Build a request to obtain some document metadata
    281     Element dm_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
    282     String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve");  // Hard-wired?
    283     Element dm_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
    284     dm_message.appendChild(dm_request);
    285     // Create a parameter list to specify the required metadata information
    286    
    287     HashSet meta_names = new HashSet();
    288     meta_names.add("Title"); // the default
    289     if (format_elem != null) {
    290         extractMetadataNames(format_elem, meta_names);
    291     }
    292    
    293     Element dm_param_list = createMetadataParamList(meta_names);
    294     if (service_params != null) {
    295         GSXML.addParametersToList(this.doc, dm_param_list, service_params);
    296     }
    297    
    298     dm_request.appendChild(dm_param_list);
    299    
    300    
    301     // create the doc node list for the metadata request
    302     Element dm_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
    303     dm_request.appendChild(dm_doc_list);
    304 
    305     // Add each node from the structure response into the metadata request
    306     NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
    307     for (int i = 0; i < doc_nodes.getLength(); i++) {
    308         Element doc_node = (Element) doc_nodes.item(i);
    309         String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
    310 
    311         // Add the documentNode to the list
    312         Element dm_doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
    313         dm_doc_list.appendChild(dm_doc_node);
    314         dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
    315         dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT,
    316                      doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
    317     }
    318 
    319     // we also want a metadata request to the top level document to get
    320     // assocfilepath - this could be cached too
    321     Element doc_meta_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
    322     dm_message.appendChild(doc_meta_request);
    323     Element doc_meta_param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
    324     if (service_params != null) {
    325         GSXML.addParametersToList(this.doc, doc_meta_param_list, service_params);
    326     }
    327 
    328     doc_meta_request.appendChild(doc_meta_param_list);
    329     Element doc_param = this.doc.createElement(GSXML.PARAM_ELEM);
    330     doc_meta_param_list.appendChild(doc_param);
    331     doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
    332     doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
    333 
    334     // create the doc node list for the metadata request
    335     Element doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
    336     doc_meta_request.appendChild(doc_list);
    337 
    338     Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
    339     // the node we want is the root document node
    340     if (document_name.length()!=0){
    341         doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name+".rt");
    342     }else if (has_href.length()!=0){
    343         doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href+".rt");
    344         doc_node.setAttribute("externalURL", has_rl);
    345     }
    346     doc_list.appendChild(doc_node);
    347     Element dm_response_message = (Element) this.mr.process(dm_message);
    348     if (processErrorElements(dm_response_message, page_response)) {
    349         return result;
    350     }
    351 
    352     String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
    353     Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
    354 
    355     // Merge the metadata with the structure information
    356     NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
    357     for (int i = 0; i < doc_nodes.getLength(); i++) {
    358       GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
    359     }
    360     // get the top level doc metadata out
    361     Element doc_meta_response = (Element)dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
    362     Element top_doc_node = (Element)GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
    363     GSXML.mergeMetadataLists(the_document, top_doc_node);
    364    
    365     // Build a request to obtain some document content
    366     Element dc_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
    367     to = GSPath.appendLink(collection, "DocumentContentRetrieve");  // Hard-wired?
    368     Element dc_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
    369     dc_message.appendChild(dc_request);
    370    
    371 
    372     // Create a parameter list to specify the request parameters - empty for now
    373     Element dc_param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
    374     if (service_params != null) {
    375         GSXML.addParametersToList(this.doc, dc_param_list, service_params);
    376     }
    377 
    378     dc_request.appendChild(dc_param_list);
    379 
    380     // get the content
    381     // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
    382     if (expand_document) {
    383         dc_request.appendChild(dm_doc_list);
    384     } else {
    385         dc_request.appendChild(basic_doc_list);
    386     }
    387     logger.debug("request = "+converter.getString(dc_message));
    388     Element dc_response_message = (Element) this.mr.process(dc_message);
    389     if (processErrorElements(dc_response_message, page_response)) {
    390         return result;
    391     }
    392 
    393     Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
    394 
    395     if (expand_document) {
    396         // Merge the content with the structure information
    397         NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
    398         for (int i = 0; i < doc_nodes.getLength(); i++) {
    399         Node content = GSXML.getChildByTagName((Element)dc_response_docs.item(i), "nodeContent");
    400         if (content != null) {
    401             doc_nodes.item(i).appendChild(this.doc.importNode(content, true));
    402         }
    403         //GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
    404         }
    405     } else {
    406         //path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
    407         Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
    408         Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
    409         Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
    410        
    411         if (dc_response_doc_content == null) {
    412         // no content to add
    413         if (dc_response_doc_external !=null){
    414             String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
    415            
    416             the_document.setAttribute("selectedNode", modified_doc_id);
    417             the_document.setAttribute("external", dc_response_doc_external.getAttribute("external_link"));
    418         }
    419         return result;
    420         }
    421         if (highlight_query_terms) {
    422         dc_response_doc.removeChild(dc_response_doc_content);
    423        
    424         dc_response_doc_content = highlightQueryTerms(request, dc_response_doc_content);
    425         dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
    426         }
    427        
    428    
    429     if (provide_annotations) {
    430         String service_selected = (String)params.get(ENRICH_DOC_ARG);
    431         if (service_selected != null && service_selected.equals("1")) {
    432         // now we can modifiy the response doc if needed
    433         String enrich_service = (String)params.get(GSParams.SERVICE);
    434         // send a message to the service
    435         Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
    436         Element enrich_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, lang, uid);
    437         enrich_message.appendChild(enrich_request);
    438         // check for parameters
    439         HashMap e_service_params = (HashMap)params.get("s1");
    440         if (e_service_params != null) {
    441             Element enrich_pl = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
    442             GSXML.addParametersToList(this.doc, enrich_pl, e_service_params);
    443             enrich_request.appendChild(enrich_pl);
    444         }
    445         Element e_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
    446         enrich_request.appendChild(e_doc_list);
    447         e_doc_list.appendChild(this.doc.importNode(dc_response_doc, true));
    448        
    449         Node enrich_response = this.mr.process(enrich_message);
    450        
    451         String [] links = {GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM};
    452         path = GSPath.createPath(links);
    453         dc_response_doc_content = (Element)GSXML.getNodeByPath(enrich_response, path);
    454        
    455         }
    456     } // if provide_annotations
    457 
    458    
    459     // use the returned id rather than the sent one cos there may have
    460     // been modifiers such as .pr that are removed.
    461     String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
    462     the_document.setAttribute("selectedNode", modified_doc_id);
    463     if (has_dummy) {
    464         // change the id if necessary and add the content
    465         Element dummy_node = (Element)doc_nodes.item(0);
    466        
    467         dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
    468         dummy_node.appendChild(this.doc.importNode(dc_response_doc_content, true));
    469         // hack for simple type
    470         if (document_type.equals("simple")) {
    471             // we dont want the internal docNode, just want the content and metadata in the document
    472         // rethink this!!
    473         the_document.removeChild(dummy_node);
    474 
    475         NodeList dummy_children = dummy_node.getChildNodes();
    476         //for (int i=0; i<dummy_children.getLength(); i++) {
    477         for (int i=dummy_children.getLength()-1; i>=0; i--) {
    478           // special case as we don't want more than one metadata list
    479           if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER)) {
    480             GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
    481           } else {
    482             the_document.appendChild(dummy_children.item(i));
    483           }
    484         }
    485         }
    486     } else {
    487         // Merge the document content with the metadata and structure information
    488         for (int i = 0; i < doc_nodes.getLength(); i++) {
    489         Node dn = doc_nodes.item(i);
    490         String dn_id = ((Element)dn).getAttribute(GSXML.NODE_ID_ATT);
    491         if (dn_id.equals(modified_doc_id)) {
    492             dn.appendChild(this.doc.importNode(dc_response_doc_content, true));
    493             break;
    494         }
    495         }
    496     }
    497     }
    498     logger.debug("(DocumentAction) Page:\n" + this.converter.getPrettyString(result));
    499     return result;
    500     }
    501    
    502     /** tell the param class what its arguments are
    503      * if an action has its own arguments, this should add them to the params
    504      * object - particularly important for args that should not be saved */
    505     public boolean getActionParameters(GSParams params) {
    506     params.addParameter(GOTO_PAGE_ARG, false);
    507     params.addParameter(ENRICH_DOC_ARG, false);
    508     return true;
    509     }
    510 
    511 
    512     /** this method gets the collection description, the format info, the
    513      * list of enrich services, etc - stuff that is needed for the page,
    514      * but is the same whatever the query is - should be cached */
    515     protected  boolean getBackgroundData(Element page_response,
    516                      String collection, String lang,
    517                      String uid) {
    518 
    519     // create a message to process - contains requests for the collection
    520     // description, the format element, the enrich services on offer
    521     // these could all be cached
    522     Element info_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
    523     String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
    524     // the format request - ignore for now, where does this request go to??
    525     Element format_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_FORMAT, path, lang, uid);
    526     info_message.appendChild(format_request);
    527 
    528     // the enrich_services request - only do this if provide_annotations is true
    529 
    530     if (provide_annotations) {
    531         Element enrich_services_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, "", lang, uid);
    532         enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
    533         info_message.appendChild(enrich_services_request);
    534     }
    535    
    536     Element info_response = (Element)this.mr.process(info_message);
    537 
    538     // the collection is the first response
    539     NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
    540     Element format_resp = (Element) responses.item(0);
    541    
    542     Element format_elem = (Element)GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
    543     if (format_elem != null) {
    544         logger.debug("doc action found a format statement");
    545         // set teh format type
    546         format_elem.setAttribute(GSXML.TYPE_ATT, "display"); 
    547         page_response.appendChild(this.doc.importNode(format_elem, true));
    548     }
    549 
    550     if (provide_annotations) {
    551         Element services_resp = (Element)responses.item(1);
    552 
    553         // a new message for the mr
    554         Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
    555        
    556         NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
    557         boolean service_found = false;
    558         for (int j=0; j<e_services.getLength(); j++) {
    559         if (((Element)e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich")) {
    560             Element s = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element)e_services.item(j)).getAttribute(GSXML.NAME_ATT), lang, uid);
    561             enrich_message.appendChild(s);
    562             service_found = true;
    563         }
    564         }
    565         if (service_found) {
    566         Element enrich_response = (Element)this.mr.process(enrich_message);
    567        
    568         NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
    569         Element service_list = this.doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
    570         for (int i=0; i<e_responses.getLength(); i++) {
    571             Element e_resp = (Element)e_responses.item(i);
    572             Element e_service = (Element)this.doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
    573             e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
    574             service_list.appendChild(e_service);
    575         }
    576         page_response.appendChild(service_list);
    577         }
    578     } // if provide_annotations
    579     return true;
    580        
    581     }
    582 
    583     /** this involves a bit of a hack to get the equivalent query terms - has to requery the query service - uses the last selected service name. (if it ends in query). should this action do the query or should it send a message to the query action? but that will involve lots of extra stuff. also doesn't handle phrases properly - just highlights all the terms found in the text.
    584      */
    585     protected Element highlightQueryTerms(Element request, Element dc_response_doc_content) {
    586 
    587     // do the query again to get term info
    588     Element cgi_param_list = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
    589     HashMap params = GSXML.extractParams(cgi_param_list, false);
    590    
    591     HashMap previous_params = (HashMap)params.get("p");
    592     if (previous_params == null) {
    593         return dc_response_doc_content;
    594     }
    595         String service_name = (String)previous_params.get(GSParams.SERVICE);
    596     if (service_name == null || !service_name.endsWith("Query")) { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
    597         logger.debug("invalid service, not doing highlighting");
    598         return dc_response_doc_content;
    599     }
    600     String collection = (String)params.get(GSParams.COLLECTION);
    601     String lang = request.getAttribute(GSXML.LANG_ATT);
    602     String uid = request.getAttribute(GSXML.USER_ID_ATT);
    603     String to = GSPath.appendLink(collection, service_name);
    604    
    605     Element mr_query_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
    606     Element mr_query_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
    607     mr_query_message.appendChild(mr_query_request);
    608    
    609     // paramList
    610     HashMap service_params = (HashMap)params.get("s1");
    611    
    612     Element query_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
    613     GSXML.addParametersToList(this.doc, query_param_list, service_params);
    614     mr_query_request.appendChild(query_param_list);
    615 
    616     // do the query
    617         Element mr_query_response = (Element)this.mr.process(mr_query_message);
    618    
    619     String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM+GSXML.LIST_MODIFIER);
    620     Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
    621     if (query_term_list_element == null) {
    622         // no term info
    623         logger.error("No query term information.\n");
    624         return dc_response_doc_content;
    625     }
    626 
    627     String content = GSXML.getNodeText(dc_response_doc_content);
    628 
    629     String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
    630     Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
    631 
    632     HashSet query_term_variants = new HashSet();
    633     NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
    634     for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++) {
    635         Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
    636         String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
    637         for (int j = 0; j < equivalent_terms.length; j++) {
    638         query_term_variants.add(equivalent_terms[j]);
    639         }
    640     }
    641 
    642     ArrayList phrase_query_term_variants_hierarchy = new ArrayList();
    643 
    644     Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
    645     String performed_query = GSXML.getNodeText(query_element) + " ";
    646 
    647     ArrayList phrase_query_p_term_variants_list = new ArrayList();
    648     int term_start = 0;
    649     boolean in_term = false;
    650     boolean in_phrase = false;
    651     for (int i = 0; i < performed_query.length(); i++) {
    652         char character = performed_query.charAt(i);
    653         boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
    654 
    655         // Has a query term just started?
    656         if (in_term == false && is_character_letter_or_digit == true) {
    657         in_term = true;
    658         term_start = i;
    659         }
    660 
    661         // Or has a term just finished?
    662         else if (in_term == true && is_character_letter_or_digit == false) {
    663         in_term = false;
    664         String term = performed_query.substring(term_start, i);
    665                
    666         Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
    667         if (term_element != null) {
    668            
    669             HashSet phrase_query_p_term_x_variants = new HashSet();
    670            
    671             NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
    672             for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++) {
    673             Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
    674             String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
    675             for (int k = 0; k < term_equivalent_terms.length; k++) {
    676                 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
    677             }
    678             }
    679             phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
    680            
    681             if (in_phrase == false) {
    682             phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
    683             phrase_query_p_term_variants_list = new ArrayList();
    684             }
    685         }
    686         }
    687         // Watch for phrases (surrounded by quotes)
    688         if (character == '\"') {
    689         // Has a phrase just started?
    690         if (in_phrase == false) {
    691             in_phrase = true;
    692         }
    693         // Or has a phrase just finished?
    694         else if (in_phrase == true) {
    695             in_phrase = false;
    696             phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
    697         }
    698 
    699         phrase_query_p_term_variants_list = new ArrayList();
    700         }
    701     }
    702 
    703     return highlightQueryTermsInternal(content, query_term_variants, phrase_query_term_variants_hierarchy);
    704     }
    705 
    706 
    707     /**
    708      * Highlights query terms in a piece of text.
    709      */
    710     private Element highlightQueryTermsInternal(String content, HashSet query_term_variants, ArrayList phrase_query_term_variants_hierarchy)
    711     {
    712     // Convert the content string to an array of characters for speed
    713     char[] content_characters = new char[content.length()];
    714     content.getChars(0, content.length(), content_characters, 0);
    715 
    716     // Now skim through the content, identifying word matches
    717     ArrayList word_matches = new ArrayList();
    718     int word_start = 0;
    719     boolean in_word = false;
    720     boolean preceding_word_matched = false;
    721     for (int i = 0; i < content_characters.length; i++) {
    722         boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
    723 
    724         // Has a word just started?
    725         if (in_word == false && is_character_letter_or_digit == true) {
    726         in_word = true;
    727         word_start = i;
    728         }
    729 
    730         // Or has a word just finished?
    731         else if (in_word == true && is_character_letter_or_digit == false) {
    732         in_word = false;
    733 
    734         // Check if the word matches any of the query term equivalents
    735         String word = new String(content_characters, word_start, (i - word_start));
    736         if (query_term_variants.contains(word)) {
    737             // We have found a matching word, so remember its location
    738             word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
    739             preceding_word_matched = true;
    740         }
    741         else {
    742             preceding_word_matched = false;
    743         }
    744         }
    745     }
    746 
    747     // Don't forget the last word...
    748     if (in_word == true) {
    749         // Check if the word matches any of the query term equivalents
    750         String word = new String(content_characters, word_start, (content_characters.length - word_start));
    751         if (query_term_variants.contains(word)) {
    752         // We have found a matching word, so remember its location
    753         word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
    754         }
    755     }
    756 
    757     ArrayList highlight_start_positions = new ArrayList();
    758     ArrayList highlight_end_positions = new ArrayList();
    759 
    760     // Deal with phrases now
    761     ArrayList partial_phrase_matches = new ArrayList();
    762     for (int i = 0; i < word_matches.size(); i++) {
    763         WordMatch word_match = (WordMatch) word_matches.get(i);
    764 
    765         // See if any partial phrase matches are extended by this word
    766         if (word_match.preceding_word_matched) {
    767         for (int j = partial_phrase_matches.size() - 1; j >= 0; j--) {
    768             PartialPhraseMatch partial_phrase_match = (PartialPhraseMatch) partial_phrase_matches.remove(j);
    769             ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
    770             HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
    771             if (phrase_query_p_term_x_variants.contains(word_match.word)) {
    772             partial_phrase_match.num_words_matched++;
    773 
    774             // Has a complete phrase match occurred?
    775             if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size()) {
    776                 // Check for overlaps by looking at the previous highlight range
    777                 if (!highlight_end_positions.isEmpty()) {
    778                 int last_highlight_index = highlight_end_positions.size() - 1;
    779                 int last_highlight_end = ((Integer) highlight_end_positions.get(last_highlight_index)).intValue();
    780                 if (last_highlight_end > partial_phrase_match.start_position) {
    781                     // There is an overlap, so remove the previous phrase match
    782                     int last_highlight_start = ((Integer) highlight_start_positions.remove(last_highlight_index)).intValue();
    783                     highlight_end_positions.remove(last_highlight_index);
    784                     partial_phrase_match.start_position = last_highlight_start;
    785                 }
    786                 }
    787 
    788                 highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
    789                 highlight_end_positions.add(new Integer(word_match.end_position));
    790             }
    791             // No, but add the partial match back into the list for next time
    792             else {
    793                 partial_phrase_matches.add(partial_phrase_match);
    794             }
    795             }
    796         }
    797         }
    798         else {
    799         partial_phrase_matches.clear();
    800         }
    801 
    802         // See if this word is at the start of any of the phrases
    803         for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++) {
    804         ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(p);
    805         HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
    806         if (phrase_query_p_term_1_variants.contains(word_match.word)) {
    807             // If this phrase is just one word long, we have a complete match
    808             if (phrase_query_p_term_variants_list.size() == 1) {
    809             highlight_start_positions.add(new Integer(word_match.start_position));
    810             highlight_end_positions.add(new Integer(word_match.end_position));
    811             }
    812             // Otherwise we have the start of a potential phrase match
    813             else {
    814             partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
    815             }
    816         }
    817         }
    818     }
    819 
    820     // Now add the annotation tags into the document at the correct points
    821     Element content_element = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
    822 
    823     int last_wrote = 0;
    824     for (int i = 0; i < highlight_start_positions.size(); i++) {
    825         int highlight_start = ((Integer) highlight_start_positions.get(i)).intValue();
    826         int highlight_end = ((Integer) highlight_end_positions.get(i)).intValue();
    827 
    828         // Print anything before the highlight range
    829         if (last_wrote < highlight_start) {
    830         String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
    831         content_element.appendChild(this.doc.createTextNode(preceding_text));
    832         }
    833 
    834         // Print the highlight text, annotated
    835         if (highlight_end > last_wrote) {
    836         String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
    837         Element annotation_element = GSXML.createTextElement(this.doc, "annotation", highlight_text);
    838         annotation_element.setAttribute("type", "query_term");
    839         content_element.appendChild(annotation_element);
    840         last_wrote = highlight_end;
    841         }
    842     }
    843 
    844     // Finish off any unwritten text
    845     if (last_wrote < content_characters.length) {
    846         String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
    847         content_element.appendChild(this.doc.createTextNode(remaining_text));
    848     }
    849 
    850     return content_element;
    851     }
    852 
    853 
    854     static private class WordMatch
    855     {
    856     public String word;
    857     public int start_position;
    858     public int end_position;
    859     public boolean preceding_word_matched;
    860 
    861     public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
    862     {
    863         this.word = word;
    864         this.start_position = start_position;
    865         this.end_position = end_position;
    866         this.preceding_word_matched = preceding_word_matched;
    867     }
    868     }
    869 
    870 
    871     static private class PartialPhraseMatch
    872     {
    873     public int start_position;
    874     public int query_phrase_number;
    875     public int num_words_matched;
    876 
    877     public PartialPhraseMatch(int start_position, int query_phrase_number)
    878     {
    879         this.start_position = start_position;
    880         this.query_phrase_number = query_phrase_number;
    881         this.num_words_matched = 1;
    882     }
    883     }
    884940}
Note: See TracChangeset for help on using the changeset viewer.