/**
*#########################################################################
* GSearchConnection.java - works with the demo-client for Greenstone 3,
* of the Greenstone digital library suite from the New Zealand Digital
* Library Project at the University of Waikato, New Zealand.
*
* Copyright (C) 2008 New Zealand Digital Library Project
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*########################################################################
*/
package org.greenstone.fedora.services;
import java.util.Vector;
import java.util.Iterator;
import java.util.Map;
import java.util.HashMap;
import java.net.URL;
import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.rpc.ServiceException;
import java.net.MalformedURLException;
import org.apache.axis.client.Call;
import org.apache.axis.client.Service;
import org.apache.log4j.Logger;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
/**
* Class GSearchConnection connects to FedoraGSearch's web services.
* FedoraGSearch offers indexing and full-text search functionality for
* Fedora repositories. Its search web service (method gFindObjects)
* returns the response of a search as XML.
* GSearchConnection offers more convenient methods that extract just
* the parts of the search results that FedoraGS3Connection needs and
* return them.
* @author ak19
*/
public class GSearchConnection implements FedoraToGS3Interface.Constants {
/** Logger for this class. */
private static final Logger LOG = Logger.getLogger(
GSearchConnection.class.getName());
/* Accessing the web services of Fedora Generic Search */
/** Namespace URI of the FedoraGSearch web service. The constructor combines
* it with SERVICE_NAME to form the QName of the service it connects to. */
protected static String NAMESPACE_URI = "http://server.fedoragsearch.defxws.dk";
/** Local name of the FedoraGSearch web service inside NAMESPACE_URI. */
protected static String SERVICE_NAME = "OperationsService";
/** The names of the methods we use of Fedora Generic Search's web services
* are declared here as static final Strings.
* G_FIND_OBJECTS is the operation name set on the Call before it is invoked;
* the same string is also the tag name of the element that
* getPIDsFromSearchResult() looks up in the response XML. */
protected static final String G_FIND_OBJECTS = "gfindObjects";
/* Some fixed string literals that will be encountered in the response XMLs
* that FedoraGSearch's method gFindObjects() returns. */
/** Name of the PID field. search(Map,...) also uses it as the field name of
* the query clause that restricts a search to Greenstone objects. */
protected static final String PID = "PID";
/** Attribute of the gfindObjects response element; its value is parsed as
* an int giving the number of hits. */
protected static final String HIT_TOTAL = "hitTotal";
/** Tag name of the elements (one per hit) nested in the gfindObjects
* response element. */
protected static final String OBJECT = "object";
/** Tag name of the elements nested inside each object element. */
protected static final String FIELD = "field";
/** NOTE(review): presumably the attribute of a field element that holds the
* field's name; its use is not visible in this part of the file - confirm. */
protected static final String NAME = "name";
/** Field name that search(Map,...) uses for title search terms. */
protected static final String DC_TITLE_FIELD = "dc.title";
/** Field name that search(Map,...) uses for full-text search terms. */
protected static final String FULLTEXT_FIELD = "ds.fulltext";
/** separator used internally to separate values of a search field */
protected static final String SPACE = " ";
/** The name of the Index wherein FedoraGSearch has indexed all the GS3 docs.
* This final member is public here so that others may read the indexName
* that this GSearchConnection works with. */
public final String indexName;
/** The Service object used to connect to the FedoraGSearch web services */
protected final Service service;
/** The Call object used to connect to the FedoraGSearch web services.
* It is created once in the constructor and reused by every invocation of
* gFindObjects(). */
protected final Call call;
/** The portName object used when connecting to FedoraGSearch's web services.
* The constructor sets it to the first port that the service reports. */
protected final QName portName;
/** A DocumentBuilder object used to construct and parse XML */
protected final DocumentBuilder builder;
/**
 * Establishes a connection to FedoraGSearch's web services as described by
 * the WSDL file found at the given url.
 * The service looked up in the WSDL is the one named SERVICE_NAME in
 * namespace NAMESPACE_URI, and the first port that this service reports is
 * the one all subsequent calls go through.
 * @param wsdlFileLocation is a String representing the url of the WSDL file
 * @param indexName is the name of the index that Fedora Generic Search
 * should work with (the index wherein the indexed GS3 documents have been
 * placed).
 * @throws MalformedURLException if wsdlFileLocation can't be turned into a URL
 * @throws ServiceException if the service or its Call object can't be
 * created, or if the service declares no port at all
 * @throws ParserConfigurationException if no DocumentBuilder can be created
 */
public GSearchConnection(String wsdlFileLocation, String indexName)
    throws MalformedURLException, ServiceException,
    ParserConfigurationException
{
    this.indexName = indexName;

    final URL wsdlURL = new URL(wsdlFileLocation);
    final QName serviceName = new QName(NAMESPACE_URI, SERVICE_NAME);
    service = new Service(wsdlURL, serviceName);

    // FIXME: can we just assume it's the first port of service SERVICE_NAME?
    // Port names vary between wsdls, so no fixed port name is looked up here.
    final Iterator ports = service.getPorts();
    if(!ports.hasNext()) {
        // Should never happen: a service without a port.
        // FIXME: possibly manually get the ports and choose
        // one containing "FEDORA" and "API-A" in its name?
        call = (Call)service.createCall();
        throw new ServiceException(this.getClass() + ": No port in wsdl file");
    }

    portName = (QName)ports.next();
    call = (Call) service.createCall(portName);
    LOG.debug("Wsdl file url: " + wsdlURL
        + "\nEndpoint location is: " + call.getTargetEndpointAddress());

    // The builder is kept around to create and parse XML documents
    builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
}
/**
 * Invokes the gfindObjects operation of the Fedora Generic Search web
 * services through this connection's Call object and returns the
 * operation's response, an XML String.
 *
 * Parameter types and return type of gfindObjects, as obtained from the
 * wsdl file for the Fedora Generic Search web services located at
 * http://localhost:8080/fedoragsearch/services/FgsOperations?wsdl :
 * <pre>
 * &lt;wsdl:message name="gfindObjectsRequest"&gt;
 *   &lt;wsdl:part name="query" type="xsd:string"/&gt;
 *   &lt;wsdl:part name="sort" type="xsd:string"/&gt;
 *   &lt;wsdl:part name="hitPageStart" type="xsd:int"/&gt;
 *   &lt;wsdl:part name="hitPageSize" type="xsd:int"/&gt;
 *   &lt;wsdl:part name="snippetsMax" type="xsd:int"/&gt;
 *   &lt;wsdl:part name="fieldMaxLength" type="xsd:int"/&gt;
 *   &lt;wsdl:part name="indexName" type="xsd:string"/&gt;
 *   &lt;wsdl:part name="resultPageXslt" type="xsd:string"/&gt;
 * &lt;/wsdl:message&gt;
 *
 * &lt;wsdl:message name="gfindObjectsResponse"&gt;
 *   &lt;wsdl:part name="gfindObjectsReturn" type="xsd:string"/&gt;
 * &lt;/wsdl:message&gt;
 *
 * &lt;wsdl:operation name="gfindObjects"
 *   parameterOrder="query sort hitPageStart hitPageSize snippetsMax
 *   fieldMaxLength indexName resultPageXslt"&gt;
 * </pre>
 * Note that the parameter order shown above is the one of Fedora 2's
 * GSearch. This method passes the arguments in the order that GSearch 2.2
 * of Fedora 3 takes them: query, hitPageStart, hitPageSize, snippetsMax,
 * fieldMaxLength, indexName, sort, resultPageXslt.
 *
 * The fieldMaxLength argument is not a parameter of this method; a fixed
 * value of 100 is always sent.
 *
 * There's no example on how to call gfindObjects with parameters. In
 * particular, it is not known what values the parameter sort can take.
 *
 * @param searchFieldedTerms the query: the search terms, optionally
 * prefixed with the names of the fields to search them in
 * @param sort passed on as the sort argument of gfindObjects
 * @param hitPageStart passed on as the hitPageStart argument
 * @param hitPageSize passed on as the hitPageSize argument
 * @param snippetsMax passed on as the snippetsMax argument
 * @param indexName name of the index to search in
 * @param resultPageXslt passed on as the resultPageXslt argument
 * @return the response of gfindObjects as a String of XML
 * @throws Exception whatever invoking the web service operation throws
 */
protected String gFindObjects(String searchFieldedTerms, String sort,
    int hitPageStart, int hitPageSize, int snippetsMax,
    /*int fieldMaxLength,*/ String indexName, String resultPageXslt) throws Exception
{
    // The same Call instance serves every request on the port chosen in
    // the constructor; only the operation name is (re)set per invocation.
    call.setOperationName(G_FIND_OBJECTS);

    // Field values in the response that exceed this limit are truncated.
    // The only part of the response we rely on is the PID of each matching
    // document, so the limit merely has to suffice for long PIDs.
    final int fieldMaxLength = 100;

    // Argument order of GSearch 2.2 (Fedora 3). Fedora 2's GSearch expected
    // sort as the second argument instead.
    final Object[] arguments = new Object[] {
        searchFieldedTerms,
        Integer.valueOf(hitPageStart),
        Integer.valueOf(hitPageSize),
        Integer.valueOf(snippetsMax),
        Integer.valueOf(fieldMaxLength),
        indexName,
        sort,
        resultPageXslt
    };
    return (String)call.invoke(arguments);
}
/**
 * Method that performs a search for the given searchTerm inside the given
 * indexed field.
 * @param searchFieldName is the name of the indexed field within which the
 * given searchTerm is to be searched for (for instance ds.fulltext or
 * dc.title). If it is null or the empty String, the searchTerm is sent to
 * FedoraGSearch without any field prefix.
 * @param searchTerm is the term to be searched for.
 * @param hitPageStart is the page of search results to start returning.
 * @param hitPageSize is the number of search result pages to return,
 * starting from hitPageStart.
 * @param snippetsMax is the maximum number of separate snippets containing
 * the searchTerm that are to be returned. (snippetsMax or a fewer number of
 * occurrences of the word in the text will be returned)
 * @return the XML (in string format) returned from Fedora Generic Search's
 * gfindObjects method
 * @throws Exception whatever the call to gfindObjects throws
 */
public String search(String searchFieldName, String searchTerm,
    int hitPageStart, int hitPageSize, int snippetsMax) throws Exception
{
    final String sort = ""; // returns results from highest to lowest rank
    final String resultPageXslt = "";

    // When a fieldname is given to search in (ds.fulltext, dc.title), then
    // prepend that followed by a COLON to the searchTerm. A null fieldname
    // is treated like an empty one instead of causing a NullPointerException.
    final boolean hasFieldName
        = searchFieldName != null && !searchFieldName.equals("");
    final String fullSearchTerm = hasFieldName
        ? (searchFieldName + ":" + searchTerm)
        : searchTerm;

    return gFindObjects(fullSearchTerm, sort,
        hitPageStart, hitPageSize, snippetsMax,
        indexName, resultPageXslt);
}
/**
 * Performs a fielded search from a Map of search fields to search terms.
 *
 * FedoraGSearch accepts a query of the form:
 * {@code "cyclone val" "Gender Inequalities" ds.fulltext:"cyclone val"
 * ds.fulltext:"worst storm"}
 * where the first two phrases are searched for in all indexed fields,
 * (in this case dc.title and ds.fulltext), while the last two are
 * searched for in the ds.fulltext field.
 * Another example:
 * {@code gender dc.title:interview ds.fulltext:"cyclone val"}
 * titles and fulltexts are searched for "gender", while title index
 * is searched for "interview" and fulltexts are searched for the phrase
 * "cyclone val"
 *
 * The query built here always ends with a clause on the PID field that
 * restricts the search to the Greenstone digital objects stored in Fedora,
 * and the search is always run with a snippetsMax of 0.
 *
 * @param fieldsToSearchTerms is a Map of (searchfield,
 * associated-searchTerms) pairs. Three of its keys are read: the ones for
 * allfields, titles and (full)text. The value for each is a comma-separated
 * list of search terms (words or phrases) in that field.
 * Internally the field names get converted to what FedoraGSearch's
 * gfindObjects understands: titles becomes dc.title, text becomes
 * ds.fulltext and allfields becomes nothing.
 * @param hitPageStart is the page of search results to start returning.
 * @param hitPageSize is the number of search result pages to return,
 * starting from hitPageStart.
 * @return the XML (in string format) returned from Fedora Generic Search's
 * gfindObjects method, or the empty String if the Map provided nothing
 * to search on
 * @throws Exception whatever the call to gfindObjects throws
 */
public String search(Map fieldsToSearchTerms,
    int hitPageStart, int hitPageSize)
    throws Exception
{
    LOG.debug("In FedoraGS3's GSearchConnection.search(Map,...)");

    // Of the Map's (key, value) entries only these three are dealt with.
    // Each value is a comma separated list of terms, where a term is
    // either a word OR a phrase.
    final String rawAllFields = (String)fieldsToSearchTerms.get(ALL_FIELDS);
    final String rawTitles = (String)fieldsToSearchTerms.get(ALL_TITLES);
    final String rawFulltexts = (String)fieldsToSearchTerms.get(FULLTEXT);

    // Turn each list into its query form: phrases quoted, terms separated
    // by spaces. Terms for ALL_FIELDS get no field name.
    final String unfieldedPart
        = formatSearchTermsInField(rawAllFields, ALL_FIELDS);
    final String titlePart
        = formatSearchTermsInField(rawTitles, DC_TITLE_FIELD);
    final String fulltextPart
        = formatSearchTermsInField(rawFulltexts, FULLTEXT_FIELD);

    final String userQuery = unfieldedPart + titlePart + fulltextPart;
    if(userQuery.trim().length() == 0) {
        return ""; // nothing to search on
    }

    // Finally, restrict the search to the Greenstone digital objects
    // stored in Fedora.
    //! Everything after the colon in the pid is ignored by FedoraGSearch:
    // "PID:\"greenstone:gs2mgdemo\"" would ignore "gs2mgdemo".
    final String greenstoneOnly
        = PID + FedoraGS3DL.COLON + FedoraGS3DL.GREENSTONE;

    // tags interfere when PID field is searched on, so snippetsMax is 0
    return search(userQuery + greenstoneOnly, hitPageStart, hitPageSize, 0);
}
/** Each field is a comma separated list of terms that may be either a word
 * OR a phrase. We're going to separate each term from the list, and put
 * quotes around phrases, then combine all the terms together again with
 * spaces to separate them. Every term is individually prefixed with the
 * name of the field it is to be searched in, since in a query of the form
 * {@code dc.title:interview storm} only the first term would be restricted
 * to the field. Examples:
 * <pre>
 *   ("gender,cyclone val", ALL_FIELDS)     gives  gender "cyclone val"
 *   ("interview, worst storm", "dc.title") gives  dc.title:interview dc.title:"worst storm"
 * </pre>
 * Each term, including the last one, is followed by a single space so that
 * the results for several fields can simply be concatenated.
 *
 * This is required to facilitate fielded searching with fedoraGSearch.
 * @param field is a comma separated list of search terms (corresponding
 * to one fieldName) to be reorganised. Whitespace around a term is dropped
 * and empty terms are skipped. May be null.
 * @param fieldName is the name of the field to prepend to each term of the
 * reorganised field value. FieldName ALL_FIELDS is ignored, as is a null or
 * empty fieldName.
 * @return parameter field reorganised such that terms that are phrases
 * are in quotes and each term is followed by a space; the empty String if
 * field is null or contains no terms.
 */
protected String formatSearchTermsInField(String field, String fieldName)
{
    if(field == null) { // no value given for this field
        return "";
    }

    // Every field other than allfields has a "name:" prefix on its terms
    final boolean unprefixed = fieldName == null
        || fieldName.length() == 0
        || fieldName.equals(ALL_FIELDS);
    final String prefix = unprefixed ? "" : (fieldName + ":");

    final StringBuilder formatted = new StringBuilder();
    final String[] terms = field.split(",");
    for(int i = 0; i < terms.length; i++) {
        final String term = terms[i].trim();
        if(term.length() == 0) {
            // e.g. "a,,b" or an empty field: nothing to search for here,
            // and a dangling "name:" would corrupt the query
            continue;
        }

        // a term already wrapped in quotes must not be quoted a second time
        final boolean alreadyQuoted = term.length() >= 2
            && term.charAt(0) == '"'
            && term.charAt(term.length() - 1) == '"';

        formatted.append(prefix);
        // if it contains a space, then the term's a phrase: put it in quotes
        if(term.indexOf(SPACE) != -1 && !alreadyQuoted) {
            formatted.append('"').append(term).append('"');
        } else {
            formatted.append(term);
        }
        formatted.append(SPACE);
    }
    return formatted.toString();
}
/**
 * Uses FedoraGSearch to perform a search where the query is embedded in
 * fieldedSearchTerms, which not only provides the terms to search on, but
 * also the fields to search the (various) given terms in.
 * The query is logged at debug level and then handed unchanged to
 * gfindObjects, together with this connection's indexName and empty
 * sort and resultPageXslt arguments.
 * @param fieldedSearchTerms is the String specifying all the search terms
 * with their fields (or no field if it should search for the terms in
 * all fields). The terms with no associated search-fields should come first.
 * Search terms may be in quotes.
 * @param hitPageStart is the page of search results to start returning.
 * @param hitPageSize is the number of search result pages to return,
 * starting from hitPageStart.
 * @param snippetsMax is the maximum number of separate snippets containing
 * the searchTerm (snippetsMax number of occurrences of the word in the text)
 * returned.
 * @return the XML (in string format) returned from Fedora Generic Search's
 * gfindObjects method
 * @throws Exception whatever the call to gfindObjects throws
 */
public String search(String fieldedSearchTerms,
    int hitPageStart, int hitPageSize, int snippetsMax) throws Exception
{
    LOG.debug("In method search(String fieldedSearchTerms,...). Query is:\n"
        + fieldedSearchTerms);

    return gFindObjects(
        fieldedSearchTerms,
        "",     // sort: returns results from highest to lowest rank
        hitPageStart, hitPageSize, snippetsMax,
        indexName,
        "");    // resultPageXslt
}
/** Call this method with the return value of calling search().
* Search results are returned in GSearch's XML response format,
* containing information that includes the PIDs of the documents that
* matched the search. These PIDs are returned in the array.
* @param collectionName is the name of the collection to restrict the
* search results by. If it's "", then results from all collections are
* returned. Generally, don't want to pass "", because, theoretically,
* all indexed collections in the repository could be considered and
* not all of them may be Greenstone collections. If all Greenstone
* collections should be searched for, pass "greenstone" as the
* collection name instead.
* @param searchResult is the Fedora Generic Search XML response returned
* from performing a gfindObjects() operation.
* @return an array of the pids of documents found for the search. */
public String[] getPIDsFromSearchResult(String collectionName,
String searchResult)
throws Exception
{
final String[] empty = {};
if(searchResult.equals("")) {
return empty;
}
//
//
//
//
//
//
//
// 1. Get documentElement, which is
Element resultPage = FedoraCommons.getResponseAsDOM(builder, searchResult);
// 2. find the hitTotal value which is the number of results
// it's an attribute of the sole compulsory element
int hitTotal = 0;
Element gfindObjectsEl
= (Element)resultPage.getElementsByTagName(G_FIND_OBJECTS).item(0);
String value = gfindObjectsEl.getAttribute(HIT_TOTAL);
hitTotal = Integer.parseInt(value);
if(hitTotal == 0) {
return new String[]{};
}
// Our resulting list of pids will be no more than hitTotal,
// but may be fewer if we constrain the results to a collection
Vector pidsInCollection = new Vector(hitTotal);
// Returns a NodeList of all descendant Elements with object tagname
NodeList objects = gfindObjectsEl.getElementsByTagName(OBJECT);
for(int i = 0; i < objects.getLength(); i++) {
// should be the case that pids.length == (digital)objects.getLength()
// get the PID of each object
Element object = (Element)objects.item(i);
NodeList fields = object.getElementsByTagName(FIELD);
for(int j = 0; j < fields.getLength(); j++) {
// find the sole of