/** *######################################################################### * GSearchConnection.java - works with the demo-client for Greenstone 3, * of the Greenstone digital library suite from the New Zealand Digital * Library Project at the * University of Waikato, New Zealand. *

* Copyright (C) 2008 New Zealand Digital Library Project *

* This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. *

* This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. *######################################################################## */ package org.greenstone.fedora.services; import java.util.Vector; import java.util.Iterator; import java.util.Map; import java.util.HashMap; import java.net.URL; import javax.xml.namespace.QName; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.rpc.ServiceException; import java.net.MalformedURLException; import org.apache.axis.client.Call; import org.apache.axis.client.Service; import org.apache.log4j.Logger; import javax.xml.parsers.ParserConfigurationException; import org.w3c.dom.Element; import org.w3c.dom.NodeList; /** * Class GSearchConnection connects to FedoraGSearch's web services. * FedorGSearch offers indexing and full-text search functionality for * Fedora repositories. Its search web service (method gFindObjects) * returns the response of a search as XML. * GSearchConnection offers more convenient methods that extract just * the parts of search results that FedoraGS3Connection needs and returns * that. * @author ak19 */ public class GSearchConnection implements FedoraToGS3Interface.Constants { /** Logger for this class. */ private static final Logger LOG = Logger.getLogger( GSearchConnection.class.getName()); /* Accessing the web services of Fedora Generic Search */ protected static String NAMESPACE_URI = "http://server.fedoragsearch.defxws.dk"; protected static String SERVICE_NAME = "OperationsService"; /** The names of the methods we use of Fedora Generic Search's web services * are declared here as static final Strings. */ protected static final String G_FIND_OBJECTS = "gfindObjects"; /* Some fixed string literals that will be encountered in the response XMLs * that FedoraGSearch's method gFindObjects() returns. */ protected static final String PID = "PID"; protected static final String HIT_TOTAL = "hitTotal"; protected static final String OBJECT = "object"; protected static final String FIELD = "field"; protected static final String NAME = "name"; protected static final String DC_TITLE_FIELD = "dc.title"; protected static final String FULLTEXT_FIELD = "ds.fulltext"; /** separator used internally to separate values of a search field */ protected static final String SPACE = " "; /** The name of the Index wherein FedoraGSearch has indexed all the GS3 docs. * This final member is public here so that others may read the indexName * that this GSearchConnection works with. */ public final String indexName; /** The Service object used to connect to the FedoraGSearch web services */ protected final Service service; /** The Call object used to connect to the FedoraGSearch web services */ protected final Call call; /** The portName object used when connecting to FedoraGSearch's web services */ protected final QName portName; /** A DocumentBuilder object used to construct and parse XML */ protected final DocumentBuilder builder; /** Constructor that takes a String representing the url of the WSDL * file for FedoraGSearch's web services, and tries to establish a * connection to those web services. * @param wsdlFileLocation is a String representing the url of the WSDL file * @param indexName is the name of the index that Fedora Generic Search * should work with (the index wherein the indexed GS3 documents have been * placed). */ public GSearchConnection(String wsdlFileLocation, String indexName) throws MalformedURLException, ServiceException, ParserConfigurationException { this.indexName = indexName; URL wsdlURL = new URL(wsdlFileLocation); service = new Service(wsdlURL, new QName(NAMESPACE_URI, SERVICE_NAME)); //call = (Call) service.createCall(new QName(NAMESPACE_URI, PORT_NAME)); Iterator i = service.getPorts(); // FIXME: can we just assume it's the first port of service SERVICE_NAME? // Do we need to work out which port to get??? Remember, the port names // vary between wsdls though! if(i.hasNext()) { portName = (QName)i.next(); call = (Call) service.createCall(portName); String endpointLocation = call.getTargetEndpointAddress(); LOG.debug("Wsdl file url: " + wsdlURL + "\nEndpoint location is: " + endpointLocation); } else { // should never happen: a service without a port // portName = null; call = (Call)service.createCall(); // FIXME: possibly manually get the ports and choose // one containing "FEDORA" and "API-A" in its name? throw new ServiceException(this.getClass() + ": No port in wsdl file"); } // we can set the portName which remains constant for the various methods // call.setPortName(portName); DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); builder = factory.newDocumentBuilder(); // to create XML docs } /** * Method to invoke gfindObjects operation of Fedora Generic Search * web services. * * Parameter types, parameter order and return type of gFindObjects are as * obtained from the wsdl file for the Fedora Generic Search web services * located at: * http://localhost:8080/fedoragsearch/services/FgsOperations?wsdl * <wsdl:message name="gfindObjectsRequest"> * <wsdl:part name="query" type="xsd:string"/> * <wsdl:part name="sort" type="xsd:string"/> * <wsdl:part name="hitPageStart" type="xsd:int"/> * <wsdl:part name="hitPageSize" type="xsd:int"/> * <wsdl:part name="snippetsMax" type="xsd:int"/> * <wsdl:part name="fieldMaxLength" type="xsd:int"/> * <wsdl:part name="indexName" type="xsd:string"/> * <wsdl:part name="resultPageXslt" type="xsd:string"/> * </wsdl:message> * * <wsdl:message name="gfindObjectsResponse"> * <wsdl:part name="gfindObjectsReturn" type="xsd:string"/> * </wsdl:message> * * <wsdl:operation name="gfindObjects" * parameterOrder="query sort hitPageStart hitPageSize snippetsMax * fieldMaxLength indexName resultPageXslt"> * * This method works: it searches the dc.title field of our FedoraIndex * for the term (e.g. "interview") and the result returned is an XML String. * * There's no example on how to call gFindObjects with parameters. In * particular, I don't know what values the parameter sort can take. * But topazproject has an example on how to call updateIndex(). * @see An example on how to call updateIndex() with parameters * @see Axis Service class * @see Axis RPC Call, for specification of interface Call * @see Axis client Call class, for implementation of interface Call */ protected String gFindObjects(String searchFieldedTerms, String sort, int hitPageStart, int hitPageSize, int snippetsMax, /*int fieldMaxLength,*/ String indexName, String resultPageXslt) throws Exception { // "Prefills as much info from the WSDL as it can. Right now it's SOAPAction, // operation qname, parameter types and return type of the Web Service. // This method considers that port name and target endpoint address have // already been set. This is useful when you want to use the same Call instance // for several calls on the same Port. NOTE: Not part of JAX-RPC specification." //call.removeAllParameters(); // no need for this when using setOpName below call.setOperationName(G_FIND_OBJECTS); // Max num of chars in field vals returned. Since return values exceeding // maxlength will be truncated, ensure length suffices for long PIDs returned. // The only element of the response XML we'll be using is the PID of the document // in which the searchTerm occurred. final int fieldMaxLength = 100; // NOT TRUE: max length in words of field values // returned. E.g. snippet sizes will be reduced to fieldMaxLength words too. // This is the method call for Fedora 2's GSearch //String valueFound =(String)call.invoke( new Object[] { // searchFieldedTerms, sort, hitPageStart, hitPageSize, snippetsMax, // fieldMaxLength, indexName, resultPageXslt} ); // The method call for GSearch 2.2 of Fedora 3 takes the args in a different order: String valueFound =(String)call.invoke( new Object[] { searchFieldedTerms, hitPageStart, hitPageSize, snippetsMax, fieldMaxLength, indexName, sort, resultPageXslt} ); // for debugging //javax.swing.JOptionPane.showMessageDialog(null, "GSearchConnection.gFindObjects:" + valueFound); return valueFound; } /** * Method that performs a search for the given searchTerm inside the given * indexed field. * @param searchFieldName is the name of the indexed field within which the * given searchTerm is to be searched for. * @param searchTerm is the term to be searched for. * @param hitPageStart is the page of search results to start returning. * @param hitPageSize is the number of search result pages to return, * starting from hitPageStart. * @param snippetsMax is the maximum number of separate snippets containing * the searchTerm that are to be returned. (snippetsMax or a fewer number of * occurrences of the word in the text will be returned) */ public String search(String searchFieldName, String searchTerm, int hitPageStart, int hitPageSize, int snippetsMax) throws Exception { final String sort = ""; // returns results from highest to lowest rank final String resultPageXslt = ""; // when a fieldname is given to search in (ds.fulltext, dc.title) // then prepend that followed by a COLON to the searchTerm. final String fullSearchTerm = searchFieldName.equals("") ? searchTerm : (searchFieldName+":"+searchTerm); return gFindObjects(fullSearchTerm, sort, hitPageStart, hitPageSize, snippetsMax, indexName, resultPageXslt); } /** * FedoraGSearch accepts a query of the form: * <"cyclone val" "Gender Inequalities" ds.fulltext:"cyclone val" * ds.fulltext:"worst storm"> * where the first two phrases are searched for in all indexed fields, * (in this case dc.title and ds.fulltext), while the last two are * searched for in the ds.fulltext field. * Another example: * <gender dc.title:interview ds.fulltext:"cyclone val"> * titles and fulltexts are searched for "gender", while title index * is searched for "interview" and fulltexts are searched for the phrase * "cyclone val" * @param fieldsToSearchTerms is a Hashmap of searchfields and * associated search terms (words or phrases). The terms are in a * comma-separated list. fieldsToSearchTerms is a Hashmap of * (Searchfields, associated-searchTerms) pairs. It can contain 3 * searchfields: allfields, titles, text. The value for each is a * comma-separated list of search terms in that field. * Internally the field names get converted to what FedoraGSearch's * gfindObjects understands: titles becomes dc.title:, text becomes * ds.fulltext and allfields becomes nothing. * @param hitPageStart is the page of search results to start returning. * @param hitPageSize is the number of search result pages to return, * starting from hitPageStart. * @return the XML (in string format) returned from Fedora Generic Search's * gfindObjects method * */ public String search(Map fieldsToSearchTerms, int hitPageStart, int hitPageSize) throws Exception { LOG.debug("In FedoraGS3's GSearchConnection.search(Map,...)"); // HashMap consists of several (key, value) entries, 3 of // which will be dealt with here: // - allfields, // - titles, // - (full)text, // We need to obtain each value and change the separator to space: String allfields = (String)fieldsToSearchTerms.get(ALL_FIELDS); String titles = (String)fieldsToSearchTerms.get(ALL_TITLES); String fulltexts = (String)fieldsToSearchTerms.get(FULLTEXT); // Each field is a comma separated list of terms that may be // either a word OR a phrase. // We're going to separate each term from the list, // and put quotes around phrases, then combine all the terms // together again with spaces to separate them. allfields = formatSearchTermsInField(allfields, ALL_FIELDS); // ALL_FIELDS has no field name titles = formatSearchTermsInField(titles, DC_TITLE_FIELD); fulltexts = formatSearchTermsInField(fulltexts, FULLTEXT_FIELD); String fullSearchTerm = allfields + titles + fulltexts; if(fullSearchTerm.trim().equals("")) { // nothing to search on return ""; } // Finally, restrict the search to the Greenstone digital objects // stored in Fedora final String greenstonePID = PID + FedoraGS3DL.COLON + FedoraGS3DL.GREENSTONE; //"PID:\"greenstone\""; fullSearchTerm += greenstonePID; //! Everything after the colon in the pid is ignored by FedoraGSearch: // "PID:\"greenstone:gs2mgdemo\""; // ignores "gs2mgdemo" // tags interfere when PID field is searched on, set it to 0 return search(fullSearchTerm, hitPageStart, hitPageSize, 0); // return search(fullSearchTerm, hitPageStart, hitPageSize, snippetsMax); } /** Each field is a comma separated list of terms that may be either a word * OR a phrase. We're going to separate each term from the list, and put * quotes around phrases, then combine all the terms together again with * spaces to separate them. Examples: *
dc.title:"a phrase" word
	 * dc.fulltext: "cyclone val"
	 * (ALL_FIELDS) interview gender
* This is required to facilitate fielded searching with fedoraGSearch. * @param field is a comma separated list of search terms (corresponding * to one fieldName) to be reorganised * @param fieldName is the name of the field to prepend to the reorganised * field value. FieldName ALL_FIELDS is ignored. * @return parameter field reorganised such that terms that are phrases * are in quotes and each term is separated by a space from the previous one. */ protected String formatSearchTermsInField(String field, String fieldName) { if(field != null) { // check that the field isn't empty //LOG.debug("field: " + field); String[] terms = field.split(","); field = ""; // we'll build it up again for(int i = 0; i < terms.length; i++) { // if it contains a space, then the term's a phrase, // put it in quotes if(terms[i].indexOf(SPACE) != -1) { terms[i] = "\"" + terms[i] + "\""; } field = field + terms[i] + SPACE; } // Prefix it with the name of the field we want to search for // the term in. Every field other than allfields has a prefix if(!fieldName.equals(ALL_FIELDS)) { field = fieldName + ":" + field; } } else field = ""; return field; } /** * Uses FedoraGSearch to perform a search where the query is embedded in * fieldedSearchTerms, which not only provides the terms to search on, but * also the fields to search the (various) given terms in. * @param fieldedSearchTerms is the String specifying all the search terms * with their fields (or no field if it should search for the terms in * all fields). The terms with no associated search-fields should come first. * Search terms may be in quotes. * @param snippetsMax is the maximum number of separate snippets containing * the searchTerm (snippetsMax number of occurrences of the word in the text) * returned. * @param hitPageStart is the page of search results to start returning. * @param hitPageSize is the number of search result pages to return, * starting from hitPageStart. * @return the XML (in string format) returned from Fedora Generic Search's * gfindObjects method */ public String search(String fieldedSearchTerms, int hitPageStart, int hitPageSize, int snippetsMax) throws Exception { LOG.debug("In method search(String fieldedSearchTerms,...). " + "Query is:\n" + fieldedSearchTerms); final String sort = ""; // returns results from highest to lowest rank final String resultPageXslt = ""; return gFindObjects(fieldedSearchTerms, sort, hitPageStart, hitPageSize, snippetsMax, indexName, resultPageXslt); } /** Call this method with the return value of calling search(). * Search results are returned in GSearch's XML response format, * containing information that includes the PIDs of the documents that * matched the search. These PIDs are returned in the array. * @param collectionName is the name of the collection to restrict the * search results by. If it's "", then results from all collections are * returned. Generally, don't want to pass "", because, theoretically, * all indexed collections in the repository could be considered and * not all of them may be Greenstone collections. If all Greenstone * collections should be searched for, pass "greenstone" as the * collection name instead. * @param searchResult is the Fedora Generic Search XML response returned * from performing a gfindObjects() operations. * @return an array of the pids of documents found for the search. */ public String[] getPIDsFromSearchResult(String collectionName, String searchResult) throws Exception { final String[] empty = {}; if(searchResult.equals("")) { return empty; } // // // // // // greenstone:gs2mgdemo-HASH01d667303fe98545f03c14ae // Fedora // FedoraObject // Active // The Courier - N°159 - Sept- Oct 1996 Dossier Inves ... // 2007-11-23T04:23:15.363Z // 2008-01-15T04:37:49.518Z // some title // some title2 // ... // (The 1993 cyclone, although // Metadata // ... // // // // 1. Get documentElement, which is Element resultPage = FedoraCommons.getResponseAsDOM(builder, searchResult); // 2. find the hitTotal value which is the number of results // it's an attribute of the sole compulsory element int hitTotal = 0; Element gfindObjectsEl = (Element)resultPage.getElementsByTagName(G_FIND_OBJECTS).item(0); String value = gfindObjectsEl.getAttribute(HIT_TOTAL); hitTotal = Integer.parseInt(value); if(hitTotal == 0) { return new String[]{}; } // Our resulting list of pids will be no more than hitTotal, // but may be fewer if we constrain the results to a collection Vector pidsInCollection = new Vector(hitTotal); // Returns a NodeList of all descendant Elements with object tagname NodeList objects = gfindObjectsEl.getElementsByTagName(OBJECT); for(int i = 0; i < objects.getLength(); i++) { // should be the case that pids.length == (digital)objects.getLength() // get the PID of each object Element object = (Element)objects.item(i); NodeList fields = object.getElementsByTagName(FIELD); for(int j = 0; j < fields.getLength(); j++) { // find the sole of where NAME attribute == PID Element field = (Element)fields.item(j); if(field.getAttribute(NAME).equals(PID)) { String pid = FedoraCommons.getValue(field); // Either store only the pids which are part of the collection, // or, if no collection is specified (=""),then store the pid too if(collectionName.equals("") || pid.contains(collectionName)) { pidsInCollection.add(pid); } break; // found pid field, meaning that we have // finished for loop on s of this , // consider next } } } String[] pids = new String[pidsInCollection.size()]; pidsInCollection.toArray(pids); return pids; } public static void main(String[] args) { try { GSearchConnection searcher = new GSearchConnection( "http://localhost:8080/fedoragsearch/services/FgsOperations?wsdl", "FedoraIndex"); HashMap map = new HashMap(); map.put(GSearchConnection.ALL_FIELDS, "gender inequalities"); map.put(GSearchConnection.FULLTEXT, "cyclone val,worst storm"); //map.put(GSearchConnection.ALL_FIELDS, "\"gender inequalities\""); //map.put(GSearchConnection.FULLTEXT, "\"cyclone val\",\"worst storm\""); String searchResult = searcher.search(map, 1, 10); //snippetsMax: 3); System.out.println(searchResult); String[] pids = searcher.getPIDsFromSearchResult("gs2mgdemo", searchResult); System.err.println("Found pids for search:\n"); for(int i = 0; i < pids.length; i++) { System.out.println(pids[i]); } //searchResult = searcher.search("", "minh", 0, 50, 50); //System.err.println(searchResult); //String searchTerms = "cyclone dc.title:interview dc.title:gender"; String searchTerms="\"gender inequalities\" ds.fulltext:\"cyclone val\" ds.fulltext:\"worst storm\""; searchResult = searcher.search(searchTerms, 1, 10, 3); System.out.println(searchResult); // Not restricting results to any collection (search results from // all collections) pids = searcher.getPIDsFromSearchResult("", searchResult); System.err.println("Found pids for search: "); for(int i = 0; i < pids.length; i++) { System.out.println(pids[i]); } searchResult = searcher.search("ds.fulltext", "cyclone", 1, 10, 3); //String searchResult = searcher.search("ds.label", "hierarchical", 1, 10, 3); // System.out.println(searcher.search("ds.fulltext", "Pinky", 1, 10, 3)); System.out.println(searchResult); pids = null; pids = searcher.getPIDsFromSearchResult("", searchResult); System.err.println("Found pids for search: "); for(int i = 0; i < pids.length; i++) { System.out.println(pids[i]); } }catch(Exception e) { System.err.println(e.getMessage()); } } }