source: other-projects/trunk/gs3-webservices-democlient/src/GS3Fedora/org/greenstone/fedora/services/GSearchConnection.java@ 15437

Last change on this file since 15437 was 15437, checked in by ak19, 16 years ago

Introduced GSearchConnection member indexName set in its constructor by FedoraGS3Connection

File size: 22.9 KB
Line 
1/**
2 *#########################################################################
3 * GSearchConnection.java - works with the demo-client for Greenstone 3,
4 * of the Greenstone digital library suite from the New Zealand Digital
5 * Library Project at the University of Waikato, New Zealand.
6 * <BR><BR>
7 * Copyright (C) 2008 New Zealand Digital Library Project
8 * <BR><BR>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 * <BR><BR>
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *########################################################################
19 */
20
21package org.greenstone.fedora.services;
22
23import java.util.Vector;
24import java.util.Iterator;
25import java.util.Map;
26import java.util.HashMap;
27
28import java.net.URL;
29import javax.xml.namespace.QName;
30import javax.xml.parsers.DocumentBuilder;
31import javax.xml.parsers.DocumentBuilderFactory;
32import javax.xml.rpc.ServiceException;
33import java.net.MalformedURLException;
34
35import org.apache.axis.client.Call;
36import org.apache.axis.client.Service;
37import org.apache.log4j.Logger;
38
39import javax.xml.parsers.ParserConfigurationException;
40import org.w3c.dom.Element;
41import org.w3c.dom.NodeList;
42
43
44/**
45 * Class GSearchConnection connects to FedoraGSearch's web services.
46 * FedoraGSearch offers indexing and full-text search functionality for
47 * Fedora repositories. Its search web service (method gFindObjects)
48 * returns the response of a search as XML.
49 * GSearchConnection offers more convenient methods that extract just
50 * the parts of search results that FedoraGS3Connection needs and returns
51 * that.
52 * @author ak19
53*/
54public class GSearchConnection implements FedoraToGS3Interface.Constants {
55 /** Logger for this class. */
56 private static final Logger LOG = Logger.getLogger(
57 GSearchConnection.class.getName());
58
59 /* Namespace URI and service name of Fedora Generic Search's web services.
 * The constructor combines them into the QName it passes to the Axis
 * Service when looking the service up in the WSDL.
 * NOTE(review): both are static but not final, so they can be reassigned
 * at runtime - confirm whether that is intended. */
60 protected static String NAMESPACE_URI = "http://server.fedoragsearch.defxws.dk";
61 protected static String SERVICE_NAME = "OperationsService";
62
63 /** The names of the methods we use of Fedora Generic Search's web services
64 * are declared here as static final Strings. G_FIND_OBJECTS is used both as
 * the operation name to invoke and as the tag name of the element that is
 * looked up in the response XML. */
65 protected static final String G_FIND_OBJECTS = "gfindObjects";
66
67 /* Some fixed string literals that will be encountered in the response XMLs
68 * that FedoraGSearch's method gFindObjects() returns. */
// Value of a <field>'s name attribute that marks the PID field; also used as
// the query field when restricting a search to Greenstone objects.
69 protected static final String PID = "PID";
// Attribute of the <gfindObjects> element that is parsed as the number of hits.
70 protected static final String HIT_TOTAL = "hitTotal";
// Tag name of one search hit, and of the fields inside a hit.
71 protected static final String OBJECT = "object";
72 protected static final String FIELD = "field";
// Attribute of <field> that holds the field's name.
73 protected static final String NAME = "name";
// Index field names used as query prefixes for title and full-text searches.
74 protected static final String DC_TITLE_FIELD = "dc.title";
75 protected static final String FULLTEXT_FIELD = "ds.fulltext";
76
77 /** Separator placed after each search term when a query is built; a term
 * that itself contains this separator is treated as a phrase. */
78 protected static final String SPACE = " ";
79
80 /** The name of the Index wherein FedoraGSearch has indexed all the GS3 docs.
81 * This final member is public here so that others may read the indexName
82 * that this GSearchConnection works with. It is set once, in the constructor. */
83 public final String indexName;
84
85 /** The Service object used to connect to the FedoraGSearch web services */
86 protected final Service service;
87 /** The Call object used to connect to the FedoraGSearch web services.
 * It is created once in the constructor and reused for every invocation. */
88 protected final Call call;
89 /** The portName object used when connecting to FedoraGSearch's web services */
90 protected final QName portName;
91
92 /** A DocumentBuilder object used to construct and parse XML; it is handed to
 * FedoraCommons.getResponseAsDOM() when a search response is parsed. */
93 protected final DocumentBuilder builder;
94
95
96
97 /** Constructor that takes a String representing the url of the WSDL
98 * file for FedoraGSearch's web services, and tries to establish a
99 * connection to those web services.
100 * @param wsdlFileLocation is a String representing the url of the WSDL file
101 * @param indexName is the name of the index that Fedora Generic Search
102 * should work with (the index wherein the indexed GS3 documents have been
103 * placed).
104 */
105 public GSearchConnection(String wsdlFileLocation, String indexName)
106 throws MalformedURLException, ServiceException,
107 ParserConfigurationException
108 {
109 this.indexName = indexName;
110
111 URL wsdlURL = new URL(wsdlFileLocation);
112 service = new Service(wsdlURL, new QName(NAMESPACE_URI, SERVICE_NAME));
113 //call = (Call) service.createCall(new QName(NAMESPACE_URI, PORT_NAME));
114
115 Iterator i = service.getPorts();
116 // FIXME: can we just assume it's the first port of service SERVICE_NAME?
117 // Do we need to work out which port to get??? Remember, the port names
118 // vary between wsdls though!
119 if(i.hasNext()) {
120 portName = (QName)i.next();
121 call = (Call) service.createCall(portName);
122
123 String endpointLocation = call.getTargetEndpointAddress();
124 LOG.debug("Wsdl file url: " + wsdlURL
125 + "\nEndpoint location is: " + endpointLocation);
126 } else { // should never happen: a service without a port
127 // portName = null;
128 call = (Call)service.createCall();
129 // FIXME: possibly manually get the ports and choose
130 // one containing "FEDORA" and "API-A" in its name?
131 throw new ServiceException(this.getClass() + ": No port in wsdl file");
132 }
133
134 // we can set the portName which remains constant for the various methods
135 // call.setPortName(portName);
136
137 DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
138 builder = factory.newDocumentBuilder(); // to create XML docs
139 }
140
141
142 /**
143 * Method to invoke gfindObjects operation of Fedora Generic Search
144 * web services.
145 *
146 * Parameter types, parameter order and return type of gFindObjects are as
147 * obtained from the wsdl file for the Fedora Generic Search web services
148 * located at:
149 * http://localhost:8080/fedoragsearch/services/FgsOperations?wsdl
150 * &lt;wsdl:message name="gfindObjectsRequest"&gt;
151 * &lt;wsdl:part name="query" type="xsd:string"/&gt;
152 * &lt;wsdl:part name="sort" type="xsd:string"/&gt;
153 * &lt;wsdl:part name="hitPageStart" type="xsd:int"/&gt;
154 * &lt;wsdl:part name="hitPageSize" type="xsd:int"/&gt;
155 * &lt;wsdl:part name="snippetsMax" type="xsd:int"/&gt;
156 * &lt;wsdl:part name="fieldMaxLength" type="xsd:int"/&gt;
157 * &lt;wsdl:part name="indexName" type="xsd:string"/&gt;
158 * &lt;wsdl:part name="resultPageXslt" type="xsd:string"/&gt;
159 * &lt;/wsdl:message&gt;
160 *
161 * &lt;wsdl:message name="gfindObjectsResponse"&gt;
162 * &lt;wsdl:part name="gfindObjectsReturn" type="xsd:string"/&gt;
163 * &lt;/wsdl:message&gt;
164 *
165 * &lt;wsdl:operation name="gfindObjects"
166 * parameterOrder="query sort hitPageStart hitPageSize snippetsMax
167 * fieldMaxLength indexName resultPageXslt"&gt;
168 *
169 * This method works: it searches the dc.title field of our FedoraIndex
170 * for the term (e.g. "interview") and the result returned is an XML String.
171 *
172 * There's no example on how to call gFindObjects with parameters. In
173 * particular, I don't know what values the parameter <b>sort</b> can take.
174 * But topazproject has an example on how to call updateIndex().
175 * @see <a href="http://www.topazproject.org/trac/wiki/FedoraSearch?format=txt">An example on how to call updateIndex() with parameters</a>
176 * @see <a href="http://ws.apache.org/axis/java/apiDocs/org/apache/axis/client/Service.html">Axis Service class</a>
177 * @see <a href="http://ws.apache.org/axis/java/apiDocs/javax/xml/rpc/Call.html">Axis RPC Call, for specification of interface Call</a>
178 * @see <a href="http://ws.apache.org/axis/java/apiDocs/org/apache/axis/client/Call.html">Axis client Call class, for implementation of interface Call</a>
179 */
180 protected String gFindObjects(String searchFieldedTerms, String sort,
181 int hitPageStart, int hitPageSize, int snippetsMax,
182 int fieldMaxLength, String indexName, String resultPageXslt) throws Exception
183 {
184 // "Prefills as much info from the WSDL as it can. Right now it's SOAPAction,
185 // operation qname, parameter types and return type of the Web Service.
186 // This method considers that port name and target endpoint address have
187 // already been set. This is useful when you want to use the same Call instance
188 // for several calls on the same Port. NOTE: Not part of JAX-RPC specification."
189
190 //call.removeAllParameters(); // no need for this when using setOpName below
191 call.setOperationName(G_FIND_OBJECTS);
192
193 String valueFound =(String)call.invoke( new Object[] {
194 searchFieldedTerms, sort, hitPageStart, hitPageSize, snippetsMax,
195 fieldMaxLength, indexName, resultPageXslt} );
196
197 return valueFound;
198 }
199
200 /**
201 * Method that performs a search for the given searchTerm inside the given
202 * indexed field.
203 * @param searchFieldName is the name of the indexed field within which the
204 * given searchTerm is to be searched for.
205 * @param searchTerm is the term to be searched for.
206 * @param hitPageStart is the page of search results to start returning.
207 * @param hitPageSize is the number of search result pages to return,
208 * starting from hitPageStart.
209 * @param snippetsMax is the maximum number of separate snippets containing
210 * the searchTerm that are to be returned. (snippetsMax or a fewer number of
211 * occurrences of the word in the text will be returned)
212 */
213 public String search(String searchFieldName, String searchTerm,
214 int hitPageStart, int hitPageSize, int snippetsMax) throws Exception
215 {
216 final String sort = ""; // returns results from highest to lowest rank
217 final int fieldMaxLength = 50; // maximum length in words of field values
218 // returned. E.g. snippet sizes will be reduced to fieldMaxLength
219 // words too. It doesn't matter what we set this too, the only
220 // element of the response XML we'll be using is the PID of the
221 // document in which the searchTerm occurred.
222 final String resultPageXslt = "";
223
224 // when a fieldname is given to search in (ds.fulltext, dc.title)
225 // then prepend that followed by a COLON to the searchTerm.
226 final String fullSearchTerm = searchFieldName.equals("") ?
227 searchTerm : (searchFieldName+":"+searchTerm);
228
229 return gFindObjects(fullSearchTerm, sort,
230 hitPageStart, hitPageSize, snippetsMax,
231 fieldMaxLength, indexName, resultPageXslt);
232 }
233
234 /**
235 * FedoraGSearch accepts a query of the form:
236 * <code>&lt;"cyclone val" "Gender Inequalities" ds.fulltext:"cyclone val"
237 * ds.fulltext:"worst storm"&gt;</code>
238 * where the first two phrases are searched for in all indexed fields,
239 * (in this case dc.title and ds.fulltext), while the last two are
240 * searched for in the ds.fulltext field.
241 * Another example:
242 * <code>&lt;gender dc.title:interview ds.fulltext:"cyclone val"&gt;
243 * titles and fulltexts are searched for "gender", while title index
244 * is searched for "interview" and fulltexts are searched for the phrase
245 * "cyclone val"</code>
246 * @param fieldsToSearchTerms is a Hashmap of searchfields and
247 * associated search terms (words or phrases). The terms are in a
248 * comma-separated list. fieldsToSearchTerms is a Hashmap of
249 * (Searchfields, associated-searchTerms) pairs. It can contain 3
250 * searchfields: allfields, titles, text. The value for each is a
251 * comma-separated list of search terms in that field.
252 * Internally the field names get converted to what FedoraGSearch's
253 * gfindObjects understands: titles becomes dc.title:, text becomes
254 * ds.fulltext and allfields becomes nothing.
255 * @param hitPageStart is the page of search results to start returning.
256 * @param hitPageSize is the number of search result pages to return,
257 * starting from hitPageStart.
258 * @return the XML (in string format) returned from Fedora Generic Search's
259 * gfindObjects method
260 *
261 */
262 public String search(Map fieldsToSearchTerms,
263 int hitPageStart, int hitPageSize)
264 throws Exception
265 {
266 LOG.debug("In FedoraGS3's GSearchConnection.search(Map,...)");
267
268 // HashMap consists of several (key, value) entries, 3 of
269 // which will be dealt with here:
270 // - allfields, <comma separated list of search terms/phrases>
271 // - titles, <comma separated list of search terms/phrases>
272 // - (full)text, <comma separated list of search terms/phrases>
273 // We need to obtain each value and change the separator to space:
274 String allfields = (String)fieldsToSearchTerms.get(ALL_FIELDS);
275 String titles = (String)fieldsToSearchTerms.get(ALL_TITLES);
276 String fulltexts = (String)fieldsToSearchTerms.get(FULLTEXT);
277
278 // Each field is a comma separated list of terms that may be
279 // either a word OR a phrase.
280 // We're going to separate each term from the list,
281 // and put quotes around phrases, then combine all the terms
282 // together again with spaces to separate them.
283 allfields = formatSearchTermsInField(allfields, ALL_FIELDS);
284 // ALL_FIELDS has no field name
285 titles = formatSearchTermsInField(titles, DC_TITLE_FIELD);
286 fulltexts = formatSearchTermsInField(fulltexts, FULLTEXT_FIELD);
287
288 String fullSearchTerm = allfields + titles + fulltexts;
289 if(fullSearchTerm.trim().equals("")) { // nothing to search on
290 return "";
291 }
292
293 // Finally, restrict the search to the Greenstone digital objects
294 // stored in Fedora
295 final String greenstonePID
296 = PID + FedoraGS3DL.COLON + FedoraGS3DL.GREENSTONE;
297 //"PID:\"greenstone\"";
298 fullSearchTerm += greenstonePID;
299 //! Everything after the colon in the pid is ignored by FedoraGSearch:
300 // "PID:\"greenstone:gs2mgdemo\""; // ignores "gs2mgdemo"
301
302 // <snippet> tags interfere when PID field is searched on, set it to 0
303 return search(fullSearchTerm, hitPageStart, hitPageSize, 0);
304 // return search(fullSearchTerm, hitPageStart, hitPageSize, snippetsMax);
305 }
306
307 /** Each field is a comma separated list of terms that may be either a word
308 * OR a phrase. We're going to separate each term from the list, and put
309 * quotes around phrases, then combine all the terms together again with
310 * spaces to separate them. Examples:
311 * <pre>dc.title:"a phrase" word
312 * dc.fulltext: "cyclone val"
313 * (ALL_FIELDS) interview gender</pre>
314 * This is required to facilitate fielded searching with fedoraGSearch.
315 * @param field is a comma separated list of search terms (corresponding
316 * to one fieldName) to be reorganised
317 * @param fieldName is the name of the field to prepend to the reorganised
318 * field value. FieldName ALL_FIELDS is ignored.
319 * @return parameter field reorganised such that terms that are phrases
320 * are in quotes and each term is separated by a space from the previous one.
321 */
322 protected String formatSearchTermsInField(String field, String fieldName)
323 {
324 if(field != null) { // check that the field isn't empty
325 //LOG.debug("field: " + field);
326 String[] terms = field.split(",");
327 field = ""; // we'll build it up again
328 for(int i = 0; i < terms.length; i++) {
329 // if it contains a space, then the term's a phrase,
330 // put it in quotes
331 if(terms[i].indexOf(SPACE) != -1) {
332 terms[i] = "\"" + terms[i] + "\"";
333 }
334 field = field + terms[i] + SPACE;
335 }
336
337 // Prefix it with the name of the field we want to search for
338 // the term in. Every field other than allfields has a prefix
339 if(!fieldName.equals(ALL_FIELDS)) {
340 field = fieldName + ":" + field;
341 }
342
343 } else field = "";
344 return field;
345 }
346
347 /**
348 * Uses FedoraGSearch to perform a search where the query is embedded in
349 * fieldedSearchTerms, which not only provides the terms to search on, but
350 * also the fields to search the (various) given terms in.
351 * @param fieldedSearchTerms is the String specifying all the search terms
352 * with their fields (or no field if it should search for the terms in
353 * all fields). The terms with no associated search-fields should come first.
354 * Search terms may be in quotes.
355 * @param snippetsMax is the maximum number of separate snippets containing
356 * the searchTerm (snippetsMax number of occurrences of the word in the text)
357 * returned.
358 * @param hitPageStart is the page of search results to start returning.
359 * @param hitPageSize is the number of search result pages to return,
360 * starting from hitPageStart.
361 * @return the XML (in string format) returned from Fedora Generic Search's
362 * gfindObjects method
363 */
364 public String search(String fieldedSearchTerms,
365 int hitPageStart, int hitPageSize, int snippetsMax) throws Exception
366 {
367 LOG.debug("In method search(String fieldedSearchTerms,...). "
368 + "Query is:\n" + fieldedSearchTerms);
369
370 final String sort = ""; // returns results from highest to lowest rank
371 final int fieldMaxLength = 50; // maximum length in words of field values
372 // returned. E.g. snippet sizes will be reduced to fieldMaxLength
373 // words too
374 final String resultPageXslt = "";
375 return gFindObjects(fieldedSearchTerms, sort,
376 hitPageStart, hitPageSize, snippetsMax,
377 fieldMaxLength, indexName, resultPageXslt);
378 }
379
380 /** Call this method with the return value of calling search().
381 * Search results are returned in GSearch's XML response format,
382 * containing information that includes the PIDs of the documents that
383 * matched the search. These PIDs are returned in the array.
384 * @param collectionName is the name of the collection to restrict the
385 * search results by. If it's "", then results from all collections are
386 * returned. Generally, don't want to pass "", because, theoretically,
387 * all indexed collections in the repository could be considered and
388 * not all of them may be Greenstone collections. If all Greenstone
389 * collections should be searched for, pass "greenstone" as the
390 * collection name instead.
391 * @param searchResult is the Fedora Generic Search XML response returned
392 * from performing a gfindObjects() operations.
393 * @return an array of the pids of documents found for the search. */
394 public String[] getPIDsFromSearchResult(String collectionName,
395 String searchResult)
396 throws Exception
397 {
398 final String[] empty = {};
399 if(searchResult.equals("")) {
400 return empty;
401 }
402
403 // <?xml version="1.0" encoding="UTF-8"?>
404 // <resultPage xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:foxml="info:fedora/fedora-system:def/foxml#" xmlns:zs="http://www.loc.gov/zing/srw/" indexName="FedoraIndex" dateTime="Sat Feb 09 16:43:04 NZDT 2008">
405 // <gfindObjects hitTotal="1" resultPageXslt="" hitPageSize="10" hitPageStart="1" query="ds.fulltext:Cyclone">
406 // <objects>
407 // <object no="1" score="0.24639596">
408 // <field name="PID">greenstone:gs2mgdemo-HASH01d667303fe98545f03c14ae</field>
409 // <field name="repositoryName">Fedora</field>
410 // <field name="object.type">FedoraObject</field>
411 // <field name="object.state">Active</field>
412 // <field name="object.label">The Courier - N°159 - Sept- Oct 1996 Dossier Inves ... </field>
413 // <field name="object.createdDate">2007-11-23T04:23:15.363Z</field>
414 // <field name="object.lastModifiedDate">2008-01-15T04:37:49.518Z</field>
415 // <field name="dc.title">some title</field>
416 // <field name="dc.title">some title2</field>
417 // ...
418 // <field name="ds.fulltext" snippet="yes">(The 1993 <span class="highlight">cyclone</span>, although</field>
419 // <field name="ds.label">Metadata</field>
420 // ...
421 // </object>
422 // </objects>
423 // </gfindObjects>
424 // 1. Get documentElement, which is <resultPage>
425 Element resultPage = FedoraCommons.getResponseAsDOM(builder, searchResult);
426 // 2. find the hitTotal value which is the number of results
427 // it's an attribute of the sole compulsory <gFindObjects> element
428 int hitTotal = 0;
429 Element gfindObjectsEl
430 = (Element)resultPage.getElementsByTagName(G_FIND_OBJECTS).item(0);
431 String value = gfindObjectsEl.getAttribute(HIT_TOTAL);
432 hitTotal = Integer.parseInt(value);
433 if(hitTotal == 0) {
434 return new String[]{};
435 }
436
437 // Our resulting list of pids will be no more than hitTotal,
438 // but may be fewer if we constrain the results to a collection
439 Vector pidsInCollection = new Vector(hitTotal);
440
441 // Returns a NodeList of all descendant Elements with object tagname
442 NodeList objects = gfindObjectsEl.getElementsByTagName(OBJECT);
443 for(int i = 0; i < objects.getLength(); i++) {
444 // should be the case that pids.length == (digital)objects.getLength()
445 // get the PID of each object
446 Element object = (Element)objects.item(i);
447 NodeList fields = object.getElementsByTagName(FIELD);
448
449 for(int j = 0; j < fields.getLength(); j++) {
450 // find the sole <field> of <object> where NAME attribute == PID
451 Element field = (Element)fields.item(j);
452 if(field.getAttribute(NAME).equals(PID)) {
453 String pid = FedoraCommons.getValue(field);
454 // Either store only the pids which are part of the collection,
455 // or, if no collection is specified (=""),then store the pid too
456 if(collectionName.equals("") || pid.contains(collectionName)) {
457 pidsInCollection.add(pid);
458 }
459 break; // found pid field, meaning that we have
460 // finished for loop on <field>s of this <object>,
461 // consider next <object>
462 }
463 }
464 }
465 String[] pids = new String[pidsInCollection.size()];
466 pidsInCollection.toArray(pids);
467 return pids;
468 }
469
470 public static void main(String[] args) {
471 try {
472 GSearchConnection searcher = new GSearchConnection(
473 "http://localhost:8080/fedoragsearch/services/FgsOperations?wsdl", "FedoraIndex");
474
475
476 HashMap map = new HashMap();
477 map.put(GSearchConnection.ALL_FIELDS, "gender inequalities");
478 map.put(GSearchConnection.FULLTEXT, "cyclone val,worst storm");
479 //map.put(GSearchConnection.ALL_FIELDS, "\"gender inequalities\"");
480 //map.put(GSearchConnection.FULLTEXT, "\"cyclone val\",\"worst storm\"");
481 String searchResult = searcher.search(map, 1, 10); //snippetsMax: 3);
482 System.out.println(searchResult);
483
484 String[] pids = searcher.getPIDsFromSearchResult("gs2mgdemo", searchResult);
485 System.err.println("Found pids for search:\n");
486 for(int i = 0; i < pids.length; i++) {
487 System.out.println(pids[i]);
488 }
489
490 //searchResult = searcher.search("", "minh", 0, 50, 50);
491 //System.err.println(searchResult);
492
493 //String searchTerms = "cyclone dc.title:interview dc.title:gender";
494 String searchTerms="\"gender inequalities\" ds.fulltext:\"cyclone val\" ds.fulltext:\"worst storm\"";
495 searchResult = searcher.search(searchTerms, 1, 10, 3);
496 System.out.println(searchResult);
497
498 // Not restricting results to any collection (search results from
499 // all collections)
500 pids = searcher.getPIDsFromSearchResult("", searchResult);
501 System.err.println("Found pids for search: ");
502 for(int i = 0; i < pids.length; i++) {
503 System.out.println(pids[i]);
504 }
505
506 searchResult = searcher.search("ds.fulltext", "cyclone", 1, 10, 3);
507 //String searchResult = searcher.search("ds.label", "hierarchical", 1, 10, 3);
508 // System.out.println(searcher.search("ds.fulltext", "Pinky", 1, 10, 3));
509 System.out.println(searchResult);
510
511 pids = null;
512 pids = searcher.getPIDsFromSearchResult("", searchResult);
513 System.err.println("Found pids for search: ");
514 for(int i = 0; i < pids.length; i++) {
515 System.out.println(pids[i]);
516 }
517
518 }catch(Exception e) {
519 System.err.println(e.getMessage());
520 }
521
522 }
523
524}
Note: See TracBrowser for help on using the repository browser.