root/gsdl/trunk/runtime-src/src/oaiservr/abstractlistaction.cpp @ 16708

Revision 16708, 15.0 KB (checked in by mdewsnip, 12 years ago)

Changed the resumptionToken tags to not have any whitespace around the resumption tokens, because this confuses harvesters/validators

  • Property svn:keywords set to Author Date Id Revision
Line 
1#include "abstractlistaction.h"
2#include "recptprototools.h"
3
4#include "oaitools.h"
5
6bool abstractlistaction::validateAction(recptproto *protocol, oaiargs &params, int &numArgs)
7{
8  // Remove any parameters that aren't valid for this action
9  text_tmap::const_iterator param_iterator = params.begin();
10  while (param_iterator != params.end())
11  {
12    if (param_iterator->first != "verb" &&
13    param_iterator->first != "from" &&
14    param_iterator->first != "until" &&
15    param_iterator->first != "set" &&
16    param_iterator->first != "resumptionToken" &&
17    param_iterator->first != "metadataPrefix")
18    {
19      params.erase(param_iterator->first);
20    }
21
22    param_iterator++;
23  }
24
25  text_t from  = params["from"];
26  text_t until = params["until"];
27
28  // from date must be less than, or equal to, until date
29  if ((from != "") && (until != "") && !(from <= until)){
30    this->errorType = "badArgument";
31    return false;
32  }
33
34  if (from != ""){
35    // Must be in the form YYYY-MM-DD
36    if(from.size() != 10){
37      this->errorType = "badArgument";
38      params.erase("from");
39    }
40    else{
41      if(from[4] != '-' || from[7] != '-'){
42    this->errorType = "badArgument";
43    params.erase("from");
44      }
45    }
46    ++numArgs; // Increase valid args count
47  }
48
49  if (until != ""){
50    // Must be in the form YYYY-MM-DD
51    if(until.size() != 10){
52      this->errorType = "badArgument";
53      params.erase("until");
54    }
55    else{
56      if(until[4] != '-' || until[7] != '-'){
57    this->errorType = "badArgument";
58    params.erase("until");
59      }
60    }
61    ++numArgs; // Increase valid args count
62  }
63
64  if (this->errorType == "badArgument")
65  {
66    return false;
67  }
68
69  if (params["set"] != "") {
70    text_t gsdlSet = params["set"];
71    text_t gsdlCollect = "";
72
73    // given 'demo:CL2', toGSDL returns 'demo' in gsdlCollect and 'CL2' in gsdlSet. If there is no further
74    // set specified after the name of the collection however, then gsdlSet is empty.
75    oaiclassifier::toGSDL(gsdlCollect, gsdlSet);
76
77    comerror_t err;
78    ColInfoResponse_t cinfo;
79
80    // check that the collection is accessible
81    protocol->get_collectinfo(gsdlCollect, cinfo, err, cerr);
82    if (err != noError) {
83      this->errorType = "badArgument";
84      return false;
85    }
86
87    // exclude collections that are not listed in the configured OAI list
88    text_tarray &collections = this->configuration->getCollectionsList();
89    int          c;
90    for (c = 0; c < collections.size(); c ++) {
91      if (collections[c] == gsdlCollect)
92    break;
93    }
94    if (c == collections.size()) {
95      this->errorType = "badArgument";
96      return false;
97    }
98   
99    if (gsdlSet != "") {
100      // check the child set if it is given
101      if (!this->check_classifier(protocol, gsdlCollect, gsdlSet)) {
102    this->errorType = "badArgument";
103    return false;
104      }
105    }
106    ++numArgs;
107  }
108
109  if (params["resumptionToken"] != "") {
110    ResumptionToken token(params["resumptionToken"]);
111
112    if (!token.isValid()) {
113      this->errorType = "badResumptionToken";
114      return false;
115    }
116    ++numArgs;
117  }
118 
119  this->errorType = "";
120  return true;
121}
122
123//--------------------------------------------------------------------------------------------------
124
125bool abstractlistaction::output_content(ostream &output, recptproto *protocol, oaiargs &params)
126{
127  text_t from = params["from"];
128  text_t until = params["until"];
129  text_t metaFormat = params["metadataPrefix"];
130  bool   prevDocSeen;
131  ResumptionToken *token = NULL;
132
133  // start the call; clear down the total number of output documents
134  this->outputDocs = 0;
135
136  // We don't actually handle resumptionTokens yet; if we get one, ignore it
137  if (params["resumptionToken"] != "") {
138    token = new ResumptionToken(params["resumptionToken"]);
139  }
140
141  this->replyToken = NULL;
142
143  // if we've been asked for a set, then use it!
144  if (params["set"] != "") {
145    // get the children of this set
146    text_t gsdlSet = params["set"];
147    text_t gsdlCollect = "";
148
149    // given 'demo:CL2', toGSDL returns 'demo' in gsdlCollect and 'CL2' in gsdlSet. If there is no further
150    // set specified after the name of the collection however, then gsdlSet is empty.
151    oaiclassifier::toGSDL(gsdlCollect, gsdlSet);
152
153    // If gsdlSet is empty, then the user is requesting all the identifiers for the collection, so
154    // we simply output all docs via their oai_id tag. But if a specific subset IS requested, then
155    // use recurse_set() to traverse any sub classifiers to find the relevant docs.
156    if(gsdlSet == ""){
157      ColInfoResponse_t cinfo;
158      comerror_t err;
159      this->output_content_for_col(output, protocol, gsdlCollect, cinfo, err, params);
160    }
161    else {
162      if (this->check_classifier(protocol, gsdlCollect, gsdlSet)) {
163    this->recurse_set(output, protocol, gsdlCollect, gsdlSet, params, token);
164      }
165    }
166  }
167  // output all records in all hierarchies
168  else {
169    this->output_content_for_all(output, protocol, params);
170  }
171
172  // If - regardless of set required - no documents have been seen, throw an error.
173  if (this->configuration->getOAIVersion() >= 200 && this->prevDocSeen == false) {
174    errorType = "noRecordsMatch";
175    this->output_error(output, errorType);
176
177    return false;
178  }
179
180  // do a resumption token if required; errors cancel a token...
181  if (this->replyToken != NULL && this->errorType == "") {
182    // Don't add any whitespace around the resumption token as it can confuse harvesters/validators
183    output << "  <resumptionToken>" << this->replyToken->getToken() << "</resumptionToken>" << endl;
184  }
185
186  return true;
187}
188
189//--------------------------------------------------------------------------------------------------
190
191void abstractlistaction::output_content_for_col(ostream &output, recptproto *protocol, text_t &gsdlCollect,
192                        ColInfoResponse_t &cinfo, comerror_t &err, oaiargs &params)
193{ int startDoc = 0;
194
195  // get the collection information
196  protocol->get_collectinfo(gsdlCollect, cinfo, err, *this->logout);
197
198  // check resumption token
199  if (params["resumptionToken"] != "") {
200    ResumptionToken token(params["resumptionToken"]);
201    if (token.getCollection() == gsdlCollect) {
202      startDoc = token.getPosition() - 1; // first document is said to be 1..
203    }
204  }
205 
206  // If numDocs is 0, do nothing - this->prevDocSeen will stay false if this is the only collection
207  // looked at, or will keep whatever value it had prior to this col (ensures that if the flag has
208  // been set to true by a previous collection that this won't overwrite it to be false).
209  if (cinfo.numDocs > 0) {
210    int errorCount      = 0; // Count the number of errors found in the given collection
211    text_t from         = params["from"];
212    text_t until        = params["until"];
213
214    for (long i = startDoc; i < cinfo.numDocs; ++i) {
215      if (errorCount > 3) { // If num errors reaches the cut-off value, bail.
216    cerr << "Error: too many records(" << errorCount << ") in the " << gsdlCollect
217         << " collection have invalid or non-existant oai_ids - skipping remainder of collection.\n";
218    return;
219      }
220     
221      text_t oai_id = "oai.";
222      oai_id += i;
223     
224      text_t gsdl_id = oaiclassifier::getGSDL_OID(gsdlCollect, oai_id, protocol, *this->logout);
225     
226      if (gsdl_id == "") { // If the string is empty, then the document didn't have an oai_id, so
227    ++errorCount;     // increase error count
228    continue;
229      }
230
231
232      // Check that the item with the 0ID 'gsdl_id' has a lastmodified field that fits within
233      // the required date range (if specified).
234      if (this->inDateRange(params["from"], params["until"], gsdlCollect, gsdl_id, protocol, output)) {
235    if (this->output_document(output, protocol, gsdlCollect, gsdl_id, params["metadataPrefix"])) {
236      // this should be an IF statement, where prevDocSeen is only set to true if the above
237      // function call returns true (indicating that the doc supported the metadata prefix) but
238      // for some reason this is always false. This means that if no doc in the requested set supports
239      // the metadata format, the "no records match" error that should be thrown won't be...
240      //
241      // GRB: the above comment is no longer true; proper checks are made
242      this->prevDocSeen = true;
243      ++this->outputDocs;
244    }
245      }
246
247      // if we've output the number of resumption documents; prepare a resumptionToken
248      if (this->outputDocs == this->configuration->resumeAfter()) {
249    this->replyToken = new ResumptionToken(gsdlCollect, "", "");
250    this->replyToken->setPosition("", i+2);
251    break;
252      }
253    }
254
255    cinfo.clear(); // Clear for next collection to use (if there is one).
256  }
257}
258
259//--------------------------------------------------------------------------------------------
260// Returns true if at least one document record is found
261void abstractlistaction::output_content_for_all(ostream &output, recptproto *protocol, oaiargs &params)
262{
263  ColInfoResponse_t cinfo;
264  comerror_t        err;
265  text_tarray       collections;
266  text_t            gsdlCollect = "";
267  ResumptionToken   *token = NULL;
268
269  // get a list of the collections available
270  collections = this->configuration->getCollectionsList();
271  //  protocol->get_collection_list(collections, err, output);
272 
273  if (params["resumptionToken"] != "") {
274    token = new ResumptionToken(params["resumptionToken"]);
275  }
276
277  for(int current_col = 0; current_col < collections.size(); ++current_col){
278    gsdlCollect = collections[current_col];
279
280    // ignore all leading collections before the one that matches the resumptiontoken
281    if (token != NULL &&
282    token->getCollection() != gsdlCollect)
283    { continue;
284    }
285
286    this->output_content_for_col(output, protocol, gsdlCollect, cinfo, err, params);
287
288    // once we've output at least one collection, continue
289    // outputting all others until the resumption total hits
290    token = NULL;
291
292    if (this->outputDocs == this->configuration->resumeAfter()) {
293      break;
294    }
295  }
296}
297
298//-------------------------------------------------------------------------------------------------
299// Check that the requested from/until dates don't include a time, as this would be asking for too
300// fine a level of granularity, one that greenstone doesn't support. An OAI error must be thrown.
301/*
302bool abstractlistaction::granularityTooFine(text_t &from, text_t &until)
303{
304  if (from != "" && from.){
305   
306  }
307 
308}
309*/
310//-------------------------------------------------------------------------------------------------
311
312bool abstractlistaction::check_classifier(recptproto *protocol, const text_t &collection,
313                      const text_t &classifier)
314{ text_t topClass;
315  FilterResponse_t response;
316  text_tset        metadata;
317  ofstream         logout("oai.log", ios::app);
318
319  // exclude false children of a top-level classifier immediately...
320  if (!get_info(classifier, collection, "", metadata, false, protocol, response, logout)) {
321    return false;
322  }
323 
324  // now check the top-level parent
325  metadata.insert("supportsmemberof");
326
327  text_t::const_iterator dot = findchar(classifier.begin(), classifier.end(), '.');
328  if (dot != classifier.end()) {
329    topClass = substr(classifier.begin(), dot);
330  }
331  else {
332    topClass = classifier;
333  }
334
335  if (!get_info(topClass, collection, "", metadata, false, protocol, response, logout)) {
336    return false;
337  }
338 
339  if (response.docInfo[0].metadata["supportsmemberof"].values.size() == 0) {
340    return false;
341  }
342
343  if (response.docInfo[0].metadata["supportsmemberof"].values[0] != "true") {
344    return false;
345  }
346
347  return true;
348}
349
350void abstractlistaction::recurse_set(ostream &output, recptproto *protocol, const text_t &collection,
351                     const text_t &classifier, oaiargs &params, ResumptionToken *resumptionToken)
352{
353  // metadata for this call
354  FilterResponse_t response;
355  text_tset        metadata;
356  ofstream         logout("oai.log", ios::app);
357  text_t           from = params["from"];
358  text_t           until = params["until"];
359  text_t           metadataPrefix = params["metadataPrefix"];
360  //  ResumptionToken  resumptionToken(params["resumptionToken"]);
361  int              startPos = 0;
362
363  // This is a recursive function, and so just because the current set is empty doesn't mean we necessarily
364  // want to throw a 'noRecordsMatch' error; another set (parent/sibling/child) may have had documents. It
365  // is therefore not enough to check that the response object in the current iteration has no docs - we
366  // must also verify that NO OTHER set has had any documents. This is done with the 'prevDocSeen' flag.
367  // It is set to FALSE initially, but as soon as we see a set that isn't empty, it is set to TRUE. The
368  // 'noRecordsMatch' error will only be thrown if, after all appropriate sets have been recursed into,
369  // the 'prevDocSeen' flag is still FALSE. The function returns false if no docs were seen, allowing us to
370  // throw the noRecordsMatch error.
371 
372  //  bool prevDocSeen = false;
373
374  get_children(classifier, collection, "", metadata, false, protocol, response, *this->logout);
375
376  if (params["resumptionToken"] != "") {
377    // if we're at a resumptionToken
378    if (classifier == resumptionToken->getNode()) {
379      startPos = resumptionToken->getPosition();
380    }
381    else {
382      text_t fullNode = resumptionToken->getNode();
383      text_t::iterator leafIter = fullNode.begin() + classifier.size();
384
385      // if the next character isn't a dot, blow up!
386      if (*leafIter != '.') {
387    // fatal error;
388    exit(1);
389      }
390     
391      // get the first '.' after the current classifier point;
392      text_t::iterator separator = findchar(leafIter + 1, fullNode.end(), '.');
393
394      // now, create a new subpath
395      text_t nextNode = substr(fullNode.begin(), separator);
396
397      // seek forward; TODO: improve performance of this
398      for (int c = 0; c < response.numDocs; ++c) {
399    if (response.docInfo[c].OID == nextNode) {
400      startPos = c;
401      break;
402    }
403      }
404    }
405
406    // We need to subtract one from the startPos value to turn it into an index value
407    startPos--;
408  }
409 
410  for (int c = startPos; c < response.numDocs; ++c) {
411    text_t child = response.docInfo[c].OID;
412   
413    // distinguish classifiers and documents by checking whether OID
414    // starts with CL or not
415    text_t childHead;
416    text_t::const_iterator start = child.begin();
417    text_t::const_iterator here  = child.begin();
418    here += 2;
419    childHead = substr(start, here);
420   
421    // documents we output now
422    if (childHead != "CL") {
423      // Check that the item with the 0ID 'gsdl_id' has a lastmodified field that fits
424      // within the required date range (if specified)
425      if (this->inDateRange(from, until, collection, child, protocol, output)) {
426    // TODO: check that the document can be disseminated in the required metadataPrefix
427
428    if (this->output_document(output, protocol, collection, child, metadataPrefix)) {
429      this->prevDocSeen = true;
430      ++this->outputDocs;
431    }
432      }
433    }
434    // children which are classifiers are recursed
435    else {
436      if (resumptionToken != NULL) {
437    int depth = countchar(classifier.begin(), classifier.end(), '.');
438    resumptionToken->setOffset(depth, c+2);
439      }
440      this->recurse_set(output, protocol, collection, child, params, resumptionToken);
441    }
442
443    if (this->outputDocs == this->configuration->resumeAfter()) {
444      this->replyToken = new ResumptionToken(collection, params["set"], "");
445      this->replyToken->setPosition(classifier, c+2);     
446      break;
447    }
448  }
449}
450
451
452
453
Note: See TracBrowser for help on using the browser.