source: gsdl/trunk/runtime-src/src/oaiservr/abstractlistaction.cpp@ 16708

Last change on this file since 16708 was 16708, checked in by mdewsnip, 16 years ago

Changed the resumptionToken tags to not have any whitespace around the resumption tokens, because this confuses harvesters/validators

  • Property svn:keywords set to Author Date Id Revision
File size: 15.0 KB
Line 
1#include "abstractlistaction.h"
2#include "recptprototools.h"
3
4#include "oaitools.h"
5
6bool abstractlistaction::validateAction(recptproto *protocol, oaiargs &params, int &numArgs)
7{
8 // Remove any parameters that aren't valid for this action
9 text_tmap::const_iterator param_iterator = params.begin();
10 while (param_iterator != params.end())
11 {
12 if (param_iterator->first != "verb" &&
13 param_iterator->first != "from" &&
14 param_iterator->first != "until" &&
15 param_iterator->first != "set" &&
16 param_iterator->first != "resumptionToken" &&
17 param_iterator->first != "metadataPrefix")
18 {
19 params.erase(param_iterator->first);
20 }
21
22 param_iterator++;
23 }
24
25 text_t from = params["from"];
26 text_t until = params["until"];
27
28 // from date must be less than, or equal to, until date
29 if ((from != "") && (until != "") && !(from <= until)){
30 this->errorType = "badArgument";
31 return false;
32 }
33
34 if (from != ""){
35 // Must be in the form YYYY-MM-DD
36 if(from.size() != 10){
37 this->errorType = "badArgument";
38 params.erase("from");
39 }
40 else{
41 if(from[4] != '-' || from[7] != '-'){
42 this->errorType = "badArgument";
43 params.erase("from");
44 }
45 }
46 ++numArgs; // Increase valid args count
47 }
48
49 if (until != ""){
50 // Must be in the form YYYY-MM-DD
51 if(until.size() != 10){
52 this->errorType = "badArgument";
53 params.erase("until");
54 }
55 else{
56 if(until[4] != '-' || until[7] != '-'){
57 this->errorType = "badArgument";
58 params.erase("until");
59 }
60 }
61 ++numArgs; // Increase valid args count
62 }
63
64 if (this->errorType == "badArgument")
65 {
66 return false;
67 }
68
69 if (params["set"] != "") {
70 text_t gsdlSet = params["set"];
71 text_t gsdlCollect = "";
72
73 // given 'demo:CL2', toGSDL returns 'demo' in gsdlCollect and 'CL2' in gsdlSet. If there is no further
74 // set specified after the name of the collection however, then gsdlSet is empty.
75 oaiclassifier::toGSDL(gsdlCollect, gsdlSet);
76
77 comerror_t err;
78 ColInfoResponse_t cinfo;
79
80 // check that the collection is accessible
81 protocol->get_collectinfo(gsdlCollect, cinfo, err, cerr);
82 if (err != noError) {
83 this->errorType = "badArgument";
84 return false;
85 }
86
87 // exclude collections that are not listed in the configured OAI list
88 text_tarray &collections = this->configuration->getCollectionsList();
89 int c;
90 for (c = 0; c < collections.size(); c ++) {
91 if (collections[c] == gsdlCollect)
92 break;
93 }
94 if (c == collections.size()) {
95 this->errorType = "badArgument";
96 return false;
97 }
98
99 if (gsdlSet != "") {
100 // check the child set if it is given
101 if (!this->check_classifier(protocol, gsdlCollect, gsdlSet)) {
102 this->errorType = "badArgument";
103 return false;
104 }
105 }
106 ++numArgs;
107 }
108
109 if (params["resumptionToken"] != "") {
110 ResumptionToken token(params["resumptionToken"]);
111
112 if (!token.isValid()) {
113 this->errorType = "badResumptionToken";
114 return false;
115 }
116 ++numArgs;
117 }
118
119 this->errorType = "";
120 return true;
121}
122
123//--------------------------------------------------------------------------------------------------
124
125bool abstractlistaction::output_content(ostream &output, recptproto *protocol, oaiargs &params)
126{
127 text_t from = params["from"];
128 text_t until = params["until"];
129 text_t metaFormat = params["metadataPrefix"];
130 bool prevDocSeen;
131 ResumptionToken *token = NULL;
132
133 // start the call; clear down the total number of output documents
134 this->outputDocs = 0;
135
136 // We don't actually handle resumptionTokens yet; if we get one, ignore it
137 if (params["resumptionToken"] != "") {
138 token = new ResumptionToken(params["resumptionToken"]);
139 }
140
141 this->replyToken = NULL;
142
143 // if we've been asked for a set, then use it!
144 if (params["set"] != "") {
145 // get the children of this set
146 text_t gsdlSet = params["set"];
147 text_t gsdlCollect = "";
148
149 // given 'demo:CL2', toGSDL returns 'demo' in gsdlCollect and 'CL2' in gsdlSet. If there is no further
150 // set specified after the name of the collection however, then gsdlSet is empty.
151 oaiclassifier::toGSDL(gsdlCollect, gsdlSet);
152
153 // If gsdlSet is empty, then the user is requesting all the identifiers for the collection, so
154 // we simply output all docs via their oai_id tag. But if a specific subset IS requested, then
155 // use recurse_set() to traverse any sub classifiers to find the relevant docs.
156 if(gsdlSet == ""){
157 ColInfoResponse_t cinfo;
158 comerror_t err;
159 this->output_content_for_col(output, protocol, gsdlCollect, cinfo, err, params);
160 }
161 else {
162 if (this->check_classifier(protocol, gsdlCollect, gsdlSet)) {
163 this->recurse_set(output, protocol, gsdlCollect, gsdlSet, params, token);
164 }
165 }
166 }
167 // output all records in all hierarchies
168 else {
169 this->output_content_for_all(output, protocol, params);
170 }
171
172 // If - regardless of set required - no documents have been seen, throw an error.
173 if (this->configuration->getOAIVersion() >= 200 && this->prevDocSeen == false) {
174 errorType = "noRecordsMatch";
175 this->output_error(output, errorType);
176
177 return false;
178 }
179
180 // do a resumption token if required; errors cancel a token...
181 if (this->replyToken != NULL && this->errorType == "") {
182 // Don't add any whitespace around the resumption token as it can confuse harvesters/validators
183 output << " <resumptionToken>" << this->replyToken->getToken() << "</resumptionToken>" << endl;
184 }
185
186 return true;
187}
188
189//--------------------------------------------------------------------------------------------------
190
191void abstractlistaction::output_content_for_col(ostream &output, recptproto *protocol, text_t &gsdlCollect,
192 ColInfoResponse_t &cinfo, comerror_t &err, oaiargs &params)
193{ int startDoc = 0;
194
195 // get the collection information
196 protocol->get_collectinfo(gsdlCollect, cinfo, err, *this->logout);
197
198 // check resumption token
199 if (params["resumptionToken"] != "") {
200 ResumptionToken token(params["resumptionToken"]);
201 if (token.getCollection() == gsdlCollect) {
202 startDoc = token.getPosition() - 1; // first document is said to be 1..
203 }
204 }
205
206 // If numDocs is 0, do nothing - this->prevDocSeen will stay false if this is the only collection
207 // looked at, or will keep whatever value it had prior to this col (ensures that if the flag has
208 // been set to true by a previous collection that this won't overwrite it to be false).
209 if (cinfo.numDocs > 0) {
210 int errorCount = 0; // Count the number of errors found in the given collection
211 text_t from = params["from"];
212 text_t until = params["until"];
213
214 for (long i = startDoc; i < cinfo.numDocs; ++i) {
215 if (errorCount > 3) { // If num errors reaches the cut-off value, bail.
216 cerr << "Error: too many records(" << errorCount << ") in the " << gsdlCollect
217 << " collection have invalid or non-existant oai_ids - skipping remainder of collection.\n";
218 return;
219 }
220
221 text_t oai_id = "oai.";
222 oai_id += i;
223
224 text_t gsdl_id = oaiclassifier::getGSDL_OID(gsdlCollect, oai_id, protocol, *this->logout);
225
226 if (gsdl_id == "") { // If the string is empty, then the document didn't have an oai_id, so
227 ++errorCount; // increase error count
228 continue;
229 }
230
231
232 // Check that the item with the 0ID 'gsdl_id' has a lastmodified field that fits within
233 // the required date range (if specified).
234 if (this->inDateRange(params["from"], params["until"], gsdlCollect, gsdl_id, protocol, output)) {
235 if (this->output_document(output, protocol, gsdlCollect, gsdl_id, params["metadataPrefix"])) {
236 // this should be an IF statement, where prevDocSeen is only set to true if the above
237 // function call returns true (indicating that the doc supported the metadata prefix) but
238 // for some reason this is always false. This means that if no doc in the requested set supports
239 // the metadata format, the "no records match" error that should be thrown won't be...
240 //
241 // GRB: the above comment is no longer true; proper checks are made
242 this->prevDocSeen = true;
243 ++this->outputDocs;
244 }
245 }
246
247 // if we've output the number of resumption documents; prepare a resumptionToken
248 if (this->outputDocs == this->configuration->resumeAfter()) {
249 this->replyToken = new ResumptionToken(gsdlCollect, "", "");
250 this->replyToken->setPosition("", i+2);
251 break;
252 }
253 }
254
255 cinfo.clear(); // Clear for next collection to use (if there is one).
256 }
257}
258
259//--------------------------------------------------------------------------------------------
// Outputs the records of every configured collection. Nothing is returned; whether any
// document record was found is recorded in this->prevDocSeen.
261void abstractlistaction::output_content_for_all(ostream &output, recptproto *protocol, oaiargs &params)
262{
263 ColInfoResponse_t cinfo;
264 comerror_t err;
265 text_tarray collections;
266 text_t gsdlCollect = "";
267 ResumptionToken *token = NULL;
268
269 // get a list of the collections available
270 collections = this->configuration->getCollectionsList();
271 // protocol->get_collection_list(collections, err, output);
272
273 if (params["resumptionToken"] != "") {
274 token = new ResumptionToken(params["resumptionToken"]);
275 }
276
277 for(int current_col = 0; current_col < collections.size(); ++current_col){
278 gsdlCollect = collections[current_col];
279
280 // ignore all leading collections before the one that matches the resumptiontoken
281 if (token != NULL &&
282 token->getCollection() != gsdlCollect)
283 { continue;
284 }
285
286 this->output_content_for_col(output, protocol, gsdlCollect, cinfo, err, params);
287
288 // once we've output at least one collection, continue
289 // outputting all others until the resumption total hits
290 token = NULL;
291
292 if (this->outputDocs == this->configuration->resumeAfter()) {
293 break;
294 }
295 }
296}
297
298//-------------------------------------------------------------------------------------------------
299// Check that the requested from/until dates don't include a time, as this would be asking for too
300// fine a level of granularity, one that greenstone doesn't support. An OAI error must be thrown.
301/*
302bool abstractlistaction::granularityTooFine(text_t &from, text_t &until)
303{
304 if (from != "" && from.){
305
306 }
307
308}
309*/
310//-------------------------------------------------------------------------------------------------
311
312bool abstractlistaction::check_classifier(recptproto *protocol, const text_t &collection,
313 const text_t &classifier)
314{ text_t topClass;
315 FilterResponse_t response;
316 text_tset metadata;
317 ofstream logout("oai.log", ios::app);
318
319 // exclude false children of a top-level classifier immediately...
320 if (!get_info(classifier, collection, "", metadata, false, protocol, response, logout)) {
321 return false;
322 }
323
324 // now check the top-level parent
325 metadata.insert("supportsmemberof");
326
327 text_t::const_iterator dot = findchar(classifier.begin(), classifier.end(), '.');
328 if (dot != classifier.end()) {
329 topClass = substr(classifier.begin(), dot);
330 }
331 else {
332 topClass = classifier;
333 }
334
335 if (!get_info(topClass, collection, "", metadata, false, protocol, response, logout)) {
336 return false;
337 }
338
339 if (response.docInfo[0].metadata["supportsmemberof"].values.size() == 0) {
340 return false;
341 }
342
343 if (response.docInfo[0].metadata["supportsmemberof"].values[0] != "true") {
344 return false;
345 }
346
347 return true;
348}
349
350void abstractlistaction::recurse_set(ostream &output, recptproto *protocol, const text_t &collection,
351 const text_t &classifier, oaiargs &params, ResumptionToken *resumptionToken)
352{
353 // metadata for this call
354 FilterResponse_t response;
355 text_tset metadata;
356 ofstream logout("oai.log", ios::app);
357 text_t from = params["from"];
358 text_t until = params["until"];
359 text_t metadataPrefix = params["metadataPrefix"];
360 // ResumptionToken resumptionToken(params["resumptionToken"]);
361 int startPos = 0;
362
363 // This is a recursive function, and so just because the current set is empty doesn't mean we necessarily
364 // want to throw a 'noRecordsMatch' error; another set (parent/sibling/child) may have had documents. It
365 // is therefore not enough to check that the response object in the current iteration has no docs - we
366 // must also verify that NO OTHER set has had any documents. This is done with the 'prevDocSeen' flag.
367 // It is set to FALSE initially, but as soon as we see a set that isn't empty, it is set to TRUE. The
368 // 'noRecordsMatch' error will only be thrown if, after all appropriate sets have been recursed into,
369 // the 'prevDocSeen' flag is still FALSE. The function returns false if no docs were seen, allowing us to
370 // throw the noRecordsMatch error.
371
372 // bool prevDocSeen = false;
373
374 get_children(classifier, collection, "", metadata, false, protocol, response, *this->logout);
375
376 if (params["resumptionToken"] != "") {
377 // if we're at a resumptionToken
378 if (classifier == resumptionToken->getNode()) {
379 startPos = resumptionToken->getPosition();
380 }
381 else {
382 text_t fullNode = resumptionToken->getNode();
383 text_t::iterator leafIter = fullNode.begin() + classifier.size();
384
385 // if the next character isn't a dot, blow up!
386 if (*leafIter != '.') {
387 // fatal error;
388 exit(1);
389 }
390
391 // get the first '.' after the current classifier point;
392 text_t::iterator separator = findchar(leafIter + 1, fullNode.end(), '.');
393
394 // now, create a new subpath
395 text_t nextNode = substr(fullNode.begin(), separator);
396
397 // seek forward; TODO: improve performance of this
398 for (int c = 0; c < response.numDocs; ++c) {
399 if (response.docInfo[c].OID == nextNode) {
400 startPos = c;
401 break;
402 }
403 }
404 }
405
406 // We need to subtract one from the startPos value to turn it into an index value
407 startPos--;
408 }
409
410 for (int c = startPos; c < response.numDocs; ++c) {
411 text_t child = response.docInfo[c].OID;
412
413 // distinguish classifiers and documents by checking whether OID
414 // starts with CL or not
415 text_t childHead;
416 text_t::const_iterator start = child.begin();
417 text_t::const_iterator here = child.begin();
418 here += 2;
419 childHead = substr(start, here);
420
421 // documents we output now
422 if (childHead != "CL") {
423 // Check that the item with the 0ID 'gsdl_id' has a lastmodified field that fits
424 // within the required date range (if specified)
425 if (this->inDateRange(from, until, collection, child, protocol, output)) {
426 // TODO: check that the document can be disseminated in the required metadataPrefix
427
428 if (this->output_document(output, protocol, collection, child, metadataPrefix)) {
429 this->prevDocSeen = true;
430 ++this->outputDocs;
431 }
432 }
433 }
434 // children which are classifiers are recursed
435 else {
436 if (resumptionToken != NULL) {
437 int depth = countchar(classifier.begin(), classifier.end(), '.');
438 resumptionToken->setOffset(depth, c+2);
439 }
440 this->recurse_set(output, protocol, collection, child, params, resumptionToken);
441 }
442
443 if (this->outputDocs == this->configuration->resumeAfter()) {
444 this->replyToken = new ResumptionToken(collection, params["set"], "");
445 this->replyToken->setPosition(classifier, c+2);
446 break;
447 }
448 }
449}
450
451
452
453
Note: See TracBrowser for help on using the repository browser.