source: main/tags/2.80/gsdl/src/oaiservr/abstractlistaction.cpp@ 24527

Last change on this file since 24527 was 11733, checked in by grbuchan, 18 years ago

Added further checks for use of collection names that are not configured
for OAI support.

  • Property svn:keywords set to Author Date Id Revision
File size: 14.3 KB
Line 
1#include "abstractlistaction.h"
2#include "OIDtools.h"
3
4#include "oaitools.h"
5
6bool abstractlistaction::validateAction(recptproto *protocol, oaiargs &params, int &numArgs)
7{
8 text_t from = params["from"];
9 text_t until = params["until"];
10
11 // from date must be less than, or equal to, until date
12 if ((from != "") && (until != "") && !(from <= until)){
13 this->errorType = "badArgument";
14 return false;
15 }
16
17 if (from != ""){
18 // Must be in the form YYYY-MM-DD
19 if(from.size() != 10){
20 this->errorType = "badArgument";
21 return false;
22 }
23 else{
24 if(from[4] != '-' || from[7] != '-'){
25 this->errorType = "badArgument";
26 return false;
27 }
28 }
29 ++numArgs; // Increase valid args count
30 }
31
32 if (until != ""){
33 // Must be in the form YYYY-MM-DD
34 if(until.size() != 10){
35 this->errorType = "badArgument";
36 return false;
37 }
38 else{
39 if(until[4] != '-' || until[7] != '-'){
40 this->errorType = "badArgument";
41 return false;
42 }
43 }
44 ++numArgs; // Increase valid args count
45 }
46
47 if (params["set"] != "") {
48 text_t gsdlSet = params["set"];
49 text_t gsdlCollect = "";
50
51 // given 'demo:CL2', toGSDL returns 'demo' in gsdlCollect and 'CL2' in gsdlSet. If there is no further
52 // set specified after the name of the collection however, then gsdlSet is empty.
53 oaiclassifier::toGSDL(gsdlCollect, gsdlSet);
54
55 comerror_t err;
56 ColInfoResponse_t cinfo;
57
58 // check that the collection is accessible
59 protocol->get_collectinfo(gsdlCollect, cinfo, err, cerr);
60 if (err != noError) {
61 this->errorType = "badArgument";
62 return false;
63 }
64
65 // exclude collections that are not listed in the configured OAI list
66 text_tarray &collections = this->configuration->getCollectionsList();
67 int c;
68 for (c = 0; c < collections.size(); c ++) {
69 if (collections[c] == gsdlCollect)
70 break;
71 }
72 if (c == collections.size()) {
73 this->errorType = "badArgument";
74 return false;
75 }
76
77 if (gsdlSet != "") {
78 // check the child set if it is given
79 if (!this->check_classifier(protocol, gsdlCollect, gsdlSet)) {
80 this->errorType = "badArgument";
81 return false;
82 }
83 }
84 ++numArgs;
85 }
86
87 if (params["resumptionToken"] != "") {
88 ResumptionToken token(params["resumptionToken"]);
89
90 if (!token.isValid()) {
91 this->errorType = "badResumptionToken";
92 return false;
93 }
94 ++numArgs;
95 }
96
97 this->errorType = "";
98 return true;
99}
100
101//--------------------------------------------------------------------------------------------------
102
103bool abstractlistaction::output_content(ostream &output, recptproto *protocol, oaiargs &params)
104{
105 text_t from = params["from"];
106 text_t until = params["until"];
107 text_t metaFormat = params["metadataPrefix"];
108 bool prevDocSeen;
109 ResumptionToken *token = NULL;
110
111 // start the call; clear down the total number of output documents
112 this->outputDocs = 0;
113
114 // We don't actually handle resumptionTokens yet; if we get one, ignore it
115 if (params["resumptionToken"] != "") {
116 token = new ResumptionToken(params["resumptionToken"]);
117 }
118
119 this->replyToken = NULL;
120
121 // if we've been asked for a set, then use it!
122 if (params["set"] != "") {
123 // get the children of this set
124 text_t gsdlSet = params["set"];
125 text_t gsdlCollect = "";
126
127 // given 'demo:CL2', toGSDL returns 'demo' in gsdlCollect and 'CL2' in gsdlSet. If there is no further
128 // set specified after the name of the collection however, then gsdlSet is empty.
129 oaiclassifier::toGSDL(gsdlCollect, gsdlSet);
130
131 // If gsdlSet is empty, then the user is requesting all the identifiers for the collection, so
132 // we simply output all docs via their oai_id tag. But if a specific subset IS requested, then
133 // use recurse_set() to traverse any sub classifiers to find the relevant docs.
134 if(gsdlSet == ""){
135 ColInfoResponse_t cinfo;
136 comerror_t err;
137 this->output_content_for_col(output, protocol, gsdlCollect, cinfo, err, params);
138 }
139 else {
140 if (this->check_classifier(protocol, gsdlCollect, gsdlSet)) {
141 this->recurse_set(output, protocol, gsdlCollect, gsdlSet, params, token);
142 }
143 }
144 }
145 // output all records in all hierarchies
146 else {
147 this->output_content_for_all(output, protocol, params);
148 }
149
150 // If - regardless of set required - no documents have been seen, throw an error.
151 if (this->configuration->getOAIVersion() >= 200 && this->prevDocSeen == false) {
152 errorType = "noRecordsMatch";
153 this->output_error(output, errorType);
154
155 return false;
156 }
157
158 // do a resumption token if required; errors cancel a token...
159 if (this->replyToken != NULL && this->errorType == "") {
160 output << " <resumptionToken>" << endl;
161 output << " " << this->replyToken->getToken() << endl;
162 output << " </resumptionToken>" << endl;
163 }
164
165 return true;
166}
167
168//--------------------------------------------------------------------------------------------------
169
170void abstractlistaction::output_content_for_col(ostream &output, recptproto *protocol, text_t &gsdlCollect,
171 ColInfoResponse_t &cinfo, comerror_t &err, oaiargs &params)
172{ int startDoc = 0;
173
174 // get the collection information
175 protocol->get_collectinfo(gsdlCollect, cinfo, err, *this->logout);
176
177 // check resumption token
178 if (params["resumptionToken"] != "") {
179 ResumptionToken token(params["resumptionToken"]);
180 if (token.getCollection() == gsdlCollect) {
181 startDoc = token.getPosition() - 1; // first document is said to be 1..
182 }
183 }
184
185 // If numDocs is 0, do nothing - this->prevDocSeen will stay false if this is the only collection
186 // looked at, or will keep whatever value it had prior to this col (ensures that if the flag has
187 // been set to true by a previous collection that this won't overwrite it to be false).
188 if (cinfo.numDocs > 0) {
189 int errorCount = 0; // Count the number of errors found in the given collection
190 text_t from = params["from"];
191 text_t until = params["until"];
192
193 for (long i = startDoc; i < cinfo.numDocs; ++i) {
194 if (errorCount > 3) { // If num errors reaches the cut-off value, bail.
195 cerr << "Error: too many records(" << errorCount << ") in the " << gsdlCollect
196 << " collection have invalid or non-existant oai_ids - skipping remainder of collection.\n";
197 return;
198 }
199
200 text_t oai_id = "oai.";
201 oai_id += i;
202
203 text_t gsdl_id = oaiclassifier::getGSDL_OID(gsdlCollect, oai_id, protocol, *this->logout);
204
205 if (gsdl_id == "") { // If the string is empty, then the document didn't have an oai_id, so
206 ++errorCount; // increase error count
207 continue;
208 }
209
210
211 // Check that the item with the 0ID 'gsdl_id' has a lastmodified field that fits within
212 // the required date range (if specified).
213 if (this->inDateRange(params["from"], params["until"], gsdlCollect, gsdl_id, protocol, output)) {
214 if (this->output_document(output, protocol, gsdlCollect, gsdl_id, params["metadataPrefix"])) {
215 // this should be an IF statement, where prevDocSeen is only set to true if the above
216 // function call returns true (indicating that the doc supported the metadata prefix) but
217 // for some reason this is always false. This means that if no doc in the requested set supports
218 // the metadata format, the "no records match" error that should be thrown won't be...
219 //
220 // GRB: the above comment is no longer true; proper checks are made
221 this->prevDocSeen = true;
222 ++this->outputDocs;
223 }
224 }
225
226 // if we've output the number of resumption documents; prepare a resumptionToken
227 if (this->outputDocs == this->configuration->resumeAfter()) {
228 this->replyToken = new ResumptionToken(gsdlCollect, "", "");
229 this->replyToken->setPosition("", i+2);
230 break;
231 }
232 }
233
234 cinfo.clear(); // Clear for next collection to use (if there is one).
235 }
236}
237
238//--------------------------------------------------------------------------------------------
239// Returns true if at least one document record is found
240void abstractlistaction::output_content_for_all(ostream &output, recptproto *protocol, oaiargs &params)
241{
242 ColInfoResponse_t cinfo;
243 comerror_t err;
244 text_tarray collections;
245 text_t gsdlCollect = "";
246 ResumptionToken *token = NULL;
247
248 // get a list of the collections available
249 collections = this->configuration->getCollectionsList();
250 // protocol->get_collection_list(collections, err, output);
251
252 if (params["resumptionToken"] != "") {
253 token = new ResumptionToken(params["resumptionToken"]);
254 }
255
256 for(int current_col = 0; current_col < collections.size(); ++current_col){
257 gsdlCollect = collections[current_col];
258
259 // ignore all leading collections before the one that matches the resumptiontoken
260 if (token != NULL &&
261 token->getCollection() != gsdlCollect)
262 { continue;
263 }
264
265 this->output_content_for_col(output, protocol, gsdlCollect, cinfo, err, params);
266
267 // once we've output at least one collection, continue
268 // outputting all others until the resumption total hits
269 token = NULL;
270
271 if (this->outputDocs == this->configuration->resumeAfter()) {
272 break;
273 }
274 }
275}
276
277//-------------------------------------------------------------------------------------------------
278// Check that the requested from/until dates don't include a time, as this would be asking for too
279// fine a level of granularity, one that greenstone doesn't support. An OAI error must be thrown.
280/*
281bool abstractlistaction::granularityTooFine(text_t &from, text_t &until)
282{
283 if (from != "" && from.){
284
285 }
286
287}
288*/
289//-------------------------------------------------------------------------------------------------
290
291bool abstractlistaction::check_classifier(recptproto *protocol, const text_t &collection,
292 const text_t &classifier)
293{ text_t topClass;
294 FilterResponse_t response;
295 text_tset metadata;
296 ofstream logout("oai.log", ios::app);
297
298 // exclude false children of a top-level classifier immediately...
299 if (!get_info(classifier, collection, "", metadata, false, protocol, response, logout)) {
300 return false;
301 }
302
303 // now check the top-level parent
304 metadata.insert("supportsmemberof");
305
306 text_t::const_iterator dot = findchar(classifier.begin(), classifier.end(), '.');
307 if (dot != classifier.end()) {
308 topClass = substr(classifier.begin(), dot);
309 }
310 else {
311 topClass = classifier;
312 }
313
314 if (!get_info(topClass, collection, "", metadata, false, protocol, response, logout)) {
315 return false;
316 }
317
318 if (response.docInfo[0].metadata["supportsmemberof"].values.size() == 0) {
319 return false;
320 }
321
322 if (response.docInfo[0].metadata["supportsmemberof"].values[0] != "true") {
323 return false;
324 }
325
326 return true;
327}
328
329void abstractlistaction::recurse_set(ostream &output, recptproto *protocol, const text_t &collection,
330 const text_t &classifier, oaiargs &params, ResumptionToken *resumptionToken)
331{
332 // metadata for this call
333 FilterResponse_t response;
334 text_tset metadata;
335 ofstream logout("oai.log", ios::app);
336 text_t from = params["from"];
337 text_t until = params["until"];
338 text_t metadataPrefix = params["metadataPrefix"];
339 // ResumptionToken resumptionToken(params["resumptionToken"]);
340 int startPos = 0;
341
342 // This is a recursive function, and so just because the current set is empty doesn't mean we necessarily
343 // want to throw a 'noRecordsMatch' error; another set (parent/sibling/child) may have had documents. It
344 // is therefore not enough to check that the response object in the current iteration has no docs - we
345 // must also verify that NO OTHER set has had any documents. This is done with the 'prevDocSeen' flag.
346 // It is set to FALSE initially, but as soon as we see a set that isn't empty, it is set to TRUE. The
347 // 'noRecordsMatch' error will only be thrown if, after all appropriate sets have been recursed into,
348 // the 'prevDocSeen' flag is still FALSE. The function returns false if no docs were seen, allowing us to
349 // throw the noRecordsMatch error.
350
351 // bool prevDocSeen = false;
352
353 get_children(classifier, collection, "", metadata, false, protocol, response, *this->logout);
354
355 if (params["resumptionToken"] != "") {
356 // if we're at a resumptionToken
357 if (classifier == resumptionToken->getNode()) {
358 startPos = resumptionToken->getPosition();
359 }
360 else {
361 text_t fullNode = resumptionToken->getNode();
362 text_t::iterator leafIter = fullNode.begin() + classifier.size();
363
364 // if the next character isn't a dot, blow up!
365 if (*leafIter != '.') {
366 // fatal error;
367 exit(1);
368 }
369
370 // get the first '.' after the current classifier point;
371 text_t::iterator separator = findchar(leafIter + 1, fullNode.end(), '.');
372
373 // now, create a new subpath
374 text_t nextNode = substr(fullNode.begin(), separator);
375
376 // seek forward; TODO: improve performance of this
377 for (int c = 0; c < response.numDocs; ++c) {
378 if (response.docInfo[c].OID == nextNode) {
379 startPos = c;
380 break;
381 }
382 }
383 }
384 }
385
386 for (int c = startPos; c < response.numDocs; ++c) {
387 text_t child = response.docInfo[c].OID;
388
389 // distinguish classifiers and documents by checking whether OID
390 // starts with CL or not
391 text_t childHead;
392 text_t::const_iterator start = child.begin();
393 text_t::const_iterator here = child.begin();
394 here += 2;
395 childHead = substr(start, here);
396
397 // documents we output now
398 if (childHead != "CL") {
399 // Check that the item with the 0ID 'gsdl_id' has a lastmodified field that fits
400 // within the required date range (if specified)
401 if (this->inDateRange(from, until, collection, child, protocol, output)) {
402 // TODO: check that the document can be disseminated in the required metadataPrefix
403
404 if (this->output_document(output, protocol, collection, child, metadataPrefix)) {
405 this->prevDocSeen = true;
406 ++this->outputDocs;
407 }
408 }
409 }
410 // children which are classifiers are recursed
411 else {
412 if (resumptionToken != NULL) {
413 int depth = countchar(classifier.begin(), classifier.end(), '.');
414 resumptionToken->setOffset(depth, c+2);
415 }
416 this->recurse_set(output, protocol, collection, child, params, resumptionToken);
417 }
418
419 if (this->outputDocs == this->configuration->resumeAfter()) {
420 // this->replyToken = new ResumptionToken(collection, params["set"], "");
421 this->replyToken = resumptionToken;
422 this->replyToken->setPosition(classifier, c+2);
423 }
424 }
425}
426
427
428
429
Note: See TracBrowser for help on using the repository browser.