source: trunk/gsdl/src/oaiservr/abstractlistaction.cpp@ 8182

Last change on this file since 8182 was 8182, checked in by cs025, 20 years ago

Added OAI Server code to Greenstone

  • Property svn:keywords set to Author Date Id Revision
File size: 14.1 KB
Line 
1#include "abstractlistaction.h"
2#include "OIDtools.h"
3
4#include "oaitools.h"
5
// Maximum number of records written into a single response.  Once this many documents have been
// output, output_content_for_col() and recurse_set() prepare a resumptionToken for the reply.
#define MAXRECORDS 10
7
8bool abstractlistaction::validateAction(recptproto *protocol, oaiargs &params, int &numArgs)
9{
10 text_t from = params["from"];
11 text_t until = params["until"];
12
13 // from date must be less than, or equal to, until date
14 if ((from != "") && (until != "") && !(from <= until)){
15 this->errorType = "badArgument";
16 return false;
17 }
18
19 if (from != ""){
20 // Must be in the form YYYY-MM-DD
21 if(from.size() != 10){
22 this->errorType = "badArgument";
23 return false;
24 }
25 else{
26 if(from[4] != '-' || from[7] != '-'){
27 this->errorType = "badArgument";
28 return false;
29 }
30 }
31 numArgs ++; // Increase valid args count
32 }
33
34 if (until != ""){
35 // Must be in the form YYYY-MM-DD
36 if(until.size() != 10){
37 this->errorType = "badArgument";
38 return false;
39 }
40 else{
41 if(until[4] != '-' || until[7] != '-'){
42 this->errorType = "badArgument";
43 return false;
44 }
45 }
46 numArgs ++; // Increase valid args count
47 }
48
49 if (params["set"] != "") {
50 text_t gsdlSet = params["set"];
51 text_t gsdlCollect = "";
52
53 // given 'demo:CL2', toGSDL returns 'demo' in gsdlCollect and 'CL2' in gsdlSet. If there is no further
54 // set specified after the name of the collection however, then gsdlSet is empty.
55 oaiclassifier::toGSDL(gsdlCollect, gsdlSet);
56
57 comerror_t err;
58 ColInfoResponse_t cinfo;
59
60 // check that the collection is accessible
61 protocol->get_collectinfo(gsdlCollect, cinfo, err, cerr);
62 if (err != noError) {
63 this->errorType = "badArgument";
64 return false;
65 }
66
67 // check the child set if it is given
68 if (gsdlSet != "") {
69 if (!this->check_classifier(protocol, gsdlCollect, gsdlSet)) {
70 this->errorType = "badArgument";
71 return false;
72 }
73 }
74 numArgs ++;
75 }
76
77 if (params["resumptionToken"] != "") {
78 ResumptionToken token(params["resumptionToken"]);
79
80 if (!token.isValid()) {
81 this->errorType = "badResumptionToken";
82 return false;
83 }
84 numArgs ++;
85 }
86
87 this->errorType = "";
88 return true;
89}
90
91//--------------------------------------------------------------------------------------------------
92
93bool abstractlistaction::output_content(ostream &output, recptproto *protocol, oaiargs &params)
94{
95 text_t from = params["from"];
96 text_t until = params["until"];
97 text_t metaFormat = params["metadataPrefix"];
98 bool prevDocSeen;
99 ResumptionToken *token = NULL;
100
101 // start the call; clear down the total number of output documents
102 this->outputDocs = 0;
103
104 // We don't actually handle resumptionTokens yet; if we get one, ignore it
105 if (params["resumptionToken"] != "") {
106 token = new ResumptionToken(params["resumptionToken"]);
107 }
108
109 // if we've been asked for a set, then use it!
110 if (params["set"] != "") {
111 // get the children of this set
112 text_t gsdlSet = params["set"];
113 text_t gsdlCollect = "";
114
115 // given 'demo:CL2', toGSDL returns 'demo' in gsdlCollect and 'CL2' in gsdlSet. If there is no further
116 // set specified after the name of the collection however, then gsdlSet is empty.
117 oaiclassifier::toGSDL(gsdlCollect, gsdlSet);
118
119 // If gsdlSet is empty, then the user is requesting all the identifiers for the collection, so
120 // we simply output all docs via their oai_id tag. But if a specific subset IS requested, then
121 // use recurse_set() to traverse any sub classifiers to find the relevant docs.
122 if(gsdlSet == ""){
123 ColInfoResponse_t cinfo;
124 comerror_t err;
125 this->output_content_for_col(output, protocol, gsdlCollect, cinfo, err, params);
126 }
127 else {
128 if (this->check_classifier(protocol, gsdlCollect, gsdlSet)) {
129 this->recurse_set(output, protocol, gsdlCollect, gsdlSet, params);
130 }
131 }
132 }
133 // output all records in all hierarchies
134 else {
135 this->output_content_for_all(output, protocol, params);
136 }
137
138 // If - regardless of set required - no documents have been seen, throw an error.
139 if (this->configuration->getOAIVersion() >= 200 && this->prevDocSeen == false) {
140 errorType = "noRecordsMatch";
141 this->output_error(output, errorType);
142
143 return false;
144 }
145
146 // do a resumption token if required
147 if (this->replyToken != NULL) {
148 output << "<resumptionToken>";
149 output << this->replyToken->getToken();
150 output << "</resumptionToken>";
151 }
152
153 return true;
154}
155
156//--------------------------------------------------------------------------------------------------
157
158void abstractlistaction::output_content_for_col(ostream &output, recptproto *protocol, text_t &gsdlCollect,
159 ColInfoResponse_t &cinfo, comerror_t &err, oaiargs &params)
160{ int startDoc = 0;
161
162 // get the collection information
163 protocol->get_collectinfo(gsdlCollect, cinfo, err, *this->logout);
164
165 // check resumption token
166 if (params["resumptionToken"] != "") {
167 ResumptionToken token(params["resumptionToken"]);
168 if (token.getCollection() == gsdlCollect) {
169 startDoc = token.getPosition() - 1; // first document is said to be 1..
170 }
171 }
172
173 // If numDocs is 0, do nothing - this->prevDocSeen will stay false if this is the only collection
174 // looked at, or will keep whatever value it had prior to this col (ensures that if the flag has
175 // been set to true by a previous collection that this won't overwrite it to be false).
176 if (cinfo.numDocs > 0) {
177 int errorCount = 0; // Count the number of errors found in the given collection
178 text_t from = params["from"];
179 text_t until = params["until"];
180
181 for (long i = startDoc; i < cinfo.numDocs; i++) {
182 if (errorCount > 3) { // If num errors reaches the cut-off value, bail.
183 cerr << "Error: too many records(" << errorCount << ") in the " << gsdlCollect
184 << " collection have invalid or non-existant oai_ids - skipping remainder of collection.\n";
185 return;
186 }
187
188 text_t oai_id = "oai.";
189 oai_id += i;
190
191 text_t gsdl_id = oaiclassifier::getGSDL_OID(gsdlCollect, oai_id, protocol, *this->logout);
192
193 if (gsdl_id == "") { // If the string is empty, then the document didn't have an oai_id, so
194 errorCount ++; // increase error count
195 continue;
196 }
197
198 // Check that the item with the HASH ID 'gsdl_id' has a lastmodified field that fits within
199 // the required date range (if specified).
200 if (this->inDateRange(params["from"], params["until"], gsdlCollect, gsdl_id, protocol, output)) {
201 if (this->output_document(output, protocol, gsdlCollect, gsdl_id, params["metadataPrefix"])) {
202 // this should be an IF statement, where prevDocSeen is only set to true if the above
203 // function call returns true (indicating that the doc supported the metadata prefix) but
204 // for some reason this is always false. This means that if no doc in the requested set supports
205 // the metadata format, the "no records match" error that should be thrown won't be...
206 //
207 // GRB: the above comment is no longer true; proper checks are made
208 this->prevDocSeen = true;
209 this->outputDocs ++;
210 }
211 }
212
213 // if we've output MAXRECORDS documents; prepare a resumptionToken
214 if (this->outputDocs == MAXRECORDS) {
215 this->replyToken = new ResumptionToken(gsdlCollect, "", "");
216 this->replyToken->setPosition("", i+2);
217 break;
218 }
219 }
220
221 cinfo.clear(); // Clear for next collection to use (if there is one).
222 }
223}
224
225//--------------------------------------------------------------------------------------------
226// Returns true if at least one document record is found
227void abstractlistaction::output_content_for_all(ostream &output, recptproto *protocol, oaiargs &params)
228{
229 ColInfoResponse_t cinfo;
230 comerror_t err;
231 text_tarray collections;
232 text_t gsdlCollect = "";
233
234 // get a list of the collections available
235 protocol->get_collection_list(collections, err, output);
236
237 for(int current_col = 0; current_col < collections.size(); current_col++){
238 gsdlCollect = collections[current_col];
239 this->output_content_for_col(output, protocol, gsdlCollect, cinfo, err, params);
240 }
241}
242
243//-------------------------------------------------------------------------------------------------
244// Check that the requested from/until dates don't include a time, as this would be asking for too
245// fine a level of granularity, one that greenstone doesn't support. An OAI error must be thrown.
246/*
247bool abstractlistaction::granularityTooFine(text_t &from, text_t &until)
248{
249 if (from != "" && from.){
250
251 }
252
253}
254*/
255//-------------------------------------------------------------------------------------------------
256
257bool abstractlistaction::check_classifier(recptproto *protocol, const text_t &collection,
258 const text_t &classifier)
259{ text_t topClass;
260 FilterResponse_t response;
261 text_tset metadata;
262 ofstream logout("grb.log", ios::app);
263
264 // exclude false children of a top-level classifier immediately...
265 if (!get_info(classifier, collection, "", metadata, false, protocol, response, logout)) {
266 return false;
267 }
268
269 // now check the top-level parent
270 metadata.insert("supportsmemberof");
271
272 text_t::const_iterator dot = findchar(classifier.begin(), classifier.end(), '.');
273 if (dot != classifier.end()) {
274 topClass = substr(classifier.begin(), dot);
275 }
276 else {
277 topClass = classifier;
278 }
279
280 if (!get_info(topClass, collection, "", metadata, false, protocol, response, logout)) {
281 return false;
282 }
283
284 if (response.docInfo[0].metadata["supportsmemberof"].values.size() == 0) {
285 return false;
286 }
287
288 if (response.docInfo[0].metadata["supportsmemberof"].values[0] != "true") {
289 return false;
290 }
291
292 return true;
293}
294
295void abstractlistaction::recurse_set(ostream &output, recptproto *protocol, const text_t &collection,
296 const text_t &classifier, oaiargs &params)
297{
298 // metadata for this call
299 FilterResponse_t response;
300 text_tset metadata;
301 ofstream logout("grb.log", ios::app);
302 text_t from = params["from"];
303 text_t until = params["until"];
304 text_t metadataPrefix = params["metadataPrefix"];
305 ResumptionToken resumptionToken(params["resumptionToken"]);
306 int startPos = 0;
307
308 // This is a recursive function, and so just because the current set is empty doesn't mean we necessarily
309 // want to throw a 'noRecordsMatch' error; another set (parent/sibling/child) may have had documents. It
310 // is therefore not enough to check that the response object in the current iteration has no docs - we
311 // must also verify that NO OTHER set has had any documents. This is done with the 'prevDocSeen' flag.
312 // It is set to FALSE initially, but as soon as we see a set that isn't empty, it is set to TRUE. The
313 // 'noRecordsMatch' error will only be thrown if, after all appropriate sets have been recursed into,
314 // the 'prevDocSeen' flag is still FALSE. The function returns false if no docs were seen, allowing us to
315 // throw the noRecordsMatch error.
316
317 // bool prevDocSeen = false;
318
319 get_children(classifier, collection, "", metadata, false, protocol, response, *this->logout);
320
321 if (params["resumptionToken"] != "") {
322 // if we're at a resumptionToken
323 if (classifier == resumptionToken.getNode()) {
324 startPos = resumptionToken.getPosition();
325 }
326 else {
327 text_t fullNode = resumptionToken.getNode();
328 text_t::iterator leafIter = fullNode.begin() + classifier.size();
329
330 // if the next character isn't a dot, blow up!
331 if (*leafIter != '.') {
332 // fatal error;
333 exit(1);
334 }
335
336 // get the first '.' after the current classifier point;
337 text_t::iterator separator = findchar(leafIter + 1, fullNode.end(), '.');
338
339 // now, create a new subpath
340 text_t nextNode = substr(fullNode.begin(), separator);
341
342 // seek forward; TODO: improve performance of this
343 for (int c = 0; c < response.numDocs; c ++) {
344 if (response.docInfo[c].OID == nextNode) {
345 startPos = c;
346 break;
347 }
348 }
349 }
350 }
351
352 cout << classifier << " " << startPos << endl;
353
354 for (int c = startPos; c < response.numDocs; c ++) {
355 text_t child = response.docInfo[c].OID;
356
357 // check for HASH items and exclude them
358 text_t childHead;
359 text_t::const_iterator start = child.begin();
360 text_t::const_iterator here = child.begin();
361 here += 4;
362 childHead = substr(start, here);
363
364 // documents we output now
365 if (childHead == "HASH") {
366 // Check that the item with the HASH ID 'gsdl_id' has a lastmodified field that fits
367 // within the required date range (if specified)
368 if (this->inDateRange(from, until, collection, child, protocol, output)) {
369 // TODO: check that the document can be disseminated in the required metadataPrefix
370
371 if (this->output_document(output, protocol, collection, child, metadataPrefix)) {
372 this->prevDocSeen = true;
373 this->outputDocs ++;
374 }
375 }
376 }
377 // children which are classifiers are recursed
378 else {
379 this->recurse_set(output, protocol, collection, child, params);
380 }
381
382 if (this->outputDocs == MAXRECORDS) {
383 this->replyToken = new ResumptionToken(collection, params["set"], "");
384 this->replyToken->setPosition(classifier, c+2);
385 }
386 }
387}
388
389/*
390bool listrecsaction::recurse_set(ostream &output, recptproto *protocol, const text_t &collection, const text_t &classifier, const text_t &metadataPrefix)
391{
392 FilterResponse_t response;
393 text_tset metadata;
394 ofstream logout("grb.log", ios::app);
395 bool prevDocSeen = false;
396
397 get_children(classifier, collection, metadata, false, protocol, response, logout);
398
399 for (int c = 0; c < response.numDocs; c ++) {
400 text_t child = response.docInfo[c].OID;
401
402 // check for HASH items and exclude them
403 text_t childHead;
404 text_t::const_iterator start = child.begin();
405 text_t::const_iterator here = child.begin();
406 here += 4;
407 childHead = substr(start, here);
408
409 // documents we output now
410 if (childHead == "HASH") {
411 this->output_document(output, protocol, collection, child, metadataPrefix);
412 this->outputDocs ++;
413 prevDocSeen = true;
414 }
415 // children which are classifiers are recursed
416 else {
417 prevDocSeen = this->recurse_set(output, protocol, collection, child, metadataPrefix);
418 }
419 }
420 return prevDocSeen;
421}
422*/
423
424
425
426
Note: See TracBrowser for help on using the repository browser.