[8182] | 1 | #include "abstractlistaction.h"
|
---|
[15428] | 2 | #include "recptprototools.h"
|
---|
[8182] | 3 |
|
---|
| 4 | #include "oaitools.h"
|
---|
| 5 |
|
---|
| 6 | //--------------------------------------------------------------------------------------------------
|
---|
| 7 |
|
---|
| 8 | bool abstractlistaction::output_content(ostream &output, recptproto *protocol, oaiargs ¶ms)
|
---|
| 9 | {
|
---|
| 10 | bool prevDocSeen;
|
---|
| 11 | ResumptionToken *token = NULL;
|
---|
| 12 |
|
---|
| 13 | // start the call; clear down the total number of output documents
|
---|
| 14 | this->outputDocs = 0;
|
---|
| 15 |
|
---|
| 16 | // We don't actually handle resumptionTokens yet; if we get one, ignore it
|
---|
| 17 | if (params["resumptionToken"] != "") {
|
---|
| 18 | token = new ResumptionToken(params["resumptionToken"]);
|
---|
| 19 | }
|
---|
| 20 |
|
---|
[11732] | 21 | this->replyToken = NULL;
|
---|
| 22 |
|
---|
[8182] | 23 | // if we've been asked for a set, then use it!
|
---|
| 24 | if (params["set"] != "") {
|
---|
| 25 | // get the children of this set
|
---|
| 26 | text_t gsdlSet = params["set"];
|
---|
| 27 | text_t gsdlCollect = "";
|
---|
| 28 |
|
---|
| 29 | // given 'demo:CL2', toGSDL returns 'demo' in gsdlCollect and 'CL2' in gsdlSet. If there is no further
|
---|
| 30 | // set specified after the name of the collection however, then gsdlSet is empty.
|
---|
| 31 | oaiclassifier::toGSDL(gsdlCollect, gsdlSet);
|
---|
| 32 |
|
---|
| 33 | // If gsdlSet is empty, then the user is requesting all the identifiers for the collection, so
|
---|
| 34 | // we simply output all docs via their oai_id tag. But if a specific subset IS requested, then
|
---|
| 35 | // use recurse_set() to traverse any sub classifiers to find the relevant docs.
|
---|
| 36 | if(gsdlSet == ""){
|
---|
| 37 | ColInfoResponse_t cinfo;
|
---|
| 38 | comerror_t err;
|
---|
| 39 | this->output_content_for_col(output, protocol, gsdlCollect, cinfo, err, params);
|
---|
| 40 | }
|
---|
| 41 | else {
|
---|
| 42 | if (this->check_classifier(protocol, gsdlCollect, gsdlSet)) {
|
---|
[11732] | 43 | this->recurse_set(output, protocol, gsdlCollect, gsdlSet, params, token);
|
---|
[8182] | 44 | }
|
---|
| 45 | }
|
---|
| 46 | }
|
---|
| 47 | // output all records in all hierarchies
|
---|
| 48 | else {
|
---|
| 49 | this->output_content_for_all(output, protocol, params);
|
---|
| 50 | }
|
---|
| 51 |
|
---|
| 52 | // If - regardless of set required - no documents have been seen, throw an error.
|
---|
| 53 | if (this->configuration->getOAIVersion() >= 200 && this->prevDocSeen == false) {
|
---|
| 54 | errorType = "noRecordsMatch";
|
---|
| 55 | this->output_error(output, errorType);
|
---|
| 56 |
|
---|
| 57 | return false;
|
---|
| 58 | }
|
---|
| 59 |
|
---|
[8219] | 60 | // do a resumption token if required; errors cancel a token...
|
---|
[11732] | 61 | if (this->replyToken != NULL && this->errorType == "") {
|
---|
[16708] | 62 | // Don't add any whitespace around the resumption token as it can confuse harvesters/validators
|
---|
| 63 | output << " <resumptionToken>" << this->replyToken->getToken() << "</resumptionToken>" << endl;
|
---|
[8182] | 64 | }
|
---|
| 65 |
|
---|
| 66 | return true;
|
---|
| 67 | }
|
---|
| 68 |
|
---|
| 69 | //--------------------------------------------------------------------------------------------------
|
---|
| 70 |
|
---|
| 71 | void abstractlistaction::output_content_for_col(ostream &output, recptproto *protocol, text_t &gsdlCollect,
|
---|
| 72 | ColInfoResponse_t &cinfo, comerror_t &err, oaiargs ¶ms)
|
---|
| 73 | { int startDoc = 0;
|
---|
| 74 |
|
---|
| 75 | // get the collection information
|
---|
| 76 | protocol->get_collectinfo(gsdlCollect, cinfo, err, *this->logout);
|
---|
| 77 |
|
---|
| 78 | // check resumption token
|
---|
| 79 | if (params["resumptionToken"] != "") {
|
---|
| 80 | ResumptionToken token(params["resumptionToken"]);
|
---|
| 81 | if (token.getCollection() == gsdlCollect) {
|
---|
| 82 | startDoc = token.getPosition() - 1; // first document is said to be 1..
|
---|
| 83 | }
|
---|
| 84 | }
|
---|
| 85 |
|
---|
| 86 | // If numDocs is 0, do nothing - this->prevDocSeen will stay false if this is the only collection
|
---|
| 87 | // looked at, or will keep whatever value it had prior to this col (ensures that if the flag has
|
---|
| 88 | // been set to true by a previous collection that this won't overwrite it to be false).
|
---|
| 89 | if (cinfo.numDocs > 0) {
|
---|
| 90 | int errorCount = 0; // Count the number of errors found in the given collection
|
---|
| 91 |
|
---|
[9608] | 92 | for (long i = startDoc; i < cinfo.numDocs; ++i) {
|
---|
[8182] | 93 | if (errorCount > 3) { // If num errors reaches the cut-off value, bail.
|
---|
| 94 | cerr << "Error: too many records(" << errorCount << ") in the " << gsdlCollect
|
---|
| 95 | << " collection have invalid or non-existant oai_ids - skipping remainder of collection.\n";
|
---|
| 96 | return;
|
---|
| 97 | }
|
---|
| 98 |
|
---|
| 99 | text_t oai_id = "oai.";
|
---|
| 100 | oai_id += i;
|
---|
| 101 |
|
---|
| 102 | text_t gsdl_id = oaiclassifier::getGSDL_OID(gsdlCollect, oai_id, protocol, *this->logout);
|
---|
| 103 |
|
---|
| 104 | if (gsdl_id == "") { // If the string is empty, then the document didn't have an oai_id, so
|
---|
[9608] | 105 | ++errorCount; // increase error count
|
---|
[8182] | 106 | continue;
|
---|
| 107 | }
|
---|
[11732] | 108 |
|
---|
| 109 |
|
---|
[11311] | 110 | // Check that the item with the 0ID 'gsdl_id' has a lastmodified field that fits within
|
---|
[8182] | 111 | // the required date range (if specified).
|
---|
| 112 | if (this->inDateRange(params["from"], params["until"], gsdlCollect, gsdl_id, protocol, output)) {
|
---|
| 113 | if (this->output_document(output, protocol, gsdlCollect, gsdl_id, params["metadataPrefix"])) {
|
---|
| 114 | // this should be an IF statement, where prevDocSeen is only set to true if the above
|
---|
| 115 | // function call returns true (indicating that the doc supported the metadata prefix) but
|
---|
| 116 | // for some reason this is always false. This means that if no doc in the requested set supports
|
---|
| 117 | // the metadata format, the "no records match" error that should be thrown won't be...
|
---|
| 118 | //
|
---|
| 119 | // GRB: the above comment is no longer true; proper checks are made
|
---|
| 120 | this->prevDocSeen = true;
|
---|
[9608] | 121 | ++this->outputDocs;
|
---|
[8182] | 122 | }
|
---|
| 123 | }
|
---|
| 124 |
|
---|
[11732] | 125 | // if we've output the number of resumption documents; prepare a resumptionToken
|
---|
| 126 | if (this->outputDocs == this->configuration->resumeAfter()) {
|
---|
[8182] | 127 | this->replyToken = new ResumptionToken(gsdlCollect, "", "");
|
---|
| 128 | this->replyToken->setPosition("", i+2);
|
---|
| 129 | break;
|
---|
| 130 | }
|
---|
| 131 | }
|
---|
| 132 |
|
---|
| 133 | cinfo.clear(); // Clear for next collection to use (if there is one).
|
---|
| 134 | }
|
---|
| 135 | }
|
---|
| 136 |
|
---|
| 137 | //--------------------------------------------------------------------------------------------
|
---|
| 138 | // Returns true if at least one document record is found
|
---|
| 139 | void abstractlistaction::output_content_for_all(ostream &output, recptproto *protocol, oaiargs ¶ms)
|
---|
| 140 | {
|
---|
| 141 | ColInfoResponse_t cinfo;
|
---|
| 142 | comerror_t err;
|
---|
| 143 | text_tarray collections;
|
---|
| 144 | text_t gsdlCollect = "";
|
---|
[11732] | 145 | ResumptionToken *token = NULL;
|
---|
[8182] | 146 |
|
---|
| 147 | // get a list of the collections available
|
---|
[11732] | 148 | collections = this->configuration->getCollectionsList();
|
---|
| 149 | // protocol->get_collection_list(collections, err, output);
|
---|
[8182] | 150 |
|
---|
[11732] | 151 | if (params["resumptionToken"] != "") {
|
---|
| 152 | token = new ResumptionToken(params["resumptionToken"]);
|
---|
| 153 | }
|
---|
| 154 |
|
---|
[9608] | 155 | for(int current_col = 0; current_col < collections.size(); ++current_col){
|
---|
[8182] | 156 | gsdlCollect = collections[current_col];
|
---|
[11732] | 157 |
|
---|
| 158 | // ignore all leading collections before the one that matches the resumptiontoken
|
---|
| 159 | if (token != NULL &&
|
---|
| 160 | token->getCollection() != gsdlCollect)
|
---|
| 161 | { continue;
|
---|
| 162 | }
|
---|
| 163 |
|
---|
[8182] | 164 | this->output_content_for_col(output, protocol, gsdlCollect, cinfo, err, params);
|
---|
[11732] | 165 |
|
---|
| 166 | // once we've output at least one collection, continue
|
---|
| 167 | // outputting all others until the resumption total hits
|
---|
| 168 | token = NULL;
|
---|
| 169 |
|
---|
| 170 | if (this->outputDocs == this->configuration->resumeAfter()) {
|
---|
| 171 | break;
|
---|
| 172 | }
|
---|
[8182] | 173 | }
|
---|
| 174 | }
|
---|
| 175 |
|
---|
| 176 | //-------------------------------------------------------------------------------------------------
|
---|
| 177 | // Check that the requested from/until dates don't include a time, as this would be asking for too
|
---|
| 178 | // fine a level of granularity, one that greenstone doesn't support. An OAI error must be thrown.
|
---|
| 179 | /*
|
---|
| 180 | bool abstractlistaction::granularityTooFine(text_t &from, text_t &until)
|
---|
| 181 | {
|
---|
| 182 | if (from != "" && from.){
|
---|
| 183 |
|
---|
| 184 | }
|
---|
| 185 |
|
---|
| 186 | }
|
---|
| 187 | */
|
---|
| 188 | //-------------------------------------------------------------------------------------------------
|
---|
| 189 |
|
---|
| 190 | bool abstractlistaction::check_classifier(recptproto *protocol, const text_t &collection,
|
---|
| 191 | const text_t &classifier)
|
---|
| 192 | { text_t topClass;
|
---|
| 193 | FilterResponse_t response;
|
---|
| 194 | text_tset metadata;
|
---|
[8303] | 195 | ofstream logout("oai.log", ios::app);
|
---|
[8182] | 196 |
|
---|
| 197 | // exclude false children of a top-level classifier immediately...
|
---|
| 198 | if (!get_info(classifier, collection, "", metadata, false, protocol, response, logout)) {
|
---|
| 199 | return false;
|
---|
| 200 | }
|
---|
| 201 |
|
---|
| 202 | // now check the top-level parent
|
---|
| 203 | metadata.insert("supportsmemberof");
|
---|
| 204 |
|
---|
| 205 | text_t::const_iterator dot = findchar(classifier.begin(), classifier.end(), '.');
|
---|
| 206 | if (dot != classifier.end()) {
|
---|
| 207 | topClass = substr(classifier.begin(), dot);
|
---|
| 208 | }
|
---|
| 209 | else {
|
---|
| 210 | topClass = classifier;
|
---|
| 211 | }
|
---|
| 212 |
|
---|
| 213 | if (!get_info(topClass, collection, "", metadata, false, protocol, response, logout)) {
|
---|
| 214 | return false;
|
---|
| 215 | }
|
---|
| 216 |
|
---|
| 217 | if (response.docInfo[0].metadata["supportsmemberof"].values.size() == 0) {
|
---|
| 218 | return false;
|
---|
| 219 | }
|
---|
| 220 |
|
---|
| 221 | if (response.docInfo[0].metadata["supportsmemberof"].values[0] != "true") {
|
---|
| 222 | return false;
|
---|
| 223 | }
|
---|
| 224 |
|
---|
| 225 | return true;
|
---|
| 226 | }
|
---|
| 227 |
|
---|
| 228 | void abstractlistaction::recurse_set(ostream &output, recptproto *protocol, const text_t &collection,
|
---|
[11732] | 229 | const text_t &classifier, oaiargs ¶ms, ResumptionToken *resumptionToken)
|
---|
[8182] | 230 | {
|
---|
| 231 | // metadata for this call
|
---|
| 232 | FilterResponse_t response;
|
---|
| 233 | text_tset metadata;
|
---|
[8303] | 234 | ofstream logout("oai.log", ios::app);
|
---|
[8182] | 235 | text_t from = params["from"];
|
---|
| 236 | text_t until = params["until"];
|
---|
| 237 | text_t metadataPrefix = params["metadataPrefix"];
|
---|
[11732] | 238 | // ResumptionToken resumptionToken(params["resumptionToken"]);
|
---|
[8182] | 239 | int startPos = 0;
|
---|
| 240 |
|
---|
| 241 | // This is a recursive function, and so just because the current set is empty doesn't mean we necessarily
|
---|
| 242 | // want to throw a 'noRecordsMatch' error; another set (parent/sibling/child) may have had documents. It
|
---|
| 243 | // is therefore not enough to check that the response object in the current iteration has no docs - we
|
---|
| 244 | // must also verify that NO OTHER set has had any documents. This is done with the 'prevDocSeen' flag.
|
---|
| 245 | // It is set to FALSE initially, but as soon as we see a set that isn't empty, it is set to TRUE. The
|
---|
| 246 | // 'noRecordsMatch' error will only be thrown if, after all appropriate sets have been recursed into,
|
---|
| 247 | // the 'prevDocSeen' flag is still FALSE. The function returns false if no docs were seen, allowing us to
|
---|
| 248 | // throw the noRecordsMatch error.
|
---|
| 249 |
|
---|
| 250 | // bool prevDocSeen = false;
|
---|
| 251 |
|
---|
| 252 | get_children(classifier, collection, "", metadata, false, protocol, response, *this->logout);
|
---|
| 253 |
|
---|
| 254 | if (params["resumptionToken"] != "") {
|
---|
| 255 | // if we're at a resumptionToken
|
---|
[11732] | 256 | if (classifier == resumptionToken->getNode()) {
|
---|
| 257 | startPos = resumptionToken->getPosition();
|
---|
[8182] | 258 | }
|
---|
| 259 | else {
|
---|
[11732] | 260 | text_t fullNode = resumptionToken->getNode();
|
---|
[8182] | 261 | text_t::iterator leafIter = fullNode.begin() + classifier.size();
|
---|
| 262 |
|
---|
| 263 | // if the next character isn't a dot, blow up!
|
---|
| 264 | if (*leafIter != '.') {
|
---|
| 265 | // fatal error;
|
---|
| 266 | exit(1);
|
---|
| 267 | }
|
---|
| 268 |
|
---|
| 269 | // get the first '.' after the current classifier point;
|
---|
| 270 | text_t::iterator separator = findchar(leafIter + 1, fullNode.end(), '.');
|
---|
| 271 |
|
---|
| 272 | // now, create a new subpath
|
---|
| 273 | text_t nextNode = substr(fullNode.begin(), separator);
|
---|
| 274 |
|
---|
| 275 | // seek forward; TODO: improve performance of this
|
---|
[9608] | 276 | for (int c = 0; c < response.numDocs; ++c) {
|
---|
[8182] | 277 | if (response.docInfo[c].OID == nextNode) {
|
---|
| 278 | startPos = c;
|
---|
| 279 | break;
|
---|
| 280 | }
|
---|
| 281 | }
|
---|
| 282 | }
|
---|
[15380] | 283 |
|
---|
| 284 | // We need to subtract one from the startPos value to turn it into an index value
|
---|
| 285 | startPos--;
|
---|
[8182] | 286 | }
|
---|
| 287 |
|
---|
[9608] | 288 | for (int c = startPos; c < response.numDocs; ++c) {
|
---|
[8182] | 289 | text_t child = response.docInfo[c].OID;
|
---|
| 290 |
|
---|
[11311] | 291 | // distinguish classifiers and documents by checking whether OID
|
---|
| 292 | // starts with CL or not
|
---|
[8182] | 293 | text_t childHead;
|
---|
| 294 | text_t::const_iterator start = child.begin();
|
---|
| 295 | text_t::const_iterator here = child.begin();
|
---|
[11311] | 296 | here += 2;
|
---|
[8182] | 297 | childHead = substr(start, here);
|
---|
| 298 |
|
---|
| 299 | // documents we output now
|
---|
[11311] | 300 | if (childHead != "CL") {
|
---|
| 301 | // Check that the item with the 0ID 'gsdl_id' has a lastmodified field that fits
|
---|
[8182] | 302 | // within the required date range (if specified)
|
---|
| 303 | if (this->inDateRange(from, until, collection, child, protocol, output)) {
|
---|
| 304 | // TODO: check that the document can be disseminated in the required metadataPrefix
|
---|
| 305 |
|
---|
| 306 | if (this->output_document(output, protocol, collection, child, metadataPrefix)) {
|
---|
| 307 | this->prevDocSeen = true;
|
---|
[9608] | 308 | ++this->outputDocs;
|
---|
[8182] | 309 | }
|
---|
| 310 | }
|
---|
| 311 | }
|
---|
| 312 | // children which are classifiers are recursed
|
---|
| 313 | else {
|
---|
[11732] | 314 | if (resumptionToken != NULL) {
|
---|
| 315 | int depth = countchar(classifier.begin(), classifier.end(), '.');
|
---|
| 316 | resumptionToken->setOffset(depth, c+2);
|
---|
| 317 | }
|
---|
| 318 | this->recurse_set(output, protocol, collection, child, params, resumptionToken);
|
---|
[8182] | 319 | }
|
---|
| 320 |
|
---|
[11732] | 321 | if (this->outputDocs == this->configuration->resumeAfter()) {
|
---|
[15380] | 322 | this->replyToken = new ResumptionToken(collection, params["set"], "");
|
---|
[11732] | 323 | this->replyToken->setPosition(classifier, c+2);
|
---|
[15380] | 324 | break;
|
---|
[8182] | 325 | }
|
---|
| 326 | }
|
---|
| 327 | }
|
---|
| 328 |
|
---|
| 329 |
|
---|
| 330 |
|
---|
| 331 |
|
---|