#include "abstractlistaction.h"
#include "recptprototools.h"
#include "oaitools.h"

//--------------------------------------------------------------------------------------------------
// Writes the record/identifier content of a list-style OAI response to 'output'.
//
// Dispatches on the "set" argument:
//   - no set:                  every configured collection is listed (output_content_for_all)
//   - set is a collection:     all documents of that collection are listed (output_content_for_col)
//   - set is a classifier:     the classifier is validated and walked recursively (recurse_set)
//
// Returns false (after emitting a "noRecordsMatch" error) when the configured OAI version is
// >= 200 and this->prevDocSeen is false once the listing is done; returns true otherwise.
// If a reply token was prepared and no error type is set, it is appended to the output.
bool abstractlistaction::output_content(ostream &output, recptproto *protocol, oaiargs &params)
{
  ResumptionToken *token = NULL;

  // start the call; clear down the total number of output documents
  this->outputDocs = 0;

  // Parse the incoming resumptionToken (if any); only recurse_set() receives this object,
  // the other listing functions re-parse the argument themselves.
  if (params["resumptionToken"] != "") {
    token = new ResumptionToken(params["resumptionToken"]);
  }
  this->replyToken = NULL;

  // if we've been asked for a set, then use it!
  if (params["set"] != "") {
    text_t gsdlSet = params["set"];
    text_t gsdlCollect = "";

    // given 'demo:CL2', toGSDL returns 'demo' in gsdlCollect and 'CL2' in gsdlSet. If there is
    // no further set specified after the name of the collection, gsdlSet comes back empty.
    oaiclassifier::toGSDL(gsdlCollect, gsdlSet);

    if (gsdlSet == "") {
      // Whole collection requested: list every document via its oai_id.
      ColInfoResponse_t cinfo;
      comerror_t err;
      this->output_content_for_col(output, protocol, gsdlCollect, cinfo, err, params);
    }
    else if (this->check_classifier(protocol, gsdlCollect, gsdlSet)) {
      // A specific subset was requested: traverse the classifier and its sub-classifiers.
      this->recurse_set(output, protocol, gsdlCollect, gsdlSet, params, token);
    }
  }
  // output all records in all hierarchies
  else {
    this->output_content_for_all(output, protocol, params);
  }

  // The parsed request token is only needed while listing; release it before any return path
  // (deleting a null pointer is a no-op).
  delete token;
  token = NULL;

  // If - regardless of set required - no documents have been seen, throw an error.
  if (this->configuration->getOAIVersion() >= 200 && this->prevDocSeen == false) {
    errorType = "noRecordsMatch";
    this->output_error(output, errorType);
    return false;
  }

  // do a resumption token if required; errors cancel a token...
  if (this->replyToken != NULL && this->errorType == "") {
    // The token is wrapped in a <resumptionToken> element. Don't add any whitespace between the
    // tags and the token itself as it can confuse harvesters/validators.
    output << " <resumptionToken>" << this->replyToken->getToken() << "</resumptionToken>" << endl;
  }

  return true;
}

//--------------------------------------------------------------------------------------------------
// Lists the documents of a single collection, addressing them as "oai.<n>".
//
//   gsdlCollect  collection to list
//   cinfo, err   filled in by protocol->get_collectinfo(); cinfo is cleared again once a
//                non-empty collection has been processed so the caller can reuse it
//   params       request arguments ("resumptionToken", "from", "until", "metadataPrefix" are read)
//
// Side effects: sets this->prevDocSeen and increments this->outputDocs for every document that
// output_document() reports as written; prepares this->replyToken when the number of output
// documents equals configuration->resumeAfter().
void abstractlistaction::output_content_for_col(ostream &output, recptproto *protocol, text_t &gsdlCollect,
                                                ColInfoResponse_t &cinfo, comerror_t &err, oaiargs &params)
{
  int startDoc = 0;

  // get the collection information
  protocol->get_collectinfo(gsdlCollect, cinfo, err, *this->logout);

  // check resumption token
  if (params["resumptionToken"] != "") {
    ResumptionToken token(params["resumptionToken"]);
    if (token.getCollection() == gsdlCollect) {
      startDoc = token.getPosition() - 1; // first document is said to be 1..
      if (startDoc < 0) {
        // a position of 0 (or less) would otherwise ask for a negative document number
        startDoc = 0;
      }
    }
  }

  // If numDocs is 0, do nothing - this->prevDocSeen will stay false if this is the only collection
  // looked at, or will keep whatever value it had prior to this collection (so a flag already set
  // to true by a previous collection is never overwritten with false).
  if (cinfo.numDocs > 0) {
    int errorCount = 0; // number of documents in this collection for which no OID could be found

    for (long i = startDoc; i < cinfo.numDocs; ++i) {
      if (errorCount > 3) {
        // Too many failed lookups: give up on this collection. We leave the loop (rather than
        // returning) so that cinfo is still cleared below.
        cerr << "Error: too many records(" << errorCount << ") in the " << gsdlCollect
             << " collection have invalid or non-existant oai_ids - skipping remainder of collection.\n";
        break;
      }

      text_t oai_id = "oai.";
      oai_id += i;

      text_t gsdl_id = oaiclassifier::getGSDL_OID(gsdlCollect, oai_id, protocol, *this->logout);
      if (gsdl_id == "") {
        // An empty result means no document was found for this oai_id.
        ++errorCount;
        continue;
      }

      // Check that the item with the OID 'gsdl_id' fits within the requested date range (if any),
      // and only count it when output_document() reports that it was actually written.
      if (this->inDateRange(params["from"], params["until"], gsdlCollect, gsdl_id, protocol, output)) {
        if (this->output_document(output, protocol, gsdlCollect, gsdl_id, params["metadataPrefix"])) {
          this->prevDocSeen = true;
          ++this->outputDocs;
        }
      }

      // if we've output the number of resumption documents; prepare a resumptionToken
      if (this->outputDocs == this->configuration->resumeAfter()) {
        this->replyToken = new ResumptionToken(gsdlCollect, "", "");
        this->replyToken->setPosition("", i+2);
        break;
      }
    }

    cinfo.clear(); // Clear for next collection to use (if there is one).
  }
}

//--------------------------------------------------------------------------------------------
// Lists the documents of every collection in the configured collection list, in order.
// When a resumptionToken argument is present, collections preceding the one named by the token
// are skipped. Stops as soon as the number of output documents equals
// configuration->resumeAfter(). Whether any record was found is reported through
// this->prevDocSeen (set by output_content_for_col), not through a return value.
void abstractlistaction::output_content_for_all(ostream &output, recptproto *protocol, oaiargs &params)
{
  ColInfoResponse_t cinfo;
  comerror_t err;
  text_tarray collections;
  text_t gsdlCollect = "";
  ResumptionToken *token = NULL;

  // get a list of the collections available
  collections = this->configuration->getCollectionsList();
  // protocol->get_collection_list(collections, err, output);

  if (params["resumptionToken"] != "") {
    token = new ResumptionToken(params["resumptionToken"]);
  }

  for (size_t current_col = 0; current_col < collections.size(); ++current_col) {
    gsdlCollect = collections[current_col];

    // ignore all leading collections before the one that matches the resumptiontoken
    if (token != NULL && token->getCollection() != gsdlCollect) {
      continue;
    }

    this->output_content_for_col(output, protocol, gsdlCollect, cinfo, err, params);

    // once we've output at least one collection, continue outputting all others until the
    // resumption total hits; the token is no longer needed for skipping, so free it here
    delete token;
    token = NULL;

    if (this->outputDocs == this->configuration->resumeAfter()) {
      break;
    }
  }

  // Covers the case where no collection matched the token (deleting NULL is a no-op).
  delete token;
}

//-------------------------------------------------------------------------------------------------
// Check that the requested from/until dates don't include a time, as this would be asking for too
// fine a level of granularity, one that greenstone doesn't support. An OAI error must be thrown.
// (Unfinished stub, kept commented out.)
/*
bool abstractlistaction::granularityTooFine(text_t &from, text_t &until)
{
  if (from != "" && from.){

  }
}
*/

//-------------------------------------------------------------------------------------------------
// Returns true when 'classifier' can be looked up in 'collection' and its top-level classifier
// (the part of the name before the first '.') carries the metadata value
// "supportsmemberof" == "true". Returns false in every other case.
bool abstractlistaction::check_classifier(recptproto *protocol, const text_t &collection, const text_t &classifier)
{
  text_t topClass;
  FilterResponse_t response;
  text_tset metadata;
  ofstream logout("oai.log", ios::app);

  // exclude false children of a top-level classifier immediately...
  if (!get_info(classifier, collection, "", metadata, false, protocol, response, logout)) {
    return false;
  }

  // now check the top-level parent
  metadata.insert("supportsmemberof");
  text_t::const_iterator dot = findchar(classifier.begin(), classifier.end(), '.');
  if (dot != classifier.end()) {
    topClass = substr(classifier.begin(), dot);
  }
  else {
    topClass = classifier;
  }

  if (!get_info(topClass, collection, "", metadata, false, protocol, response, logout)) {
    return false;
  }

  // Guard the docInfo[0] accesses below against an empty response.
  if (response.docInfo.size() == 0) {
    return false;
  }

  if (response.docInfo[0].metadata["supportsmemberof"].values.size() == 0) {
    return false;
  }

  if (response.docInfo[0].metadata["supportsmemberof"].values[0] != "true") {
    return false;
  }

  return true;
}

//-------------------------------------------------------------------------------------------------
// Recursively lists the documents below 'classifier' in 'collection'.
//
// Children whose OID starts with "CL" are treated as classifiers and recursed into; all other
// children are treated as documents and written with output_document() when they fall inside the
// requested from/until range.
//
// This is a recursive function, so an empty set at one level does not by itself mean that no
// records match: another set (parent/sibling/child) may have produced documents. That is tracked
// with the member flag this->prevDocSeen, which is set to true as soon as one document has been
// written; the caller inspects it after the traversal to decide on a 'noRecordsMatch' error.
//
//   params           request arguments ("from", "until", "metadataPrefix", "resumptionToken",
//                    "set" are read)
//   resumptionToken  parsed request token, or NULL; used to find the child to resume from
//
// NOTE(review): as the recursion unwinds, every level that still sees
// outputDocs == resumeAfter() replaces this->replyToken with a new object; the previous object is
// not released here because ownership of replyToken is not visible in this file - confirm.
void abstractlistaction::recurse_set(ostream &output, recptproto *protocol, const text_t &collection,
                                     const text_t &classifier, oaiargs &params,
                                     ResumptionToken *resumptionToken)
{
  // metadata for this call
  FilterResponse_t response;
  text_tset metadata;
  text_t from = params["from"];
  text_t until = params["until"];
  text_t metadataPrefix = params["metadataPrefix"];
  int startPos = 0;

  get_children(classifier, collection, "", metadata, false, protocol, response, *this->logout);

  // if we're at a resumptionToken, work out which child to start from
  if (params["resumptionToken"] != "" && resumptionToken != NULL) {
    if (classifier == resumptionToken->getNode()) {
      startPos = resumptionToken->getPosition();
    }
    else {
      text_t fullNode = resumptionToken->getNode();

      // The token's node must extend the current classifier with ".<something>". If it is not
      // even longer than the classifier there is no "next character" to inspect, which is the
      // same fatal condition as that character not being a dot.
      if (fullNode.size() <= classifier.size()) {
        exit(1);
      }

      text_t::iterator leafIter = fullNode.begin() + classifier.size();

      // if the next character isn't a dot, blow up!
      if (*leafIter != '.') {
        // fatal error;
        exit(1);
      }

      // get the first '.' after the current classifier point;
      text_t::iterator separator = findchar(leafIter + 1, fullNode.end(), '.');

      // now, create a new subpath
      text_t nextNode = substr(fullNode.begin(), separator);

      // seek forward; TODO: improve performance of this
      for (int c = 0; c < response.numDocs; ++c) {
        if (response.docInfo[c].OID == nextNode) {
          startPos = c;
          break;
        }
      }
    }

    // We need to subtract one from the startPos value to turn it into an index value
    startPos--;
    if (startPos < 0) {
      // never index before the first child
      startPos = 0;
    }
  }

  for (int c = startPos; c < response.numDocs; ++c) {
    text_t child = response.docInfo[c].OID;

    // distinguish classifiers and documents by checking whether OID starts with CL or not;
    // an OID shorter than two characters cannot start with "CL" and is left with an empty head
    text_t childHead;
    if (child.size() >= 2) {
      text_t::const_iterator start = child.begin();
      text_t::const_iterator here = child.begin();
      here += 2;
      childHead = substr(start, here);
    }

    // documents we output now
    if (childHead != "CL") {
      // Check that the item with the OID 'child' fits within the required date range
      // (if specified)
      if (this->inDateRange(from, until, collection, child, protocol, output)) {
        // TODO: check that the document can be disseminated in the required metadataPrefix
        if (this->output_document(output, protocol, collection, child, metadataPrefix)) {
          this->prevDocSeen = true;
          ++this->outputDocs;
        }
      }
    }
    // children which are classifiers are recursed
    else {
      if (resumptionToken != NULL) {
        int depth = countchar(classifier.begin(), classifier.end(), '.');
        resumptionToken->setOffset(depth, c+2);
      }

      this->recurse_set(output, protocol, collection, child, params, resumptionToken);
    }

    // if we've output the number of resumption documents; prepare a resumptionToken
    if (this->outputDocs == this->configuration->resumeAfter()) {
      this->replyToken = new ResumptionToken(collection, params["set"], "");
      this->replyToken->setPosition(classifier, c+2);
      break;
    }
  }
}