- Timestamp:
- 2009-09-11T11:54:17+12:00 (15 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/runtime-src/src/oaiservr/abstractlistaction.cpp
r16835 r20590 1 1 #include "abstractlistaction.h" 2 #include "OIDtools.h" 2 3 #include "recptprototools.h" 3 4 4 5 #include "oaitools.h" 5 6 6 //-------------------------------------------------------------------------------------------------- 7 8 bool abstractlistaction::check_classifier(recptproto *protocol, const text_t &collection, const text_t &set_name) 9 { 10 text_tset metadata; 11 FilterResponse_t response; 12 return get_info(set_name, collection, "", metadata, false, protocol, response, *this->logout); 13 } 14 7 15 8 16 bool abstractlistaction::output_content(ostream &output, recptproto *protocol, oaiargs &params) 9 17 { 10 bool prevDocSeen; 11 ResumptionToken *token = NULL; 12 13 // start the call; clear down the total number of output documents 14 this->outputDocs = 0; 15 16 // We don't actually handle resumptionTokens yet; if we get one, ignore it 17 if (params["resumptionToken"] != "") { 18 token = new ResumptionToken(params["resumptionToken"]); 19 } 20 21 this->replyToken = NULL; 22 23 // if we've been asked for a set, then use it! 24 if (params["set"] != "") { 25 // get the children of this set 26 text_t gsdlSet = params["set"]; 27 text_t gsdlCollect = ""; 28 29 // given 'demo:CL2', toGSDL returns 'demo' in gsdlCollect and 'CL2' in gsdlSet. If there is no further 30 // set specified after the name of the collection however, then gsdlSet is empty. 31 oaiclassifier::toGSDL(gsdlCollect, gsdlSet); 32 33 // If gsdlSet is empty, then the user is requesting all the identifiers for the collection, so 34 // we simply output all docs via their oai_id tag. But if a specific subset IS requested, then 35 // use recurse_set() to traverse any sub classifiers to find the relevant docs. 
36 if(gsdlSet == ""){ 37 ColInfoResponse_t cinfo; 38 comerror_t err; 39 this->output_content_for_col(output, protocol, gsdlCollect, cinfo, err, params); 40 } 41 else { 42 if (this->check_classifier(protocol, gsdlCollect, gsdlSet)) { 43 this->recurse_set(output, protocol, gsdlCollect, gsdlSet, params, token); 44 } 45 } 46 } 47 // output all records in all hierarchies 48 else { 49 this->output_content_for_all(output, protocol, params); 50 } 51 52 // If - regardless of set required - no documents have been seen, throw an error. 53 if (this->configuration->getOAIVersion() >= 200 && this->prevDocSeen == false) { 18 // Reset variables 19 this->output_docs = 0; 20 21 text_t set_name = params["set"]; 22 text_t position = params["position"]; 23 24 // Process the resumptionToken if there is one 25 if (params["resumptionToken"] != "") 26 { 27 ResumptionToken resumption_token(params["resumptionToken"]); 28 set_name = resumption_token.getSet(); 29 position = resumption_token.getPosition(); 30 } 31 32 // Case for "set" argument present -- output just the records in the specified set 33 if (set_name != "") 34 { 35 // Separate the collection name and Greenstone classifier OID from the set name 36 text_t collection_name = ""; 37 text_t gsdl_classifier_OID = set_name; 38 oaiclassifier::toGSDL(collection_name, gsdl_classifier_OID); 39 40 // If output_content_for_set() returns false a resumption token has been output, so it's time to stop 41 if (output_content_for_set(output, protocol, params, collection_name, gsdl_classifier_OID, set_name) == false) 42 { 43 return true; 44 } 45 } 46 47 // Case for no "set" argument present -- output all records in all collections 48 else 49 { 50 // Get a list of the collections available 51 text_tarray& collections = this->configuration->getCollectionsList(); 52 if (collections.size() == 0) 53 { 54 return false; 55 } 56 57 // Get the current collection from the position value 58 text_t collection_name = ""; 59 oaiclassifier::toGSDL(collection_name, 
position); 60 61 // Find the starting collection 62 text_tarray::iterator collection_iterator = collections.begin(); 63 while (collection_iterator != collections.end()) 64 { 65 if (collection_name == "" || collection_name == *collection_iterator) 66 { 67 break; 68 } 69 70 collection_iterator++; 71 } 72 73 // Now loop through the remaining collections 74 while (collection_iterator != collections.end()) 75 { 76 // If output_content_for_set() returns false a resumption token has been output, so it's time to stop 77 if (output_content_for_set(output, protocol, params, *collection_iterator, "", "") == false) 78 { 79 return true; 80 } 81 82 collection_iterator++; 83 } 84 } 85 86 // If no records were output throw an error 87 if (this->configuration->getOAIVersion() >= 200 && this->output_docs == 0) 88 { 54 89 errorType = "noRecordsMatch"; 55 90 this->output_error(output, errorType); 56 57 return false; 58 } 59 60 // do a resumption token if required; errors cancel a token... 61 if (this->replyToken != NULL && this->errorType == "") { 62 // Don't add any whitespace around the resumption token as it can confuse harvesters/validators 63 output << " <resumptionToken>" << this->replyToken->getToken() << "</resumptionToken>" << endl; 91 return false; 64 92 } 65 93 … … 67 95 } 68 96 69 //-------------------------------------------------------------------------------------------------- 70 71 void abstractlistaction::output_content_for_col(ostream &output, recptproto *protocol, text_t &gsdlCollect, 72 ColInfoResponse_t &cinfo, comerror_t &err, oaiargs &params) 73 { int startDoc = 0; 74 75 text_t metadataPrefix = params["metadataPrefix"]; 76 77 // check resumption token 78 if (params["resumptionToken"] != "") { 79 ResumptionToken token(params["resumptionToken"]); 80 if (token.getCollection() == gsdlCollect) { 81 startDoc = token.getPosition() - 1; // first document is said to be 1.. 
82 metadataPrefix = "oai_dc"; // TO DO: This should come from the resumption token 83 } 84 } 85 86 // Get the OAI nodes from the info db file 87 text_t oai_root_node = "oai"; 97 98 bool abstractlistaction::output_content_for_set(ostream &output, recptproto *protocol, oaiargs &params, text_t collection_name, text_t gsdl_classifier_OID, text_t set_name) 99 { 100 // Check if the set is actually a collection 101 if (gsdl_classifier_OID == "") 102 { 103 gsdl_classifier_OID = "oai"; 104 } 105 106 text_t metadata_prefix = params["metadataPrefix"]; 107 text_t from = params["from"]; 108 text_t until = params["until"]; 109 text_t position = ""; 110 111 // Process the resumptionToken if there is one 112 if (params["resumptionToken"] != "") 113 { 114 ResumptionToken resumption_token(params["resumptionToken"]); 115 metadata_prefix = resumption_token.getMetadataPrefix(); 116 from = resumption_token.getFrom(); 117 until = resumption_token.getUntil(); 118 position = resumption_token.getPosition(); 119 } 120 121 // Get the list of identifiers in this collection 122 // Collections should not contain too many identifiers otherwise this will use a lot of time and memory 88 123 text_tset metadata; // Must be empty for efficiency 124 FilterResponse_t identifiers_response; 125 get_children(gsdl_classifier_OID, collection_name, "", metadata, false, protocol, identifiers_response, *this->logout); 126 127 // Find the starting position, if necessary 128 ResultDocInfo_tarray::iterator identifier_iterator = identifiers_response.docInfo.begin(); 129 if (output_docs == 0) 130 { 131 while (identifier_iterator != identifiers_response.docInfo.end()) 132 { 133 if (position == "" || position == (collection_name + ":" + (*identifier_iterator).OID)) 134 { 135 break; 136 } 137 138 identifier_iterator++; 139 } 140 } 141 142 // Now loop through displaying the next matching records 143 while (identifier_iterator != identifiers_response.docInfo.end()) 144 { 145 position = (*identifier_iterator).OID; 146 147 
text_t document_OID = position; 148 if (starts_with(document_OID, "oai.")) 149 { 150 document_OID = oaiclassifier::getGSDL_OID(collection_name, document_OID, protocol, *this->logout); 151 } 152 153 // Check this OID is in the (optional) date range specified 154 if (this->in_date_range(output, protocol, params, collection_name, document_OID, from, until)) 155 { 156 // If we've output the desired number of records return a resumptionToken and we're done 157 if (this->output_docs == this->configuration->resumeAfter()) 158 { 159 // Get the buildDate from the build.cfg file 160 ColInfoResponse_t cinfo; 161 comerror_t err; 162 protocol->get_collectinfo(collection_name, cinfo, err, cerr); 163 164 ResumptionToken resumption_token(cinfo.buildDate, set_name, metadata_prefix, from, until, collection_name + ":" + position); 165 166 // Don't add any whitespace around the resumption token as it can confuse harvesters/validators 167 output << " <resumptionToken>" << resumption_token.getResumptionTokenString() << "</resumptionToken>" << endl; 168 return false; 169 } 170 171 // Otherwise output this record and increment the count 172 this->output_document(output, protocol, collection_name, document_OID, metadata_prefix); 173 this->output_docs++; 174 } 175 176 identifier_iterator++; 177 } 178 179 return true; 180 } 181 182 183 bool abstractlistaction::in_date_range(ostream &output, recptproto *protocol, oaiargs &params, 184 text_t& collection, text_t oai_OID, text_t from, text_t until) 185 { 186 // If no "from" or "until" value is specified every record matches, so we don't need to go any further 187 if (from == "" && until == "") 188 { 189 return true; 190 } 191 192 // Get the datestamp from the document as sections do not have this metadata 193 text_t document_OID; 194 get_top(oai_OID, document_OID); 195 196 // Request the lastmodified value for this document 197 text_tset metadata; 198 metadata.insert("lastmodified"); 89 199 FilterResponse_t response; 90 get_children(oai_root_node, 
gsdlCollect, "", metadata, false, protocol, response, *this->logout); 91 92 // If numDocs is 0, do nothing - this->prevDocSeen will stay false if this is the only collection 93 // looked at, or will keep whatever value it had prior to this col (ensures that if the flag has 94 // been set to true by a previous collection that this won't overwrite it to be false). 95 if (response.docInfo.size() > 0) { 96 int errorCount = 0; // Count the number of errors found in the given collection 97 98 for (long i = startDoc; i < response.docInfo.size(); ++i) { 99 if (errorCount > 3) { // If num errors reaches the cut-off value, bail. 100 cerr << "Error: too many records(" << errorCount << ") in the " << gsdlCollect 101 << " collection have invalid or non-existant oai_ids - skipping remainder of collection.\n"; 102 return; 103 } 104 105 text_t oai_id = "oai."; 106 oai_id += i; 107 108 text_t gsdl_id = oaiclassifier::getGSDL_OID(gsdlCollect, oai_id, protocol, *this->logout); 109 110 if (gsdl_id == "") { // If the string is empty, then the document didn't have an oai_id, so 111 ++errorCount; // increase error count 112 continue; 113 } 114 115 116 // Check that the item with the 0ID 'gsdl_id' has a lastmodified field that fits within 117 // the required date range (if specified). 118 if (this->inDateRange(params["from"], params["until"], gsdlCollect, gsdl_id, protocol, output)) { 119 if (this->output_document(output, protocol, gsdlCollect, gsdl_id, metadataPrefix)) { 120 // this should be an IF statement, where prevDocSeen is only set to true if the above 121 // function call returns true (indicating that the doc supported the metadata prefix) but 122 // for some reason this is always false. This means that if no doc in the requested set supports 123 // the metadata format, the "no records match" error that should be thrown won't be... 
124 // 125 // GRB: the above comment is no longer true; proper checks are made 126 this->prevDocSeen = true; 127 ++this->outputDocs; 128 } 129 } 130 131 // if we've output the number of resumption documents; prepare a resumptionToken 132 if (this->outputDocs == this->configuration->resumeAfter()) { 133 this->replyToken = new ResumptionToken(gsdlCollect, "", ""); 134 this->replyToken->setPosition("", i+2); 135 break; 136 } 137 } 138 139 cinfo.clear(); // Clear for next collection to use (if there is one). 140 } 141 } 142 143 //-------------------------------------------------------------------------------------------- 144 // Returns true if at least one document record is found 145 void abstractlistaction::output_content_for_all(ostream &output, recptproto *protocol, oaiargs &params) 146 { 147 ColInfoResponse_t cinfo; 148 comerror_t err; 149 text_tarray collections; 150 text_t gsdlCollect = ""; 151 ResumptionToken *token = NULL; 152 153 // get a list of the collections available 154 collections = this->configuration->getCollectionsList(); 155 // protocol->get_collection_list(collections, err, output); 156 157 if (params["resumptionToken"] != "") { 158 token = new ResumptionToken(params["resumptionToken"]); 159 } 160 161 for(int current_col = 0; current_col < collections.size(); ++current_col){ 162 gsdlCollect = collections[current_col]; 163 164 // ignore all leading collections before the one that matches the resumptiontoken 165 if (token != NULL && 166 token->getCollection() != gsdlCollect) 167 { continue; 168 } 169 170 this->output_content_for_col(output, protocol, gsdlCollect, cinfo, err, params); 171 172 // once we've output at least one collection, continue 173 // outputting all others until the resumption total hits 174 token = NULL; 175 176 if (this->outputDocs == this->configuration->resumeAfter()) { 177 break; 178 } 179 } 180 } 181 182 //------------------------------------------------------------------------------------------------- 183 // Check that the 
requested from/until dates don't include a time, as this would be asking for too 184 // fine a level of granularity, one that greenstone doesn't support. An OAI error must be thrown. 185 /* 186 bool abstractlistaction::granularityTooFine(text_t &from, text_t &until) 187 { 188 if (from != "" && from.){ 189 190 } 191 192 } 193 */ 194 //------------------------------------------------------------------------------------------------- 195 196 bool abstractlistaction::check_classifier(recptproto *protocol, const text_t &collection, 197 const text_t &classifier) 198 { text_t topClass; 199 FilterResponse_t response; 200 text_tset metadata; 201 ofstream logout("oai.log", ios::app); 202 203 // exclude false children of a top-level classifier immediately... 204 if (!get_info(classifier, collection, "", metadata, false, protocol, response, logout)) { 205 return false; 206 } 207 208 // now check the top-level parent 209 metadata.insert("supportsmemberof"); 210 211 text_t::const_iterator dot = findchar(classifier.begin(), classifier.end(), '.'); 212 if (dot != classifier.end()) { 213 topClass = substr(classifier.begin(), dot); 214 } 215 else { 216 topClass = classifier; 217 } 218 219 if (!get_info(topClass, collection, "", metadata, false, protocol, response, logout)) { 220 return false; 221 } 222 223 if (response.docInfo[0].metadata["supportsmemberof"].values.size() == 0) { 224 return false; 225 } 226 227 if (response.docInfo[0].metadata["supportsmemberof"].values[0] != "true") { 228 return false; 229 } 230 200 if (!get_info(document_OID, collection, "", metadata, false, protocol, response, *this->logout)) 201 { 202 return false; 203 } 204 205 text_t last_modified_date; 206 this->getLastModifiedDate(response.docInfo[0], last_modified_date); 207 208 // Check this record is not before the "from" value, if it exists 209 if (from != "" && last_modified_date < from) 210 { 211 // Too early 212 return false; 213 } 214 215 // Check this record is not after the "until" value, if it 
exists 216 if (until != "" && last_modified_date > until) 217 { 218 // Too late 219 return false; 220 } 221 222 // Just right 231 223 return true; 232 224 } 233 234 void abstractlistaction::recurse_set(ostream &output, recptproto *protocol, const text_t &collection,235 const text_t &classifier, oaiargs &params, ResumptionToken *resumptionToken)236 {237 // metadata for this call238 FilterResponse_t response;239 text_tset metadata;240 ofstream logout("oai.log", ios::app);241 text_t from = params["from"];242 text_t until = params["until"];243 text_t metadataPrefix = params["metadataPrefix"];244 // ResumptionToken resumptionToken(params["resumptionToken"]);245 int startPos = 0;246 247 // This is a recursive function, and so just because the current set is empty doesn't mean we necessarily248 // want to throw a 'noRecordsMatch' error; another set (parent/sibling/child) may have had documents. It249 // is therefore not enough to check that the response object in the current iteration has no docs - we250 // must also verify that NO OTHER set has had any documents. This is done with the 'prevDocSeen' flag.251 // It is set to FALSE initially, but as soon as we see a set that isn't empty, it is set to TRUE. The252 // 'noRecordsMatch' error will only be thrown if, after all appropriate sets have been recursed into,253 // the 'prevDocSeen' flag is still FALSE. 
The function returns false if no docs were seen, allowing us to254 // throw the noRecordsMatch error.255 256 // bool prevDocSeen = false;257 258 get_children(classifier, collection, "", metadata, false, protocol, response, *this->logout);259 260 if (params["resumptionToken"] != "") {261 // if we're at a resumptionToken262 if (classifier == resumptionToken->getNode()) {263 startPos = resumptionToken->getPosition();264 }265 else {266 text_t fullNode = resumptionToken->getNode();267 text_t::iterator leafIter = fullNode.begin() + classifier.size();268 269 // if the next character isn't a dot, blow up!270 if (*leafIter != '.') {271 // fatal error;272 exit(1);273 }274 275 // get the first '.' after the current classifier point;276 text_t::iterator separator = findchar(leafIter + 1, fullNode.end(), '.');277 278 // now, create a new subpath279 text_t nextNode = substr(fullNode.begin(), separator);280 281 // seek forward; TODO: improve performance of this282 for (int c = 0; c < response.numDocs; ++c) {283 if (response.docInfo[c].OID == nextNode) {284 startPos = c;285 break;286 }287 }288 }289 290 // We need to subtract one from the startPos value to turn it into an index value291 startPos--;292 }293 294 for (int c = startPos; c < response.numDocs; ++c) {295 text_t child = response.docInfo[c].OID;296 297 // distinguish classifiers and documents by checking whether OID298 // starts with CL or not299 text_t childHead;300 text_t::const_iterator start = child.begin();301 text_t::const_iterator here = child.begin();302 here += 2;303 childHead = substr(start, here);304 305 // documents we output now306 if (childHead != "CL") {307 // Check that the item with the 0ID 'gsdl_id' has a lastmodified field that fits308 // within the required date range (if specified)309 if (this->inDateRange(from, until, collection, child, protocol, output)) {310 // TODO: check that the document can be disseminated in the required metadataPrefix311 312 if (this->output_document(output, protocol, 
collection, child, metadataPrefix)) {313 this->prevDocSeen = true;314 ++this->outputDocs;315 }316 }317 }318 // children which are classifiers are recursed319 else {320 if (resumptionToken != NULL) {321 int depth = countchar(classifier.begin(), classifier.end(), '.');322 resumptionToken->setOffset(depth, c+2);323 }324 this->recurse_set(output, protocol, collection, child, params, resumptionToken);325 }326 327 if (this->outputDocs == this->configuration->resumeAfter()) {328 this->replyToken = new ResumptionToken(collection, params["set"], "");329 this->replyToken->setPosition(classifier, c+2);330 break;331 }332 }333 }334 335 336 337
Note:
See TracChangeset
for help on using the changeset viewer.