Ignore:
Timestamp:
2009-09-11T11:54:17+12:00 (15 years ago)
Author:
mdewsnip
Message:

Completely rewrote the resumption token support, as its bugginess finally tipped the "I can't stand it any more" scale...

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/runtime-src/src/oaiservr/abstractlistaction.cpp

    r16835 r20590  
    11#include "abstractlistaction.h"
     2#include "OIDtools.h"
    23#include "recptprototools.h"
    34
    45#include "oaitools.h"
    56
    6 //--------------------------------------------------------------------------------------------------
     7
     8bool abstractlistaction::check_classifier(recptproto *protocol, const text_t &collection, const text_t &set_name)
     9{
     10  text_tset metadata;
     11  FilterResponse_t response;
     12  return get_info(set_name, collection, "", metadata, false, protocol, response, *this->logout);
     13}
     14
    715
    816bool abstractlistaction::output_content(ostream &output, recptproto *protocol, oaiargs &params)
    917{
    10   bool   prevDocSeen;
    11   ResumptionToken *token = NULL;
    12 
    13   // start the call; clear down the total number of output documents
    14   this->outputDocs = 0;
    15 
    16   // We don't actually handle resumptionTokens yet; if we get one, ignore it
    17   if (params["resumptionToken"] != "") {
    18     token = new ResumptionToken(params["resumptionToken"]);
    19   }
    20 
    21   this->replyToken = NULL;
    22 
    23   // if we've been asked for a set, then use it!
    24   if (params["set"] != "") {
    25     // get the children of this set
    26     text_t gsdlSet = params["set"];
    27     text_t gsdlCollect = "";
    28 
    29     // given 'demo:CL2', toGSDL returns 'demo' in gsdlCollect and 'CL2' in gsdlSet. If there is no further
    30     // set specified after the name of the collection however, then gsdlSet is empty.
    31     oaiclassifier::toGSDL(gsdlCollect, gsdlSet);
    32 
    33     // If gsdlSet is empty, then the user is requesting all the identifiers for the collection, so
    34     // we simply output all docs via their oai_id tag. But if a specific subset IS requested, then
    35     // use recurse_set() to traverse any sub classifiers to find the relevant docs.
    36     if(gsdlSet == ""){
    37       ColInfoResponse_t cinfo;
    38       comerror_t err;
    39       this->output_content_for_col(output, protocol, gsdlCollect, cinfo, err, params);
    40     }
    41     else {
    42       if (this->check_classifier(protocol, gsdlCollect, gsdlSet)) {
    43     this->recurse_set(output, protocol, gsdlCollect, gsdlSet, params, token);
    44       }
    45     }
    46   }
    47   // output all records in all hierarchies
    48   else {
    49     this->output_content_for_all(output, protocol, params);
    50   }
    51 
    52   // If - regardless of set required - no documents have been seen, throw an error.
    53   if (this->configuration->getOAIVersion() >= 200 && this->prevDocSeen == false) {
     18  // Reset variables
     19  this->output_docs = 0;
     20
     21  text_t set_name = params["set"];
     22  text_t position = params["position"];
     23
     24  // Process the resumptionToken if there is one
     25  if (params["resumptionToken"] != "")
     26  {
     27    ResumptionToken resumption_token(params["resumptionToken"]);
     28    set_name = resumption_token.getSet();
     29    position = resumption_token.getPosition();
     30  }
     31
     32  // Case for "set" argument present -- output just the records in the specified set
     33  if (set_name != "")
     34  {
     35    // Separate the collection name and Greenstone classifier OID from the set name
     36    text_t collection_name = "";
     37    text_t gsdl_classifier_OID = set_name;
     38    oaiclassifier::toGSDL(collection_name, gsdl_classifier_OID);
     39
     40    // If output_content_for_set() returns false a resumption token has been output, so it's time to stop
     41    if (output_content_for_set(output, protocol, params, collection_name, gsdl_classifier_OID, set_name) == false)
     42    {
     43      return true;
     44    }
     45  }
     46
     47  // Case for no "set" argument present -- output all records in all collections
     48  else
     49  {
     50    // Get a list of the collections available
     51    text_tarray& collections = this->configuration->getCollectionsList();
     52    if (collections.size() == 0)
     53    {
     54      return false;
     55    }
     56
     57    // Get the current collection from the position value
     58    text_t collection_name = "";
     59    oaiclassifier::toGSDL(collection_name, position);
     60
     61    // Find the starting collection
     62    text_tarray::iterator collection_iterator = collections.begin();
     63    while (collection_iterator != collections.end())
     64    {
     65      if (collection_name == "" || collection_name == *collection_iterator)
     66      {
     67    break;
     68      }
     69
     70      collection_iterator++;
     71    }
     72
     73    // Now loop through the remaining collections
     74    while (collection_iterator != collections.end())
     75    {
     76      // If output_content_for_set() returns false a resumption token has been output, so it's time to stop
     77      if (output_content_for_set(output, protocol, params, *collection_iterator, "", "") == false)
     78      {
     79    return true;
     80      }
     81
     82      collection_iterator++;
     83    }
     84  }
     85
     86  // If no records were output throw an error
     87  if (this->configuration->getOAIVersion() >= 200 && this->output_docs == 0)
     88  {
    5489    errorType = "noRecordsMatch";
    5590    this->output_error(output, errorType);
    56 
    57     return false;
    58   }
    59 
    60   // do a resumption token if required; errors cancel a token...
    61   if (this->replyToken != NULL && this->errorType == "") {
    62     // Don't add any whitespace around the resumption token as it can confuse harvesters/validators
    63     output << "  <resumptionToken>" << this->replyToken->getToken() << "</resumptionToken>" << endl;
     91    return false;
    6492  }
    6593
     
    6795}
    6896
    69 //--------------------------------------------------------------------------------------------------
    70 
    71 void abstractlistaction::output_content_for_col(ostream &output, recptproto *protocol, text_t &gsdlCollect,
    72                         ColInfoResponse_t &cinfo, comerror_t &err, oaiargs &params)
    73 { int startDoc = 0;
    74 
    75   text_t metadataPrefix = params["metadataPrefix"];
    76 
    77   // check resumption token
    78   if (params["resumptionToken"] != "") {
    79     ResumptionToken token(params["resumptionToken"]);
    80     if (token.getCollection() == gsdlCollect) {
    81       startDoc = token.getPosition() - 1; // first document is said to be 1..
    82       metadataPrefix = "oai_dc";  // TO DO: This should come from the resumption token
    83     }
    84   }
    85 
    86   // Get the OAI nodes from the info db file
    87   text_t oai_root_node = "oai";
     97
     98bool abstractlistaction::output_content_for_set(ostream &output, recptproto *protocol, oaiargs &params, text_t collection_name, text_t gsdl_classifier_OID, text_t set_name)
     99{
     100  // Check if the set is actually a collection
     101  if (gsdl_classifier_OID == "")
     102  {
     103    gsdl_classifier_OID = "oai";
     104  }
     105
     106  text_t metadata_prefix = params["metadataPrefix"];
     107  text_t from = params["from"];
     108  text_t until = params["until"];
     109  text_t position = "";
     110
     111  // Process the resumptionToken if there is one
     112  if (params["resumptionToken"] != "")
     113  {
     114    ResumptionToken resumption_token(params["resumptionToken"]);
     115    metadata_prefix = resumption_token.getMetadataPrefix();
     116    from = resumption_token.getFrom();
     117    until = resumption_token.getUntil();
     118    position = resumption_token.getPosition();
     119  }
     120
     121  // Get the list of identifiers in this collection
     122  // Collections should not contain too many identifiers otherwise this will use a lot of time and memory
    88123  text_tset metadata;  // Must be empty for efficiency
     124  FilterResponse_t identifiers_response;
     125  get_children(gsdl_classifier_OID, collection_name, "", metadata, false, protocol, identifiers_response, *this->logout);
     126
     127  // Find the starting position, if necessary
     128  ResultDocInfo_tarray::iterator identifier_iterator = identifiers_response.docInfo.begin();
     129  if (output_docs == 0)
     130  {
     131    while (identifier_iterator != identifiers_response.docInfo.end())
     132    {
     133      if (position == "" || position == (collection_name + ":" + (*identifier_iterator).OID))
     134      {
     135    break;
     136      }
     137
     138      identifier_iterator++;
     139    }
     140  }
     141
     142  // Now loop through displaying the next matching records
     143  while (identifier_iterator != identifiers_response.docInfo.end())
     144  {
     145    position = (*identifier_iterator).OID;
     146
     147    text_t document_OID = position;
     148    if (starts_with(document_OID, "oai."))
     149    {
     150      document_OID = oaiclassifier::getGSDL_OID(collection_name, document_OID, protocol, *this->logout);
     151    }
     152
     153    // Check this OID is in the (optional) date range specified
     154    if (this->in_date_range(output, protocol, params, collection_name, document_OID, from, until))
     155    {
     156      // If we've output the desired number of records return a resumptionToken and we're done
     157      if (this->output_docs == this->configuration->resumeAfter())
     158      {
     159    // Get the buildDate from the build.cfg file
     160    ColInfoResponse_t cinfo;
     161    comerror_t err;
     162    protocol->get_collectinfo(collection_name, cinfo, err, cerr);
     163
     164    ResumptionToken resumption_token(cinfo.buildDate, set_name, metadata_prefix, from, until, collection_name + ":" + position);
     165
     166    // Don't add any whitespace around the resumption token as it can confuse harvesters/validators
     167    output << "  <resumptionToken>" << resumption_token.getResumptionTokenString() << "</resumptionToken>" << endl;
     168    return false;
     169      }
     170
     171      // Otherwise output this record and increment the count
     172      this->output_document(output, protocol, collection_name, document_OID, metadata_prefix);
     173      this->output_docs++;
     174    }
     175
     176    identifier_iterator++;
     177  }
     178
     179  return true;
     180}
     181
     182
     183bool abstractlistaction::in_date_range(ostream &output, recptproto *protocol, oaiargs &params,
     184                       text_t& collection, text_t oai_OID, text_t from, text_t until)
     185{
     186  // If no "from" or "until" value is specified every record matches, so we don't need to go any further
     187  if (from == "" && until == "")
     188  {
     189    return true;
     190  }
     191
     192  // Get the datestamp from the document as sections do not have this metadata
     193  text_t document_OID;
     194  get_top(oai_OID, document_OID);
     195
     196  // Request the lastmodified value for this document
     197  text_tset metadata;
     198  metadata.insert("lastmodified");
    89199  FilterResponse_t response;
    90   get_children(oai_root_node, gsdlCollect, "", metadata, false, protocol, response, *this->logout);
    91  
    92   // If numDocs is 0, do nothing - this->prevDocSeen will stay false if this is the only collection
    93   // looked at, or will keep whatever value it had prior to this col (ensures that if the flag has
    94   // been set to true by a previous collection that this won't overwrite it to be false).
    95   if (response.docInfo.size() > 0) {
    96     int errorCount      = 0; // Count the number of errors found in the given collection
    97 
    98     for (long i = startDoc; i < response.docInfo.size(); ++i) {
    99       if (errorCount > 3) { // If num errors reaches the cut-off value, bail.
    100     cerr << "Error: too many records(" << errorCount << ") in the " << gsdlCollect
    101          << " collection have invalid or non-existant oai_ids - skipping remainder of collection.\n";
    102     return;
    103       }
    104      
    105       text_t oai_id = "oai.";
    106       oai_id += i;
    107      
    108       text_t gsdl_id = oaiclassifier::getGSDL_OID(gsdlCollect, oai_id, protocol, *this->logout);
    109      
    110       if (gsdl_id == "") { // If the string is empty, then the document didn't have an oai_id, so
    111     ++errorCount;     // increase error count
    112     continue;
    113       }
    114 
    115 
    116       // Check that the item with the 0ID 'gsdl_id' has a lastmodified field that fits within
    117       // the required date range (if specified).
    118       if (this->inDateRange(params["from"], params["until"], gsdlCollect, gsdl_id, protocol, output)) {
    119     if (this->output_document(output, protocol, gsdlCollect, gsdl_id, metadataPrefix)) {
    120       // this should be an IF statement, where prevDocSeen is only set to true if the above
    121       // function call returns true (indicating that the doc supported the metadata prefix) but
    122       // for some reason this is always false. This means that if no doc in the requested set supports
    123       // the metadata format, the "no records match" error that should be thrown won't be...
    124       //
    125       // GRB: the above comment is no longer true; proper checks are made
    126       this->prevDocSeen = true;
    127       ++this->outputDocs;
    128     }
    129       }
    130 
    131       // if we've output the number of resumption documents; prepare a resumptionToken
    132       if (this->outputDocs == this->configuration->resumeAfter()) {
    133     this->replyToken = new ResumptionToken(gsdlCollect, "", "");
    134     this->replyToken->setPosition("", i+2);
    135     break;
    136       }
    137     }
    138 
    139     cinfo.clear(); // Clear for next collection to use (if there is one).
    140   }
    141 }
    142 
    143 //--------------------------------------------------------------------------------------------
    144 // Returns true if at least one document record is found
    145 void abstractlistaction::output_content_for_all(ostream &output, recptproto *protocol, oaiargs &params)
    146 {
    147   ColInfoResponse_t cinfo;
    148   comerror_t        err;
    149   text_tarray       collections;
    150   text_t            gsdlCollect = "";
    151   ResumptionToken   *token = NULL;
    152 
    153   // get a list of the collections available
    154   collections = this->configuration->getCollectionsList();
    155   //  protocol->get_collection_list(collections, err, output);
    156  
    157   if (params["resumptionToken"] != "") {
    158     token = new ResumptionToken(params["resumptionToken"]);
    159   }
    160 
    161   for(int current_col = 0; current_col < collections.size(); ++current_col){
    162     gsdlCollect = collections[current_col];
    163 
    164     // ignore all leading collections before the one that matches the resumptiontoken
    165     if (token != NULL &&
    166     token->getCollection() != gsdlCollect)
    167     { continue;
    168     }
    169 
    170     this->output_content_for_col(output, protocol, gsdlCollect, cinfo, err, params);
    171 
    172     // once we've output at least one collection, continue
    173     // outputting all others until the resumption total hits
    174     token = NULL;
    175 
    176     if (this->outputDocs == this->configuration->resumeAfter()) {
    177       break;
    178     }
    179   }
    180 }
    181 
    182 //-------------------------------------------------------------------------------------------------
    183 // Check that the requested from/until dates don't include a time, as this would be asking for too
    184 // fine a level of granularity, one that greenstone doesn't support. An OAI error must be thrown.
    185 /*
    186 bool abstractlistaction::granularityTooFine(text_t &from, text_t &until)
    187 {
    188   if (from != "" && from.){
    189    
    190   }
    191  
    192 }
    193 */
    194 //-------------------------------------------------------------------------------------------------
    195 
    196 bool abstractlistaction::check_classifier(recptproto *protocol, const text_t &collection,
    197                       const text_t &classifier)
    198 { text_t topClass;
    199   FilterResponse_t response;
    200   text_tset        metadata;
    201   ofstream         logout("oai.log", ios::app);
    202 
    203   // exclude false children of a top-level classifier immediately...
    204   if (!get_info(classifier, collection, "", metadata, false, protocol, response, logout)) {
    205     return false;
    206   }
    207  
    208   // now check the top-level parent
    209   metadata.insert("supportsmemberof");
    210 
    211   text_t::const_iterator dot = findchar(classifier.begin(), classifier.end(), '.');
    212   if (dot != classifier.end()) {
    213     topClass = substr(classifier.begin(), dot);
    214   }
    215   else {
    216     topClass = classifier;
    217   }
    218 
    219   if (!get_info(topClass, collection, "", metadata, false, protocol, response, logout)) {
    220     return false;
    221   }
    222  
    223   if (response.docInfo[0].metadata["supportsmemberof"].values.size() == 0) {
    224     return false;
    225   }
    226 
    227   if (response.docInfo[0].metadata["supportsmemberof"].values[0] != "true") {
    228     return false;
    229   }
    230 
     200  if (!get_info(document_OID, collection, "", metadata, false, protocol, response, *this->logout))
     201  {
     202    return false;
     203  }
     204
     205  text_t last_modified_date;
     206  this->getLastModifiedDate(response.docInfo[0], last_modified_date);
     207
     208  // Check this record is not before the "from" value, if it exists
     209  if (from != "" && last_modified_date < from)
     210  {
     211    // Too early
     212    return false;
     213  }
     214
     215  // Check this record is not after the "until" value, if it exists
     216  if (until != "" && last_modified_date > until)
     217  {
     218    // Too late
     219    return false;
     220  }
     221
     222  // Just right
    231223  return true;
    232224}
    233 
    234 void abstractlistaction::recurse_set(ostream &output, recptproto *protocol, const text_t &collection,
    235                      const text_t &classifier, oaiargs &params, ResumptionToken *resumptionToken)
    236 {
    237   // metadata for this call
    238   FilterResponse_t response;
    239   text_tset        metadata;
    240   ofstream         logout("oai.log", ios::app);
    241   text_t           from = params["from"];
    242   text_t           until = params["until"];
    243   text_t           metadataPrefix = params["metadataPrefix"];
    244   //  ResumptionToken  resumptionToken(params["resumptionToken"]);
    245   int              startPos = 0;
    246 
    247   // This is a recursive function, and so just because the current set is empty doesn't mean we necessarily
    248   // want to throw a 'noRecordsMatch' error; another set (parent/sibling/child) may have had documents. It
    249   // is therefore not enough to check that the response object in the current iteration has no docs - we
    250   // must also verify that NO OTHER set has had any documents. This is done with the 'prevDocSeen' flag.
    251   // It is set to FALSE initially, but as soon as we see a set that isn't empty, it is set to TRUE. The
    252   // 'noRecordsMatch' error will only be thrown if, after all appropriate sets have been recursed into,
    253   // the 'prevDocSeen' flag is still FALSE. The function returns false if no docs were seen, allowing us to
    254   // throw the noRecordsMatch error.
    255  
    256   //  bool prevDocSeen = false;
    257 
    258   get_children(classifier, collection, "", metadata, false, protocol, response, *this->logout);
    259 
    260   if (params["resumptionToken"] != "") {
    261     // if we're at a resumptionToken
    262     if (classifier == resumptionToken->getNode()) {
    263       startPos = resumptionToken->getPosition();
    264     }
    265     else {
    266       text_t fullNode = resumptionToken->getNode();
    267       text_t::iterator leafIter = fullNode.begin() + classifier.size();
    268 
    269       // if the next character isn't a dot, blow up!
    270       if (*leafIter != '.') {
    271     // fatal error;
    272     exit(1);
    273       }
    274      
    275       // get the first '.' after the current classifier point;
    276       text_t::iterator separator = findchar(leafIter + 1, fullNode.end(), '.');
    277 
    278       // now, create a new subpath
    279       text_t nextNode = substr(fullNode.begin(), separator);
    280 
    281       // seek forward; TODO: improve performance of this
    282       for (int c = 0; c < response.numDocs; ++c) {
    283     if (response.docInfo[c].OID == nextNode) {
    284       startPos = c;
    285       break;
    286     }
    287       }
    288     }
    289 
    290     // We need to subtract one from the startPos value to turn it into an index value
    291     startPos--;
    292   }
    293  
    294   for (int c = startPos; c < response.numDocs; ++c) {
    295     text_t child = response.docInfo[c].OID;
    296    
    297     // distinguish classifiers and documents by checking whether OID
    298     // starts with CL or not
    299     text_t childHead;
    300     text_t::const_iterator start = child.begin();
    301     text_t::const_iterator here  = child.begin();
    302     here += 2;
    303     childHead = substr(start, here);
    304    
    305     // documents we output now
    306     if (childHead != "CL") {
    307       // Check that the item with the 0ID 'gsdl_id' has a lastmodified field that fits
    308       // within the required date range (if specified)
    309       if (this->inDateRange(from, until, collection, child, protocol, output)) {
    310     // TODO: check that the document can be disseminated in the required metadataPrefix
    311 
    312     if (this->output_document(output, protocol, collection, child, metadataPrefix)) {
    313       this->prevDocSeen = true;
    314       ++this->outputDocs;
    315     }
    316       }
    317     }
    318     // children which are classifiers are recursed
    319     else {
    320       if (resumptionToken != NULL) {
    321     int depth = countchar(classifier.begin(), classifier.end(), '.');
    322     resumptionToken->setOffset(depth, c+2);
    323       }
    324       this->recurse_set(output, protocol, collection, child, params, resumptionToken);
    325     }
    326 
    327     if (this->outputDocs == this->configuration->resumeAfter()) {
    328       this->replyToken = new ResumptionToken(collection, params["set"], "");
    329       this->replyToken->setPosition(classifier, c+2);     
    330       break;
    331     }
    332   }
    333 }
    334 
    335 
    336 
    337 
Note: See TracChangeset for help on using the changeset viewer.