source: gsdl/trunk/runtime-src/src/oaiservr/abstractlistaction.cpp@ 16835

Last change on this file since 16835 was 16835, checked in by mdewsnip, 16 years ago

Now gets the list of OAI nodes to output from the GDBM file ("oai" node). Previously it just used the numdocs value from the build.cfg file, and assumed that the oai nodes were "oai.0" -> "oai.X". This isn't any good if you want to include section-level nodes in the OAI output.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.2 KB
Line 
1#include "abstractlistaction.h"
2#include "recptprototools.h"
3
4#include "oaitools.h"
5
6//--------------------------------------------------------------------------------------------------
7
8bool abstractlistaction::output_content(ostream &output, recptproto *protocol, oaiargs &params)
9{
10 bool prevDocSeen;
11 ResumptionToken *token = NULL;
12
13 // start the call; clear down the total number of output documents
14 this->outputDocs = 0;
15
16 // We don't actually handle resumptionTokens yet; if we get one, ignore it
17 if (params["resumptionToken"] != "") {
18 token = new ResumptionToken(params["resumptionToken"]);
19 }
20
21 this->replyToken = NULL;
22
23 // if we've been asked for a set, then use it!
24 if (params["set"] != "") {
25 // get the children of this set
26 text_t gsdlSet = params["set"];
27 text_t gsdlCollect = "";
28
29 // given 'demo:CL2', toGSDL returns 'demo' in gsdlCollect and 'CL2' in gsdlSet. If there is no further
30 // set specified after the name of the collection however, then gsdlSet is empty.
31 oaiclassifier::toGSDL(gsdlCollect, gsdlSet);
32
33 // If gsdlSet is empty, then the user is requesting all the identifiers for the collection, so
34 // we simply output all docs via their oai_id tag. But if a specific subset IS requested, then
35 // use recurse_set() to traverse any sub classifiers to find the relevant docs.
36 if(gsdlSet == ""){
37 ColInfoResponse_t cinfo;
38 comerror_t err;
39 this->output_content_for_col(output, protocol, gsdlCollect, cinfo, err, params);
40 }
41 else {
42 if (this->check_classifier(protocol, gsdlCollect, gsdlSet)) {
43 this->recurse_set(output, protocol, gsdlCollect, gsdlSet, params, token);
44 }
45 }
46 }
47 // output all records in all hierarchies
48 else {
49 this->output_content_for_all(output, protocol, params);
50 }
51
52 // If - regardless of set required - no documents have been seen, throw an error.
53 if (this->configuration->getOAIVersion() >= 200 && this->prevDocSeen == false) {
54 errorType = "noRecordsMatch";
55 this->output_error(output, errorType);
56
57 return false;
58 }
59
60 // do a resumption token if required; errors cancel a token...
61 if (this->replyToken != NULL && this->errorType == "") {
62 // Don't add any whitespace around the resumption token as it can confuse harvesters/validators
63 output << " <resumptionToken>" << this->replyToken->getToken() << "</resumptionToken>" << endl;
64 }
65
66 return true;
67}
68
69//--------------------------------------------------------------------------------------------------
70
71void abstractlistaction::output_content_for_col(ostream &output, recptproto *protocol, text_t &gsdlCollect,
72 ColInfoResponse_t &cinfo, comerror_t &err, oaiargs &params)
73{ int startDoc = 0;
74
75 text_t metadataPrefix = params["metadataPrefix"];
76
77 // check resumption token
78 if (params["resumptionToken"] != "") {
79 ResumptionToken token(params["resumptionToken"]);
80 if (token.getCollection() == gsdlCollect) {
81 startDoc = token.getPosition() - 1; // first document is said to be 1..
82 metadataPrefix = "oai_dc"; // TO DO: This should come from the resumption token
83 }
84 }
85
86 // Get the OAI nodes from the info db file
87 text_t oai_root_node = "oai";
88 text_tset metadata; // Must be empty for efficiency
89 FilterResponse_t response;
90 get_children(oai_root_node, gsdlCollect, "", metadata, false, protocol, response, *this->logout);
91
92 // If numDocs is 0, do nothing - this->prevDocSeen will stay false if this is the only collection
93 // looked at, or will keep whatever value it had prior to this col (ensures that if the flag has
94 // been set to true by a previous collection that this won't overwrite it to be false).
95 if (response.docInfo.size() > 0) {
96 int errorCount = 0; // Count the number of errors found in the given collection
97
98 for (long i = startDoc; i < response.docInfo.size(); ++i) {
99 if (errorCount > 3) { // If num errors reaches the cut-off value, bail.
100 cerr << "Error: too many records(" << errorCount << ") in the " << gsdlCollect
101 << " collection have invalid or non-existant oai_ids - skipping remainder of collection.\n";
102 return;
103 }
104
105 text_t oai_id = "oai.";
106 oai_id += i;
107
108 text_t gsdl_id = oaiclassifier::getGSDL_OID(gsdlCollect, oai_id, protocol, *this->logout);
109
110 if (gsdl_id == "") { // If the string is empty, then the document didn't have an oai_id, so
111 ++errorCount; // increase error count
112 continue;
113 }
114
115
116 // Check that the item with the 0ID 'gsdl_id' has a lastmodified field that fits within
117 // the required date range (if specified).
118 if (this->inDateRange(params["from"], params["until"], gsdlCollect, gsdl_id, protocol, output)) {
119 if (this->output_document(output, protocol, gsdlCollect, gsdl_id, metadataPrefix)) {
120 // this should be an IF statement, where prevDocSeen is only set to true if the above
121 // function call returns true (indicating that the doc supported the metadata prefix) but
122 // for some reason this is always false. This means that if no doc in the requested set supports
123 // the metadata format, the "no records match" error that should be thrown won't be...
124 //
125 // GRB: the above comment is no longer true; proper checks are made
126 this->prevDocSeen = true;
127 ++this->outputDocs;
128 }
129 }
130
131 // if we've output the number of resumption documents; prepare a resumptionToken
132 if (this->outputDocs == this->configuration->resumeAfter()) {
133 this->replyToken = new ResumptionToken(gsdlCollect, "", "");
134 this->replyToken->setPosition("", i+2);
135 break;
136 }
137 }
138
139 cinfo.clear(); // Clear for next collection to use (if there is one).
140 }
141}
142
143//--------------------------------------------------------------------------------------------
144// Returns true if at least one document record is found
145void abstractlistaction::output_content_for_all(ostream &output, recptproto *protocol, oaiargs &params)
146{
147 ColInfoResponse_t cinfo;
148 comerror_t err;
149 text_tarray collections;
150 text_t gsdlCollect = "";
151 ResumptionToken *token = NULL;
152
153 // get a list of the collections available
154 collections = this->configuration->getCollectionsList();
155 // protocol->get_collection_list(collections, err, output);
156
157 if (params["resumptionToken"] != "") {
158 token = new ResumptionToken(params["resumptionToken"]);
159 }
160
161 for(int current_col = 0; current_col < collections.size(); ++current_col){
162 gsdlCollect = collections[current_col];
163
164 // ignore all leading collections before the one that matches the resumptiontoken
165 if (token != NULL &&
166 token->getCollection() != gsdlCollect)
167 { continue;
168 }
169
170 this->output_content_for_col(output, protocol, gsdlCollect, cinfo, err, params);
171
172 // once we've output at least one collection, continue
173 // outputting all others until the resumption total hits
174 token = NULL;
175
176 if (this->outputDocs == this->configuration->resumeAfter()) {
177 break;
178 }
179 }
180}
181
182//-------------------------------------------------------------------------------------------------
183// Check that the requested from/until dates don't include a time, as this would be asking for too
184// fine a level of granularity, one that greenstone doesn't support. An OAI error must be thrown.
185/*
186bool abstractlistaction::granularityTooFine(text_t &from, text_t &until)
187{
188 if (from != "" && from.){
189
190 }
191
192}
193*/
194//-------------------------------------------------------------------------------------------------
195
196bool abstractlistaction::check_classifier(recptproto *protocol, const text_t &collection,
197 const text_t &classifier)
198{ text_t topClass;
199 FilterResponse_t response;
200 text_tset metadata;
201 ofstream logout("oai.log", ios::app);
202
203 // exclude false children of a top-level classifier immediately...
204 if (!get_info(classifier, collection, "", metadata, false, protocol, response, logout)) {
205 return false;
206 }
207
208 // now check the top-level parent
209 metadata.insert("supportsmemberof");
210
211 text_t::const_iterator dot = findchar(classifier.begin(), classifier.end(), '.');
212 if (dot != classifier.end()) {
213 topClass = substr(classifier.begin(), dot);
214 }
215 else {
216 topClass = classifier;
217 }
218
219 if (!get_info(topClass, collection, "", metadata, false, protocol, response, logout)) {
220 return false;
221 }
222
223 if (response.docInfo[0].metadata["supportsmemberof"].values.size() == 0) {
224 return false;
225 }
226
227 if (response.docInfo[0].metadata["supportsmemberof"].values[0] != "true") {
228 return false;
229 }
230
231 return true;
232}
233
234void abstractlistaction::recurse_set(ostream &output, recptproto *protocol, const text_t &collection,
235 const text_t &classifier, oaiargs &params, ResumptionToken *resumptionToken)
236{
237 // metadata for this call
238 FilterResponse_t response;
239 text_tset metadata;
240 ofstream logout("oai.log", ios::app);
241 text_t from = params["from"];
242 text_t until = params["until"];
243 text_t metadataPrefix = params["metadataPrefix"];
244 // ResumptionToken resumptionToken(params["resumptionToken"]);
245 int startPos = 0;
246
247 // This is a recursive function, and so just because the current set is empty doesn't mean we necessarily
248 // want to throw a 'noRecordsMatch' error; another set (parent/sibling/child) may have had documents. It
249 // is therefore not enough to check that the response object in the current iteration has no docs - we
250 // must also verify that NO OTHER set has had any documents. This is done with the 'prevDocSeen' flag.
251 // It is set to FALSE initially, but as soon as we see a set that isn't empty, it is set to TRUE. The
252 // 'noRecordsMatch' error will only be thrown if, after all appropriate sets have been recursed into,
253 // the 'prevDocSeen' flag is still FALSE. The function returns false if no docs were seen, allowing us to
254 // throw the noRecordsMatch error.
255
256 // bool prevDocSeen = false;
257
258 get_children(classifier, collection, "", metadata, false, protocol, response, *this->logout);
259
260 if (params["resumptionToken"] != "") {
261 // if we're at a resumptionToken
262 if (classifier == resumptionToken->getNode()) {
263 startPos = resumptionToken->getPosition();
264 }
265 else {
266 text_t fullNode = resumptionToken->getNode();
267 text_t::iterator leafIter = fullNode.begin() + classifier.size();
268
269 // if the next character isn't a dot, blow up!
270 if (*leafIter != '.') {
271 // fatal error;
272 exit(1);
273 }
274
275 // get the first '.' after the current classifier point;
276 text_t::iterator separator = findchar(leafIter + 1, fullNode.end(), '.');
277
278 // now, create a new subpath
279 text_t nextNode = substr(fullNode.begin(), separator);
280
281 // seek forward; TODO: improve performance of this
282 for (int c = 0; c < response.numDocs; ++c) {
283 if (response.docInfo[c].OID == nextNode) {
284 startPos = c;
285 break;
286 }
287 }
288 }
289
290 // We need to subtract one from the startPos value to turn it into an index value
291 startPos--;
292 }
293
294 for (int c = startPos; c < response.numDocs; ++c) {
295 text_t child = response.docInfo[c].OID;
296
297 // distinguish classifiers and documents by checking whether OID
298 // starts with CL or not
299 text_t childHead;
300 text_t::const_iterator start = child.begin();
301 text_t::const_iterator here = child.begin();
302 here += 2;
303 childHead = substr(start, here);
304
305 // documents we output now
306 if (childHead != "CL") {
307 // Check that the item with the 0ID 'gsdl_id' has a lastmodified field that fits
308 // within the required date range (if specified)
309 if (this->inDateRange(from, until, collection, child, protocol, output)) {
310 // TODO: check that the document can be disseminated in the required metadataPrefix
311
312 if (this->output_document(output, protocol, collection, child, metadataPrefix)) {
313 this->prevDocSeen = true;
314 ++this->outputDocs;
315 }
316 }
317 }
318 // children which are classifiers are recursed
319 else {
320 if (resumptionToken != NULL) {
321 int depth = countchar(classifier.begin(), classifier.end(), '.');
322 resumptionToken->setOffset(depth, c+2);
323 }
324 this->recurse_set(output, protocol, collection, child, params, resumptionToken);
325 }
326
327 if (this->outputDocs == this->configuration->resumeAfter()) {
328 this->replyToken = new ResumptionToken(collection, params["set"], "");
329 this->replyToken->setPosition(classifier, c+2);
330 break;
331 }
332 }
333}
334
335
336
337
Note: See TracBrowser for help on using the repository browser.