source: trunk/gsdl/src/colservr/gdbmclass.cpp@ 308

Last change on this file since 308 was 125, checked in by sjboddie, 25 years ago

fixed a bug in sorting code

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.2 KB
Line 
1/**********************************************************************
2 *
3 * gdbmclass.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * PUT COPYRIGHT NOTICE HERE
7 *
8 * $Id: gdbmclass.cpp 125 1999-01-25 03:59:40Z sjboddie $
9 *
10 *********************************************************************/
11
12/*
13 $Log$
14 Revision 1.5 1999/01/25 03:59:40 sjboddie
15 fixed a bug in sorting code
16
17 Revision 1.4 1999/01/21 21:20:08 sjboddie
18 removed unused collection parameter from several functions
19
20 Revision 1.3 1999/01/19 01:38:15 rjmcnab
21
22 Made the source more portable.
23
24 Revision 1.2 1999/01/12 01:51:01 rjmcnab
25
26 Standard header.
27
28 Revision 1.1 1999/01/08 09:02:15 rjmcnab
29
30 Moved from src/library.
31
32 */
33
34
35#include "text_t.h"
36#include "gdbmclass.h"
37#include "unitool.h"
38#include "gsdlunicode.h"
39#include "fileutil.h"
40#include <ctype.h>
41#include <string.h>
42
43#if defined(GSDL_NEED_STRINGS_H)
44#include <strings.h>
45#endif
46
47#if defined(GSDL_USE_OBJECTSPACE)
48# include <ospace\std\algorithm>
49#elif defined(GSDL_USE_STL_H)
50# if defined(GSDL_USE_ALGO_H)
51# include <algo.h>
52# else
53# include <algorithm.h>
54# endif
55#else
56# include <algorithm>
57#endif
58
59
// Case-insensitive comparison of two NUL-terminated C strings.
// Returns a value less than, equal to or greater than zero, following the
// convention of the platform routine it forwards to.
static int compare_str (const char *e1, const char *e2) {
#ifndef __WIN32__
  return strcasecmp(e1, e2);
#else
  return _stricmp(e1, e2);
#endif
}
67
// qsort-style comparator: e1 and e2 each point at a char* element of the
// array being sorted. The two strings are compared case-insensitively.
static int compare_str (const void *e1, const void *e2) {
  const char *lhs = *((char**)e1);
  const char *rhs = *((char**)e2);
#ifdef __WIN32__
  return _stricmp(lhs, rhs);
#else
  return strcasecmp(lhs, rhs);
#endif
}
75
76
77
78void gdbm_info::clear () {
79 docnum = 0; // 'd'
80 title.clear(); // 't'
81 parent.clear(); // 'p'
82 classification.clear(); // 'x'
83 contents.clear(); // 'c'
84 jobnum.clear(); // 'j'
85 OID.clear(); // 'o'
86 author.clear(); // 'a'
87 source.clear(); // 's'
88 date.clear(); // 'i'
89}
90
91
92
93
94
95// returns 0 if failed, 1 if opened
96int gdbmclass::opendatabase (const text_t &filename) {
97 text_t data_location;
98 int block_size = 0;
99
100 if (gdbmfile != NULL) {
101 if (openfile == filename) return 1;
102 else closedatabase ();
103 }
104
105 openfile = filename;
106
107 char *namebuffer = filename.getcstr();
108 gdbmfile = gdbm_open (namebuffer, block_size, GDBM_READER, 00664, NULL);
109 delete namebuffer;
110
111 if (gdbmfile == NULL && logout != NULL) {
112 outconvertclass text_t2ascii;
113 (*logout) << text_t2ascii << "database open failed on: " << filename << "\n";
114 }
115
116 return (gdbmfile != NULL);
117}
118
119
120void gdbmclass::closedatabase () {
121 if (gdbmfile == NULL) return;
122
123 gdbm_close (gdbmfile);
124 gdbmfile = NULL;
125 openfile.clear();
126}
127
128
129// returns 0 on success, -1 on failure
130// key and collection aren't references as they might be aliases to
131// something in info
132int gdbmclass::getinfo (text_t key, gdbm_info &info) {
133 text_t data;
134
135 if (!getkeydata (key, data)) return -1;
136 text_t::iterator here = data.begin ();
137 text_t::iterator end = data.end ();
138
139 text_t ikey, ivalue;
140 info.clear (); // reset info
141
142 while (getinfoline(here, end, ikey, ivalue)) {
143 if (ikey == "d") { info.docnum = ivalue.getint(); }
144 else if (ikey == "t") { info.title = ivalue; }
145 else if (ikey == "p") { info.parent = ivalue; }
146 else if (ikey == "x") { info.classification = ivalue; }
147 else if (ikey == "c") { info.contents = ivalue; }
148 else if (ikey == "j") { info.jobnum = ivalue; }
149 else if (ikey == "o") { info.OID = ivalue; }
150 else if (ikey == "a") { info.author = ivalue; }
151 else if (ikey == "s") { info.source = ivalue; }
152 else if (ikey == "i") { info.date = ivalue; }
153 }
154
155 return 0;
156}
157
158
159// returns 1 if the key exists
160int gdbmclass::exists (text_t key) {
161 text_t data;
162 return getkeydata (key, data);
163}
164
165
166// returns 1 if successful
167int gdbmclass::getkeydata (text_t key, text_t &data) {
168 datum key_data;
169 datum return_data;
170
171 if (gdbmfile == NULL) return 0;
172
173 // get a utf-8 encoded c string of the unicode key
174 key_data.dptr = (to_utf8(key)).getcstr();
175 if (key_data.dptr == NULL) {
176 if (logout != NULL) (*logout) << "gdbmclass: out of memory\n";
177 return 0;
178 }
179 key_data.dsize = strlen (key_data.dptr);
180
181 // fetch the result
182 return_data = gdbm_fetch (gdbmfile, key_data);
183 delete key_data.dptr;
184
185 if (return_data.dptr == NULL) return 0;
186
187 data.setcarr (return_data.dptr, return_data.dsize);
188 free (return_data.dptr);
189 data = to_uni(data); // convert to unicode
190
191 return 1;
192}
193
194
195// parses a line of the form <key>value\n
196// returns 1 if successful
197int gdbmclass::getinfoline (text_t::iterator &here, text_t::iterator end,
198 text_t &key, text_t &value) {
199 key.clear();
200 value.clear();
201
202 // ignore white space
203 while (here != end && is_unicode_space (*here)) here++;
204
205 // get the '<'
206 if (here == end || *here != '<') return 0;
207 here++;
208
209 // get the key
210 while (here != end && *here != '>') {
211 key.push_back(*here);
212 here++;
213 }
214
215 // get the '>'
216 if (here == end || *here != '>') return 0;
217 here++;
218
219 // get the value
220 while (here != end && *here != '\n') {
221 value.push_back(*here);
222 here++;
223 }
224
225 return 1;
226}
227
228
229
230
231
232// a few useful functions
233
234//////////////////////////////////////////////////////////////////////////////////////////
235// functions for testing classification strings
236
237
238// returns 1 if targetdoc is top level of a book (i.e. =~ /B\.\d+$/) - otherwise 0;
239int is_top_level (const text_t &targetdoc) {
240
241 text_t::const_iterator here = targetdoc.begin();
242 text_t::const_iterator end = targetdoc.end();
243
244 // look for the 'B'
245 here = findchar (here, end, 'B');
246
247 // there must be exactly one dot after the 'B'
248 if ((here != end) && (countchar (here, end, '.') == 1))
249 return 1;
250
251 return 0;
252}
253
254// returns 1 if targetdoc is any level of a book (i.e. contains 'B') - otherwise 0
255int is_book (const text_t &targetdoc) {
256
257 text_t::const_iterator here = targetdoc.begin();
258 text_t::const_iterator end = targetdoc.end();
259
260 here = findchar (here, end, 'B');
261 if (here != end) return 1;
262 return 0;
263}
264
265// returns (in book_top) the top level of the book in targetdoc
266void get_book_top (const text_t &targetdoc, text_t &book_top) {
267
268 text_t::const_iterator here = targetdoc.begin();
269 text_t::const_iterator end = targetdoc.end();
270
271 book_top.clear();
272
273 // look for the 'B'
274 here = findchar (here, end, 'B');
275
276 // copy up to the second '.'
277 int founddot = 0;
278 while (here != end) {
279 if (*here == '.') {
280 if (founddot) return;
281 founddot = 1;
282 }
283 book_top.push_back(*here);
284 here++;
285 }
286}
287
288// returns (in book) the book section part of the classification
289// contained in targetdoc
290void get_book (const text_t &targetdoc, text_t &book) {
291
292 text_t::const_iterator here = targetdoc.begin();
293 text_t::const_iterator end = targetdoc.end();
294
295 book.clear ();
296
297 // look for the 'B'
298 here = findchar (here, end, 'B');
299
300 // copy the rest of the string
301 while (here != end) {
302 book.push_back(*here);
303 here ++;
304 }
305}
306
307// get_parent_section removes the last part from section (i.e.=~ s/\.\d+$//)
308void get_parent_section (text_t &section) {
309 int founddot = 0;
310 text_t::iterator end;
311 while (!founddot && !section.empty()) {
312 end = section.end();
313 end --;
314 if (*end == '.') founddot = 1;
315 section.pop_back();
316 }
317}
318
319// same as above but also returns ths child section that's removed
320void get_parent_section (text_t &parentsection, text_t &childsection) {
321 int founddot = 0;
322 text_t tmp;
323 childsection.clear();
324 text_t::iterator end;
325 while (!founddot && !parentsection.empty()) {
326 end = parentsection.end();
327 end --;
328 if (*end == '.') founddot = 1;
329 else tmp.push_back(*end); childsection = tmp + childsection; tmp.clear();
330 parentsection.pop_back();
331 }
332}
333
334// count_dots returns the number of dots ('.') there are
335// in a range of a targetdoc string
336int count_dots(text_t::const_iterator first, text_t::const_iterator last) {
337 return countchar (first, last, '.');
338}
339
340int count_dots (const text_t &targetdoc) {
341 return count_dots(targetdoc.begin(), targetdoc.end());
342}
343
344// returns 1 if targetdoc is a first level descendant
345// (i.e. B.n.1, B.n.1.1, B.n.1.1.1 etc.) - otherwise 0
346int is_section_top(const text_t &targetdoc) {
347 text_t::const_iterator here = targetdoc.begin();
348 text_t::const_iterator end = targetdoc.end();
349
350 // look for the 'B'
351 here = findchar (here, end, 'B');
352 here = findchar (here, end, '.');
353 if (here != end) here++; // skip over the '.'
354 here = findchar (here, end, '.');
355
356 // make sure that all '.' are followed by a '1'
357 while (here != end) {
358 if (*here != '.') return 0;
359 here ++;
360
361 if (here != end) {
362 if (*here != '1') return 0;
363 here ++;
364 }
365 }
366 return 1;
367}
368
369// seperate_parts seperates targetdoc into its classification and booksection
370// if classification isn't supplied it gets the first classification for the
371// book from the gdbm
372// if booksection doesn't exist it remains blank
373void separate_parts(const text_t &targetdoc, gdbmclass &gdbm,
374 text_t &classification, text_t &booksection) {
375
376 split_targetdoc (targetdoc, classification, booksection);
377
378 if (classification.empty()) {
379 // no classification included so get first one for this book
380 gdbm_info info;
381 text_t book_top;
382 vector<text_t> classarray;
383 get_book_top (targetdoc, book_top);
384 gdbm.getinfo(book_top, info);
385 splitstring (info.classification, classarray);
386 if (!classarray.empty()) classification = classarray[0];
387 else classification = "C.1";
388 }
389}
390
391// split_targetdoc splits up a string containing a classification
392// and book (or one or the other)
393void split_targetdoc(const text_t &targetdoc, text_t &classification,
394 text_t &booksection) {
395
396 classification.clear ();
397 booksection.clear();
398
399 text_t::const_iterator here = targetdoc.begin();
400 text_t::const_iterator end = targetdoc.end();
401
402 // copy everything up to the first 'B'
403 while (here != end) {
404 if (*here == 'B') break;
405 classification.push_back(*here);
406 here++;
407 }
408
409 // remove middle '.'
410 if (!classification.empty() &&
411 classification[classification.size()-1] == '.')
412 classification.pop_back();
413
414 // copy the rest of the string
415 while (here != end) {
416 booksection.push_back(*here);
417 here++;
418 }
419}
420
421// splitstring splits a colon seperated string into an array
422void splitstring (const text_t &string, vector<text_t> &array) {
423 splitchar (string.begin(), string.end(), ':', array);
424}
425
426// get_parents returns the parents array containing all the parents of the
427// document specified by classification and booksection
428void get_parents (const text_t &targetdoc, vector<text_t> &parents)
429{
430 text_t::const_iterator here = targetdoc.begin ();
431 text_t::const_iterator end = targetdoc.end ();
432
433 text_t currentparent;
434 text_t newsuffixpart;
435 text_t newsuffix;
436 bool first = true;
437 while (here != end)
438 {
439 // if there is a newsuffix add it to the current parent
440 // and add that parent to the parents vector
441 if (!newsuffix.empty())
442 {
443 currentparent += newsuffix;
444 parents.push_back (currentparent);
445 }
446
447 // keep getting suffixes until one is found which starts with
448 // a number
449 newsuffix.clear();
450 do
451 {
452 here = getdelimitstr (here, end, '.', newsuffixpart);
453 if (!first) newsuffix.push_back ('.');
454 first = false;
455 newsuffix += newsuffixpart;
456 }
457 while ((here != end) && !newsuffixpart.empty() &&
458 (newsuffixpart[0] < '0' || newsuffixpart[0] > '9'));
459 }
460}
461
462
463// get_siblings returns the siblings array containing all the siblings of the current
464// classification or booksection
465void get_siblings (const text_t &classification, const text_t &booksection,
466 gdbmclass &gdbm, vector<text_t> &siblings) {
467
468 gdbm_info info;
469
470 if (booksection.empty() && classification.size() == 1) {
471 // top level classification has no siblings
472 return;
473
474 } else if (booksection.empty()) {
475 // get classification siblings
476 gdbm.getinfo(classification, info);
477 gdbm.getinfo(info.parent, info); // info is now parent info
478 splitstring(info.contents, siblings);
479 return;
480
481 } else {
482 // get book section siblings
483 if (is_top_level(booksection)) {
484 // top level of book so siblings are children of classification
485 gdbm.getinfo(classification, info);
486 splitstring(info.contents, siblings);
487
488 // add classifications to book sections
489 for (unsigned int i = 0; i < siblings.size(); i++) {
490 if (is_book(siblings[i])) siblings[i] = classification + "." + siblings[i];
491 }
492
493 } else {
494 // siblings come from immediate parent
495 gdbm.getinfo(booksection, info);
496 gdbm.getinfo(info.parent, info); // info is now parent info
497 splitstring(info.contents, siblings);
498
499 // add classifications to book sections
500 for (unsigned int i = 0; i < siblings.size(); i++) {
501 if (is_book(siblings[i])) siblings[i] = classification + "." + siblings[i];
502 }
503 }
504 }
505}
506
507// compares section 1 and section 2 and returns 1 if section2 belongs to
508// the same chapter as section1 (i.e. is sibling of or child of or child of sibling)
509int are_same_chapter(text_t section1, text_t section2)
510{
511 get_parent_section(section1);
512
513 while (!section2.empty()) {
514 get_parent_section(section2);
515 if (section2 == section1) return 1;
516 }
517 return 0;
518}
519
520// get_first_section gets the first section from a colon separated
521// list (instring)
522void get_first_section(const text_t &instring, text_t &returnstring) {
523
524 returnstring.clear();
525
526 text_t::const_iterator here = instring.begin();
527 text_t::const_iterator end = instring.end();
528
529 while (here != end) {
530 if (*here == ':') return;
531 returnstring.push_back(*here);
532 here ++;
533 }
534}
535
536
537// removes html tags from string - everything after < will be removed
538// if < occurs without >
539void remove_tags (text_t &text)
540{
541 text_t::const_iterator here = text.begin ();
542 text_t::const_iterator end = text.end ();
543 int found = 0;
544 text_t tmp;
545
546 while (here != end) {
547 if (*here == '<') {found = 1; here ++; continue;}
548 if (*here == '>') {found = 0; here ++; continue;}
549
550 if (!found) tmp.push_back(*here);
551 here ++;
552 }
553 text = tmp;
554}
555
556// checks text to see if it is a number (i.e. contains only 0-9)
557// returns 1 if true, 0 if false
558int is_number (text_t &text) {
559
560 text_t::const_iterator here = text.begin();
561 text_t::const_iterator end = text.end();
562
563 while (here != end) {
564 if ((*here!='0') && (*here!='1') && (*here!='2') &&
565 (*here!='3') && (*here!='4') && (*here!='5') &&
566 (*here!='6') && (*here!='7') && (*here!='8') &&
567 (*here!='9')) return 0;
568 here ++;
569 }
570 return 1;
571}
572
573// functions related to sorting
574
575// returns whatever comes after ':#:' in str
576// -- this is a nasty hack that I'm sure Rodger will want to change ;-)
577text_t get_section_str(const text_t &str) {
578
579 text_t ret;
580 int found = 0;
581
582 text_t::const_iterator here = str.begin();
583 text_t::const_iterator end = str.end();
584
585 while (here != end) {
586 if (found) {
587 ret.push_back(*here);
588 } else {
589 here = findchar (here, end, ':');
590 if ((*(here+1) == '#') && (*(here+2) == ':')) {
591 found = 1;
592 here = here+2;
593 }
594 }
595 here ++;
596 }
597 return ret;
598}
599
600// removes leading spaces and leading 'the' 'a' and 'an'
601// from string
602void alphabetize_string_english (text_t &text) {
603
604 if (text.empty()) return;
605
606 text_t firstword;
607 char *word;
608
609 text_t::iterator here = text.begin();
610 text_t::const_iterator end = text.end();
611
612 if ((*here != ' ') && (*here != 'a') && (*here != 'A') &&
613 (*here != 't') && (*here != 'T')) return;
614
615 int foundchar = 0;
616 while (here != end) {
617 if (*here == ' ' && !foundchar) {here ++; continue;}
618 if (*here == ' ' && foundchar) {
619 text.erase(text.begin(), (here+1));
620 break;
621 }
622 foundchar ++;
623 if (foundchar == 1) {
624 getdelimitstr (here, end, ' ', firstword);
625 word = firstword.getcstr();
626 if ((compare_str(word, "the") != 0) && (compare_str(word, "a") != 0) &&
627 (compare_str(word, "an") != 0)) break;
628 }
629 here ++;
630 }
631 delete word;
632}
633
634// removes leading space, puts last name before
635// any preceeding names
636void alphabetize_string_name (text_t &text) {
637
638 if (text.empty()) return;
639
640 text_t lastname;
641 char *lname;
642 vector<text_t> words;
643 splitchar (text.begin(), text.end(), ' ', words);
644 lastname = words.back();
645 words.pop_back();
646 lname = lastname.getcstr();
647
648 while ((compare_str(lname, "jnr") == 0) || (compare_str(lname, "snr") == 0) ||
649 (compare_str(lname, "esq") == 0)) {
650 lastname = words.back();
651 words.pop_back();
652 lname = lastname.getcstr();
653 }
654
655 text.clear();
656 joinchar (words, ' ', text);
657 text = lastname + text;
658}
659
// Appends a malloc'd copy of str to the array of *len strings.
//   array - the current array (may be NULL when *len is 0)
//   len   - in/out: number of strings in the array, incremented on success
//   str   - the string to copy
// returns the (possibly moved) array, which replaces the one passed in.
// If str or len is NULL, *len is negative, or memory runs out, the
// original array is returned untouched and *len is left as it was.
char ** string_add (char **array, int *len, char *str) {
  if (len == NULL || str == NULL || *len < 0) return array;

  // copy the string first so a failure leaves the array alone
  char *copy = (char*)strdup(str);
  if (copy == NULL) return array;

  char **grown = (char**)realloc(array, (*len+1)*sizeof(char*));
  if (grown == NULL) {
    // realloc leaves the old array valid when it fails
    free (copy);
    return array;
  }

  grown[*len] = copy;
  (*len) ++;

  return grown;
}
669
670void string_sort (char **array, int len) {
671 qsort((void*)array, (unsigned int)(len), sizeof(char*), compare_str);
672}
673
// Frees the first len strings held in array and then the array itself.
void string_free (char **array, int len) {
  int idx = 0;
  while (idx < len) {
    free (array[idx]);
    ++idx;
  }
  free (array);
}
679
680// returns a date of form _dec_ 31, 1999
681// input is date of type 19991231
682// at least the year must be present in date
683text_t format_date (const text_t &date) {
684 text_t::const_iterator here = date.begin();
685 text_t::const_iterator end = date.end();
686
687 text_t year, month, day, dreturn;
688 int i;
689
690 for (i = 0; i < 4 && here != end; i++) {
691 year.push_back(*here);
692 here ++;
693 }
694 if (year.empty()) return "";
695
696 for (i = 0; i < 2 && here != end; i++) {
697 month.push_back(*here);
698 here ++;
699 }
700 for (i = 0; i < 2 && here != end; i++) {
701 day.push_back(*here);
702 here ++;
703 }
704
705 if (!month.empty()) format_month(month);
706
707 if (!day.empty()) format_day(day);
708
709 if (!month.empty()) {
710 dreturn += month + " ";
711 if (!day.empty()) {
712 dreturn += day + ", ";
713 }
714 }
715 dreturn += year;
716 return dreturn;
717}
718
719void format_month (text_t &month) {
720 if (month == "01") month = "_jan_";
721 else if (month == "02") month = "_feb_";
722 else if (month == "03") month = "_mar_";
723 else if (month == "04") month = "_apr_";
724 else if (month == "05") month = "_may_";
725 else if (month == "06") month = "_jun_";
726 else if (month == "07") month = "_jul_";
727 else if (month == "08") month = "_aug_";
728 else if (month == "09") month = "_sep_";
729 else if (month == "10") month = "_oct_";
730 else if (month == "11") month = "_nov_";
731 else if (month == "12") month = "_dec_";
732 else month.clear();
733}
734
735void format_day(text_t &day) {
736 if (day[0] == '0') {
737 char tmp = day[1];
738 day.clear();
739 day.push_back(tmp);
740 }
741}
742
Note: See TracBrowser for help on using the repository browser.