source: trunk/gsdl/src/colservr/gdbmclass.cpp@ 110

Last change on this file since 110 was 110, checked in by rjmcnab, 25 years ago

Moved from src/library.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.1 KB
Line 
1/**********************************************************************
2 *
3 * gdbmclass.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * PUT COPYRIGHT NOTICE HERE
7 *
8 * $Id: gdbmclass.cpp 110 1999-01-08 09:02:22Z rjmcnab $
9 *
10 *********************************************************************/
11
12/*
13 $Log$
14 Revision 1.1 1999/01/08 09:02:15 rjmcnab
15
16 Moved from src/library.
17
18 */
19
// revision identification string for this file
// declared as a pointer to const because a string literal must not be
// modified (and cannot be bound to a plain char * in current C++)
static const char *RCSID = "$Id: gdbmclass.cpp 110 1999-01-08 09:02:22Z rjmcnab $";
21
22
23#include "text_t.h"
24#include "gdbmclass.h"
25#include "unitool.h"
26#include "gsdlunicode.h"
27#include "fileutil.h"
28#include <ctype.h>
29#include <string.h>
30
31#ifndef USE_OBJECTSPACE
32# include <algorithm>
33#else
34# include <ospace\std\algorithm>
35#endif
36
37
// case-insensitive comparison of two C strings
// returns -1 if str1 sorts before str2, 1 if it sorts after and 0 if the
// two are equal when case is ignored
// a NULL pointer sorts before any non-NULL string; two NULLs are equal
static int my_stricmp (const char *str1, const char *str2) {
  if ((str1 == NULL) || (str2 == NULL)) {
    if (str1 == str2) return 0;
    return (str1 == NULL) ? -1 : 1;
  }

  for (;; str1++, str2++) {
    // both characters are fetched on every pass; the cast to unsigned char
    // keeps the argument of tolower within its valid range
    const int c1 = tolower((unsigned char)*str1);
    const int c2 = tolower((unsigned char)*str2);

    if (c1 < c2) return -1;
    if (c1 > c2) return 1;

    // the characters are equal, so both strings end here or neither does
    if (c1 == '\0') return 0;
  }
}
60
61
62void gdbm_info::clear () {
63 docnum = 0; // 'd'
64 title.clear(); // 't'
65 parent.clear(); // 'p'
66 classification.clear(); // 'x'
67 contents.clear(); // 'c'
68 jobnum.clear(); // 'j'
69 OID.clear(); // 'o'
70 author.clear(); // 'a'
71 source.clear(); // 's'
72 date.clear(); // 'i'
73}
74
75
76
77
78
79// returns 0 if failed, 1 if opened
80int gdbmclass::opendatabase (const text_t &filename) {
81 text_t data_location;
82 int block_size = 0;
83
84 if (gdbmfile != NULL) {
85 if (openfile == filename) return 1;
86 else closedatabase ();
87 }
88
89 openfile = filename;
90
91 char *namebuffer = filename.getcstr();
92 gdbmfile = gdbm_open (namebuffer, block_size, GDBM_READER, 00664, NULL);
93 delete namebuffer;
94
95 if (gdbmfile == NULL && logout != NULL) {
96 outconvertclass text_t2ascii;
97 (*logout) << text_t2ascii << "database open failed on: " << filename << "\n";
98 }
99
100 return (gdbmfile != NULL);
101}
102
103
104void gdbmclass::closedatabase () {
105 if (gdbmfile == NULL) return;
106
107 gdbm_close (gdbmfile);
108 gdbmfile = NULL;
109 openfile.clear();
110}
111
112
113// returns 0 on success, -1 on failure
114// key and collection aren't references as they might be aliases to
115// something in info
116int gdbmclass::getinfo (text_t key, gdbm_info &info) {
117 text_t data;
118
119 if (!getkeydata (key, data)) return -1;
120 text_t::iterator here = data.begin ();
121 text_t::iterator end = data.end ();
122
123 text_t ikey, ivalue;
124 info.clear (); // reset info
125
126 while (getinfoline(here, end, ikey, ivalue)) {
127 if (ikey == "d") { info.docnum = ivalue.getint(); }
128 else if (ikey == "t") { info.title = ivalue; }
129 else if (ikey == "p") { info.parent = ivalue; }
130 else if (ikey == "x") { info.classification = ivalue; }
131 else if (ikey == "c") { info.contents = ivalue; }
132 else if (ikey == "j") { info.jobnum = ivalue; }
133 else if (ikey == "o") { info.OID = ivalue; }
134 else if (ikey == "a") { info.author = ivalue; }
135 else if (ikey == "s") { info.source = ivalue; }
136 else if (ikey == "i") { info.date = ivalue; }
137 }
138
139 return 0;
140}
141
142
143// returns 1 if the key exists
144int gdbmclass::exists (text_t key) {
145 text_t data;
146 return getkeydata (key, data);
147}
148
149
150// returns 1 if successful
151int gdbmclass::getkeydata (text_t key, text_t &data) {
152 datum key_data;
153 datum return_data;
154
155 if (gdbmfile == NULL) return 0;
156
157 // get a utf-8 encoded c string of the unicode key
158 key_data.dptr = (to_utf8(key)).getcstr();
159 if (key_data.dptr == NULL) {
160 if (logout != NULL) (*logout) << "gdbmclass: out of memory\n";
161 return 0;
162 }
163 key_data.dsize = strlen (key_data.dptr);
164
165 // fetch the result
166 return_data = gdbm_fetch (gdbmfile, key_data);
167 delete key_data.dptr;
168
169 if (return_data.dptr == NULL) return 0;
170
171 data.setcarr (return_data.dptr, return_data.dsize);
172 free (return_data.dptr);
173 data = to_uni(data); // convert to unicode
174
175 return 1;
176}
177
178
179// parses a line of the form <key>value\n
180// returns 1 if successful
181int gdbmclass::getinfoline (text_t::iterator &here, text_t::iterator end,
182 text_t &key, text_t &value) {
183 key.clear();
184 value.clear();
185
186 // ignore white space
187 while (here != end && is_unicode_space (*here)) here++;
188
189 // get the '<'
190 if (here == end || *here != '<') return 0;
191 here++;
192
193 // get the key
194 while (here != end && *here != '>') {
195 key.push_back(*here);
196 here++;
197 }
198
199 // get the '>'
200 if (here == end || *here != '>') return 0;
201 here++;
202
203 // get the value
204 while (here != end && *here != '\n') {
205 value.push_back(*here);
206 here++;
207 }
208
209 return 1;
210}
211
212
213
214
215
216// a few useful functions
217
218//////////////////////////////////////////////////////////////////////////////////////////
219// functions for testing classification strings
220
221
222// returns 1 if targetdoc is top level of a book (i.e. =~ /B\.\d+$/) - otherwise 0;
223int is_top_level (const text_t &targetdoc) {
224
225 text_t::const_iterator here = targetdoc.begin();
226 text_t::const_iterator end = targetdoc.end();
227
228 // look for the 'B'
229 here = findchar (here, end, 'B');
230
231 // there must be exactly one dot after the 'B'
232 if ((here != end) && (countchar (here, end, '.') == 1))
233 return 1;
234
235 return 0;
236}
237
238// returns 1 if targetdoc is any level of a book (i.e. contains 'B') - otherwise 0
239int is_book (const text_t &targetdoc) {
240
241 text_t::const_iterator here = targetdoc.begin();
242 text_t::const_iterator end = targetdoc.end();
243
244 here = findchar (here, end, 'B');
245 if (here != end) return 1;
246 return 0;
247}
248
249// returns (in book_top) the top level of the book in targetdoc
250void get_book_top (const text_t &targetdoc, text_t &book_top) {
251
252 text_t::const_iterator here = targetdoc.begin();
253 text_t::const_iterator end = targetdoc.end();
254
255 book_top.clear();
256
257 // look for the 'B'
258 here = findchar (here, end, 'B');
259
260 // copy up to the second '.'
261 int founddot = 0;
262 while (here != end) {
263 if (*here == '.') {
264 if (founddot) return;
265 founddot = 1;
266 }
267 book_top.push_back(*here);
268 here++;
269 }
270}
271
272// returns (in book) the book section part of the classification
273// contained in targetdoc
274void get_book (const text_t &targetdoc, text_t &book) {
275
276 text_t::const_iterator here = targetdoc.begin();
277 text_t::const_iterator end = targetdoc.end();
278
279 book.clear ();
280
281 // look for the 'B'
282 here = findchar (here, end, 'B');
283
284 // copy the rest of the string
285 while (here != end) {
286 book.push_back(*here);
287 here ++;
288 }
289}
290
291// get_parent_section removes the last part from section (i.e.=~ s/\.\d+$//)
292void get_parent_section (text_t &section) {
293 int founddot = 0;
294 text_t::iterator end;
295 while (!founddot && !section.empty()) {
296 end = section.end();
297 end --;
298 if (*end == '.') founddot = 1;
299 section.pop_back();
300 }
301}
302
303// same as above but also returns ths child section that's removed
304void get_parent_section (text_t &parentsection, text_t &childsection) {
305 int founddot = 0;
306 text_t tmp;
307 childsection.clear();
308 text_t::iterator end;
309 while (!founddot && !parentsection.empty()) {
310 end = parentsection.end();
311 end --;
312 if (*end == '.') founddot = 1;
313 else tmp.push_back(*end); childsection = tmp + childsection; tmp.clear();
314 parentsection.pop_back();
315 }
316}
317
318// count_dots returns the number of dots ('.') there are
319// in a range of a targetdoc string
320int count_dots(text_t::const_iterator first, text_t::const_iterator last) {
321 return countchar (first, last, '.');
322}
323
324int count_dots (const text_t &targetdoc) {
325 return count_dots(targetdoc.begin(), targetdoc.end());
326}
327
328// returns 1 if targetdoc is a first level descendant
329// (i.e. B.n.1, B.n.1.1, B.n.1.1.1 etc.) - otherwise 0
330int is_section_top(const text_t &targetdoc) {
331 text_t::const_iterator here = targetdoc.begin();
332 text_t::const_iterator end = targetdoc.end();
333
334 // look for the 'B'
335 here = findchar (here, end, 'B');
336 here = findchar (here, end, '.');
337 if (here != end) here++; // skip over the '.'
338 here = findchar (here, end, '.');
339
340 // make sure that all '.' are followed by a '1'
341 while (here != end) {
342 if (*here != '.') return 0;
343 here ++;
344
345 if (here != end) {
346 if (*here != '1') return 0;
347 here ++;
348 }
349 }
350 return 1;
351}
352
353// seperate_parts seperates targetdoc into its classification and booksection
354// if classification isn't supplied it gets the first classification for the
355// book from the gdbm
356// if booksection doesn't exist it remains blank
357void seperate_parts(const text_t &targetdoc, gdbmclass &gdbm, const text_t &collection,
358 text_t &classification, text_t &booksection) {
359
360 split_targetdoc (targetdoc, classification, booksection);
361
362 if (classification.empty()) {
363 // no classification included so get first one for this book
364 gdbm_info info;
365 text_t book_top;
366 vector<text_t> classarray;
367 get_book_top (targetdoc, book_top);
368 gdbm.getinfo(book_top, info);
369 splitstring (info.classification, classarray);
370 if (!classarray.empty()) classification = classarray[0];
371 else classification = "C.1";
372 }
373}
374
375// split_targetdoc splits up a string containing a classification
376// and book (or one or the other)
377void split_targetdoc(const text_t &targetdoc, text_t &classification,
378 text_t &booksection) {
379
380 classification.clear ();
381 booksection.clear();
382
383 text_t::const_iterator here = targetdoc.begin();
384 text_t::const_iterator end = targetdoc.end();
385
386 // copy everything up to the first 'B'
387 while (here != end) {
388 if (*here == 'B') break;
389 classification.push_back(*here);
390 here++;
391 }
392
393 // remove middle '.'
394 if (!classification.empty() &&
395 classification[classification.size()-1] == '.')
396 classification.pop_back();
397
398 // copy the rest of the string
399 while (here != end) {
400 booksection.push_back(*here);
401 here++;
402 }
403}
404
405// splitstring splits a colon seperated string into an array
406void splitstring (const text_t &string, vector<text_t> &array) {
407 splitchar (string.begin(), string.end(), ':', array);
408}
409
410// get_parents returns the parents array containing all the parents of the
411// document specified by classification and booksection
412void get_parents (const text_t &targetdoc, vector<text_t> &parents)
413{
414 text_t::const_iterator here = targetdoc.begin ();
415 text_t::const_iterator end = targetdoc.end ();
416
417 text_t currentparent;
418 text_t newsuffixpart;
419 text_t newsuffix;
420 bool first = true;
421 while (here != end)
422 {
423 // if there is a newsuffix add it to the current parent
424 // and add that parent to the parents vector
425 if (!newsuffix.empty())
426 {
427 currentparent += newsuffix;
428 parents.push_back (currentparent);
429 }
430
431 // keep getting suffixes until one is found which starts with
432 // a number
433 newsuffix.clear();
434 do
435 {
436 here = getdelimitstr (here, end, '.', newsuffixpart);
437 if (!first) newsuffix.push_back ('.');
438 first = false;
439 newsuffix += newsuffixpart;
440 }
441 while ((here != end) && !newsuffixpart.empty() &&
442 (newsuffixpart[0] < '0' || newsuffixpart[0] > '9'));
443 }
444}
445
446
447// get_siblings returns the siblings array containing all the siblings of the current
448// classification or booksection
449void get_siblings (const text_t &classification, const text_t &booksection,
450 gdbmclass &gdbm, const text_t &collection,
451 vector<text_t> &siblings) {
452
453 gdbm_info info;
454
455 if (booksection.empty() && classification.size() == 1) {
456 // top level classification has no siblings
457 return;
458
459 } else if (booksection.empty()) {
460 // get classification siblings
461 gdbm.getinfo(classification, info);
462 gdbm.getinfo(info.parent, info); // info is now parent info
463 splitstring(info.contents, siblings);
464 return;
465
466 } else {
467 // get book section siblings
468 if (is_top_level(booksection)) {
469 // top level of book so siblings are children of classification
470 gdbm.getinfo(classification, info);
471 splitstring(info.contents, siblings);
472
473 // add classifications to book sections
474 for (unsigned int i = 0; i < siblings.size(); i++) {
475 if (is_book(siblings[i])) siblings[i] = classification + "." + siblings[i];
476 }
477
478 } else {
479 // siblings come from immediate parent
480 gdbm.getinfo(booksection, info);
481 gdbm.getinfo(info.parent, info); // info is now parent info
482 splitstring(info.contents, siblings);
483
484 // add classifications to book sections
485 for (unsigned int i = 0; i < siblings.size(); i++) {
486 if (is_book(siblings[i])) siblings[i] = classification + "." + siblings[i];
487 }
488 }
489 }
490}
491
492// compares section 1 and section 2 and returns 1 if section2 belongs to
493// the same chapter as section1 (i.e. is sibling of or child of or child of sibling)
494int are_same_chapter(text_t section1, text_t section2)
495{
496 get_parent_section(section1);
497
498 while (!section2.empty()) {
499 get_parent_section(section2);
500 if (section2 == section1) return 1;
501 }
502 return 0;
503}
504
505// get_first_section gets the first section from a colon separated
506// list (instring)
507void get_first_section(const text_t &instring, text_t &returnstring) {
508
509 returnstring.clear();
510
511 text_t::const_iterator here = instring.begin();
512 text_t::const_iterator end = instring.end();
513
514 while (here != end) {
515 if (*here == ':') return;
516 returnstring.push_back(*here);
517 here ++;
518 }
519}
520
521
522// removes html tags from string - everything after < will be removed
523// if < occurs without >
524void remove_tags (text_t &text)
525{
526 text_t::const_iterator here = text.begin ();
527 text_t::const_iterator end = text.end ();
528 int found = 0;
529 text_t tmp;
530
531 while (here != end) {
532 if (*here == '<') {found = 1; here ++; continue;}
533 if (*here == '>') {found = 0; here ++; continue;}
534
535 if (!found) tmp.push_back(*here);
536 here ++;
537 }
538 text = tmp;
539}
540
541// checks text to see if it is a number (i.e. contains only 0-9)
542// returns 1 if true, 0 if false
543int is_number (text_t &text) {
544
545 text_t::const_iterator here = text.begin();
546 text_t::const_iterator end = text.end();
547
548 while (here != end) {
549 if ((*here!='0') && (*here!='1') && (*here!='2') &&
550 (*here!='3') && (*here!='4') && (*here!='5') &&
551 (*here!='6') && (*here!='7') && (*here!='8') &&
552 (*here!='9')) return 0;
553 here ++;
554 }
555 return 1;
556}
557
558// functions related to sorting
559
560// returns whatever comes after ':#:' in str
561// -- this is a nasty hack that I'm sure Rodger will want to change ;-)
562text_t get_section_str(const text_t &str) {
563
564 text_t ret;
565 int found = 0;
566
567 text_t::const_iterator here = str.begin();
568 text_t::const_iterator end = str.end();
569
570 while (here != end) {
571 if (found) {
572 ret.push_back(*here);
573 } else {
574 here = findchar (here, end, ':');
575 if ((*(here+1) == '#') && (*(here+2) == ':')) {
576 found = 1;
577 here = here+2;
578 }
579 }
580 here ++;
581 }
582 return ret;
583}
584
585// removes leading spaces and leading 'the' 'a' and 'an'
586// from string
587void alphabetize_string_english (text_t &text) {
588
589 if (text.empty()) return;
590
591 text_t firstword;
592 char *word;
593
594 text_t::iterator here = text.begin();
595 text_t::const_iterator end = text.end();
596
597 if ((*here != ' ') && (*here != 'a') && (*here != 'A') &&
598 (*here != 't') && (*here != 'T')) return;
599
600 int foundchar = 0;
601 while (here != end) {
602 if (*here == ' ' && !foundchar) {here ++; continue;}
603 if (*here == ' ' && foundchar) {
604 text.erase(text.begin(), (here+1));
605 break;
606 }
607 foundchar ++;
608 if (foundchar == 1) {
609 getdelimitstr (here, end, ' ', firstword);
610 word = firstword.getcstr();
611 if ((my_stricmp(word, "the") != 0) && (my_stricmp(word, "a") != 0) &&
612 (my_stricmp(word, "an") != 0)) break;
613 }
614 here ++;
615 }
616 delete word;
617}
618
619// removes leading space, puts last name before
620// any preceeding names
621void alphabetize_string_name (text_t &text) {
622
623 if (text.empty()) return;
624
625 text_t lastname;
626 char *lname;
627 vector<text_t> words;
628 splitchar (text.begin(), text.end(), ' ', words);
629 lastname = words.back();
630 words.pop_back();
631 lname = lastname.getcstr();
632
633 while ((my_stricmp(lname, "jnr") == 0) || (my_stricmp(lname, "snr") == 0) ||
634 (my_stricmp(lname, "esq") == 0)) {
635 lastname = words.back();
636 words.pop_back();
637 lname = lastname.getcstr();
638 }
639
640 text.clear();
641 joinchar (words, ' ', text);
642 text = lastname + text;
643}
644
// appends a copy of str to array, which holds *len strings
// array may be NULL when *len is 0
// returns the (possibly moved) array and adds one to *len
// if str or len is NULL, *len is negative, or memory runs out, the
// original array is returned untouched and *len is not changed
char ** string_add (char **array, int *len, char *str) {
  if ((len == NULL) || (*len < 0) || (str == NULL)) return array;

  // copy the string first so that a failure leaves array as it was
  char *copy = (char*)strdup(str);
  if (copy == NULL) return array;

  char **ret = (char**)realloc(array, (*len+1)*sizeof(char*));
  if (ret == NULL) {
    // realloc leaves the old block alone when it fails
    free (copy);
    return array;
  }

  ret[*len] = copy;
  (*len) ++;

  return ret;
}
654
// comparison function handed to qsort by string_sort
// e1 and e2 each point at one (char *) element of the array being
// sorted; the two strings are compared ignoring case
static int compare_str (const void *e1, const void *e2) {
  const char *s1 = *((const char * const *)e1);
  const char *s2 = *((const char * const *)e2);
#if defined(__WIN32__) || defined(_WIN32)
  return _stricmp(s1, s2);
#else
  return strcasecmp(s1, s2);
#endif
}

// sorts the len strings in array into case-insensitive order
// nothing is done if array is NULL or there are fewer than two strings
void string_sort (char **array, int len) {
  if ((array == NULL) || (len < 2)) return;
  qsort((void*)array, (size_t)len, sizeof(char*), compare_str);
}
666
// frees each of the len strings held in array and then array itself
void string_free (char **array, int len) {
  int remaining = len;
  char **entry = array;
  while (remaining > 0) {
    free (*entry);
    entry ++;
    remaining --;
  }
  free (array);
}
672
673// returns a date of form _dec_ 31, 1999
674// input is date of type 19991231
675// at least the year must be present in date
676text_t format_date (const text_t &date) {
677 text_t::const_iterator here = date.begin();
678 text_t::const_iterator end = date.end();
679
680 text_t year, month, day, dreturn;
681 int i;
682
683 for (i = 0; i < 4 && here != end; i++) {
684 year.push_back(*here);
685 here ++;
686 }
687 if (year.empty()) return "";
688
689 for (i = 0; i < 2 && here != end; i++) {
690 month.push_back(*here);
691 here ++;
692 }
693 for (i = 0; i < 2 && here != end; i++) {
694 day.push_back(*here);
695 here ++;
696 }
697
698 if (!month.empty()) format_month(month);
699
700 if (!day.empty()) format_day(day);
701
702 if (!month.empty()) {
703 dreturn += month + " ";
704 if (!day.empty()) {
705 dreturn += day + ", ";
706 }
707 }
708 dreturn += year;
709 return dreturn;
710}
711
712void format_month (text_t &month) {
713 if (month == "01") month = "_jan_";
714 else if (month == "02") month = "_feb_";
715 else if (month == "03") month = "_mar_";
716 else if (month == "04") month = "_apr_";
717 else if (month == "05") month = "_may_";
718 else if (month == "06") month = "_jun_";
719 else if (month == "07") month = "_jul_";
720 else if (month == "08") month = "_aug_";
721 else if (month == "09") month = "_sep_";
722 else if (month == "10") month = "_oct_";
723 else if (month == "11") month = "_nov_";
724 else if (month == "12") month = "_dec_";
725 else month.clear();
726}
727
728void format_day(text_t &day) {
729 if (day[0] == '0') {
730 char tmp = day[1];
731 day.clear();
732 day.push_back(tmp);
733 }
734}
735
Note: See TracBrowser for help on using the repository browser.