source: trunk/gsdl/src/colservr/gdbmclass.cpp@ 543

Last change on this file since 543 was 534, checked in by sjboddie, 25 years ago

added gpl notice

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 19.0 KB
Line 
1/**********************************************************************
2 *
3 * gdbmclass.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: gdbmclass.cpp 534 1999-09-07 04:57:43Z sjboddie $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.6 1999/09/07 04:57:21 sjboddie
31 added gpl notice
32
33 Revision 1.5 1999/01/25 03:59:40 sjboddie
34 fixed a bug in sorting code
35
36 Revision 1.4 1999/01/21 21:20:08 sjboddie
37 removed unused collection parameter from several functions
38
39 Revision 1.3 1999/01/19 01:38:15 rjmcnab
40
41 Made the source more portable.
42
43 Revision 1.2 1999/01/12 01:51:01 rjmcnab
44
45 Standard header.
46
47 Revision 1.1 1999/01/08 09:02:15 rjmcnab
48
49 Moved from src/library.
50
51 */
52
53
54#include "text_t.h"
55#include "gdbmclass.h"
56#include "unitool.h"
57#include "gsdlunicode.h"
58#include "fileutil.h"
59#include <ctype.h>
60#include <string.h>
61
62#if defined(GSDL_NEED_STRINGS_H)
63#include <strings.h>
64#endif
65
66#if defined(GSDL_USE_OBJECTSPACE)
67# include <ospace\std\algorithm>
68#elif defined(GSDL_USE_STL_H)
69# if defined(GSDL_USE_ALGO_H)
70# include <algo.h>
71# else
72# include <algorithm.h>
73# endif
74#else
75# include <algorithm>
76#endif
77
78
// Case-insensitive ordering of two C strings.
// Returns <0, 0 or >0 in the manner of strcmp(); the platform specific
// routine is selected at compile time.
static int compare_str (const char *e1, const char *e2) {
#ifndef __WIN32__
  const int order = strcasecmp(e1, e2);
#else
  const int order = _stricmp(e1, e2);
#endif
  return order;
}
86
// qsort() comparator: e1 and e2 each point at a (char *) array element.
// The two strings are ordered case-insensitively.
static int compare_str (const void *e1, const void *e2) {
  const char *lhs = *static_cast<const char *const *>(e1);
  const char *rhs = *static_cast<const char *const *>(e2);
#ifndef __WIN32__
  return strcasecmp(lhs, rhs);
#else
  return _stricmp(lhs, rhs);
#endif
}
94
95
96
97void gdbm_info::clear () {
98 docnum = 0; // 'd'
99 title.clear(); // 't'
100 parent.clear(); // 'p'
101 classification.clear(); // 'x'
102 contents.clear(); // 'c'
103 jobnum.clear(); // 'j'
104 OID.clear(); // 'o'
105 author.clear(); // 'a'
106 source.clear(); // 's'
107 date.clear(); // 'i'
108}
109
110
111
112
113
114// returns 0 if failed, 1 if opened
115int gdbmclass::opendatabase (const text_t &filename) {
116 text_t data_location;
117 int block_size = 0;
118
119 if (gdbmfile != NULL) {
120 if (openfile == filename) return 1;
121 else closedatabase ();
122 }
123
124 openfile = filename;
125
126 char *namebuffer = filename.getcstr();
127 gdbmfile = gdbm_open (namebuffer, block_size, GDBM_READER, 00664, NULL);
128 delete namebuffer;
129
130 if (gdbmfile == NULL && logout != NULL) {
131 outconvertclass text_t2ascii;
132 (*logout) << text_t2ascii << "database open failed on: " << filename << "\n";
133 }
134
135 return (gdbmfile != NULL);
136}
137
138
139void gdbmclass::closedatabase () {
140 if (gdbmfile == NULL) return;
141
142 gdbm_close (gdbmfile);
143 gdbmfile = NULL;
144 openfile.clear();
145}
146
147
148// returns 0 on success, -1 on failure
149// key and collection aren't references as they might be aliases to
150// something in info
151int gdbmclass::getinfo (text_t key, gdbm_info &info) {
152 text_t data;
153
154 if (!getkeydata (key, data)) return -1;
155 text_t::iterator here = data.begin ();
156 text_t::iterator end = data.end ();
157
158 text_t ikey, ivalue;
159 info.clear (); // reset info
160
161 while (getinfoline(here, end, ikey, ivalue)) {
162 if (ikey == "d") { info.docnum = ivalue.getint(); }
163 else if (ikey == "t") { info.title = ivalue; }
164 else if (ikey == "p") { info.parent = ivalue; }
165 else if (ikey == "x") { info.classification = ivalue; }
166 else if (ikey == "c") { info.contents = ivalue; }
167 else if (ikey == "j") { info.jobnum = ivalue; }
168 else if (ikey == "o") { info.OID = ivalue; }
169 else if (ikey == "a") { info.author = ivalue; }
170 else if (ikey == "s") { info.source = ivalue; }
171 else if (ikey == "i") { info.date = ivalue; }
172 }
173
174 return 0;
175}
176
177
178// returns 1 if the key exists
179int gdbmclass::exists (text_t key) {
180 text_t data;
181 return getkeydata (key, data);
182}
183
184
185// returns 1 if successful
186int gdbmclass::getkeydata (text_t key, text_t &data) {
187 datum key_data;
188 datum return_data;
189
190 if (gdbmfile == NULL) return 0;
191
192 // get a utf-8 encoded c string of the unicode key
193 key_data.dptr = (to_utf8(key)).getcstr();
194 if (key_data.dptr == NULL) {
195 if (logout != NULL) (*logout) << "gdbmclass: out of memory\n";
196 return 0;
197 }
198 key_data.dsize = strlen (key_data.dptr);
199
200 // fetch the result
201 return_data = gdbm_fetch (gdbmfile, key_data);
202 delete key_data.dptr;
203
204 if (return_data.dptr == NULL) return 0;
205
206 data.setcarr (return_data.dptr, return_data.dsize);
207 free (return_data.dptr);
208 data = to_uni(data); // convert to unicode
209
210 return 1;
211}
212
213
214// parses a line of the form <key>value\n
215// returns 1 if successful
216int gdbmclass::getinfoline (text_t::iterator &here, text_t::iterator end,
217 text_t &key, text_t &value) {
218 key.clear();
219 value.clear();
220
221 // ignore white space
222 while (here != end && is_unicode_space (*here)) here++;
223
224 // get the '<'
225 if (here == end || *here != '<') return 0;
226 here++;
227
228 // get the key
229 while (here != end && *here != '>') {
230 key.push_back(*here);
231 here++;
232 }
233
234 // get the '>'
235 if (here == end || *here != '>') return 0;
236 here++;
237
238 // get the value
239 while (here != end && *here != '\n') {
240 value.push_back(*here);
241 here++;
242 }
243
244 return 1;
245}
246
247
248
249
250
251// a few useful functions
252
253//////////////////////////////////////////////////////////////////////////////////////////
254// functions for testing classification strings
255
256
257// returns 1 if targetdoc is top level of a book (i.e. =~ /B\.\d+$/) - otherwise 0;
258int is_top_level (const text_t &targetdoc) {
259
260 text_t::const_iterator here = targetdoc.begin();
261 text_t::const_iterator end = targetdoc.end();
262
263 // look for the 'B'
264 here = findchar (here, end, 'B');
265
266 // there must be exactly one dot after the 'B'
267 if ((here != end) && (countchar (here, end, '.') == 1))
268 return 1;
269
270 return 0;
271}
272
273// returns 1 if targetdoc is any level of a book (i.e. contains 'B') - otherwise 0
274int is_book (const text_t &targetdoc) {
275
276 text_t::const_iterator here = targetdoc.begin();
277 text_t::const_iterator end = targetdoc.end();
278
279 here = findchar (here, end, 'B');
280 if (here != end) return 1;
281 return 0;
282}
283
284// returns (in book_top) the top level of the book in targetdoc
285void get_book_top (const text_t &targetdoc, text_t &book_top) {
286
287 text_t::const_iterator here = targetdoc.begin();
288 text_t::const_iterator end = targetdoc.end();
289
290 book_top.clear();
291
292 // look for the 'B'
293 here = findchar (here, end, 'B');
294
295 // copy up to the second '.'
296 int founddot = 0;
297 while (here != end) {
298 if (*here == '.') {
299 if (founddot) return;
300 founddot = 1;
301 }
302 book_top.push_back(*here);
303 here++;
304 }
305}
306
307// returns (in book) the book section part of the classification
308// contained in targetdoc
309void get_book (const text_t &targetdoc, text_t &book) {
310
311 text_t::const_iterator here = targetdoc.begin();
312 text_t::const_iterator end = targetdoc.end();
313
314 book.clear ();
315
316 // look for the 'B'
317 here = findchar (here, end, 'B');
318
319 // copy the rest of the string
320 while (here != end) {
321 book.push_back(*here);
322 here ++;
323 }
324}
325
326// get_parent_section removes the last part from section (i.e.=~ s/\.\d+$//)
327void get_parent_section (text_t &section) {
328 int founddot = 0;
329 text_t::iterator end;
330 while (!founddot && !section.empty()) {
331 end = section.end();
332 end --;
333 if (*end == '.') founddot = 1;
334 section.pop_back();
335 }
336}
337
338// same as above but also returns ths child section that's removed
339void get_parent_section (text_t &parentsection, text_t &childsection) {
340 int founddot = 0;
341 text_t tmp;
342 childsection.clear();
343 text_t::iterator end;
344 while (!founddot && !parentsection.empty()) {
345 end = parentsection.end();
346 end --;
347 if (*end == '.') founddot = 1;
348 else tmp.push_back(*end); childsection = tmp + childsection; tmp.clear();
349 parentsection.pop_back();
350 }
351}
352
353// count_dots returns the number of dots ('.') there are
354// in a range of a targetdoc string
355int count_dots(text_t::const_iterator first, text_t::const_iterator last) {
356 return countchar (first, last, '.');
357}
358
359int count_dots (const text_t &targetdoc) {
360 return count_dots(targetdoc.begin(), targetdoc.end());
361}
362
363// returns 1 if targetdoc is a first level descendant
364// (i.e. B.n.1, B.n.1.1, B.n.1.1.1 etc.) - otherwise 0
365int is_section_top(const text_t &targetdoc) {
366 text_t::const_iterator here = targetdoc.begin();
367 text_t::const_iterator end = targetdoc.end();
368
369 // look for the 'B'
370 here = findchar (here, end, 'B');
371 here = findchar (here, end, '.');
372 if (here != end) here++; // skip over the '.'
373 here = findchar (here, end, '.');
374
375 // make sure that all '.' are followed by a '1'
376 while (here != end) {
377 if (*here != '.') return 0;
378 here ++;
379
380 if (here != end) {
381 if (*here != '1') return 0;
382 here ++;
383 }
384 }
385 return 1;
386}
387
388// seperate_parts seperates targetdoc into its classification and booksection
389// if classification isn't supplied it gets the first classification for the
390// book from the gdbm
391// if booksection doesn't exist it remains blank
392void separate_parts(const text_t &targetdoc, gdbmclass &gdbm,
393 text_t &classification, text_t &booksection) {
394
395 split_targetdoc (targetdoc, classification, booksection);
396
397 if (classification.empty()) {
398 // no classification included so get first one for this book
399 gdbm_info info;
400 text_t book_top;
401 vector<text_t> classarray;
402 get_book_top (targetdoc, book_top);
403 gdbm.getinfo(book_top, info);
404 splitstring (info.classification, classarray);
405 if (!classarray.empty()) classification = classarray[0];
406 else classification = "C.1";
407 }
408}
409
410// split_targetdoc splits up a string containing a classification
411// and book (or one or the other)
412void split_targetdoc(const text_t &targetdoc, text_t &classification,
413 text_t &booksection) {
414
415 classification.clear ();
416 booksection.clear();
417
418 text_t::const_iterator here = targetdoc.begin();
419 text_t::const_iterator end = targetdoc.end();
420
421 // copy everything up to the first 'B'
422 while (here != end) {
423 if (*here == 'B') break;
424 classification.push_back(*here);
425 here++;
426 }
427
428 // remove middle '.'
429 if (!classification.empty() &&
430 classification[classification.size()-1] == '.')
431 classification.pop_back();
432
433 // copy the rest of the string
434 while (here != end) {
435 booksection.push_back(*here);
436 here++;
437 }
438}
439
440// splitstring splits a colon seperated string into an array
441void splitstring (const text_t &string, vector<text_t> &array) {
442 splitchar (string.begin(), string.end(), ':', array);
443}
444
445// get_parents returns the parents array containing all the parents of the
446// document specified by classification and booksection
447void get_parents (const text_t &targetdoc, vector<text_t> &parents)
448{
449 text_t::const_iterator here = targetdoc.begin ();
450 text_t::const_iterator end = targetdoc.end ();
451
452 text_t currentparent;
453 text_t newsuffixpart;
454 text_t newsuffix;
455 bool first = true;
456 while (here != end)
457 {
458 // if there is a newsuffix add it to the current parent
459 // and add that parent to the parents vector
460 if (!newsuffix.empty())
461 {
462 currentparent += newsuffix;
463 parents.push_back (currentparent);
464 }
465
466 // keep getting suffixes until one is found which starts with
467 // a number
468 newsuffix.clear();
469 do
470 {
471 here = getdelimitstr (here, end, '.', newsuffixpart);
472 if (!first) newsuffix.push_back ('.');
473 first = false;
474 newsuffix += newsuffixpart;
475 }
476 while ((here != end) && !newsuffixpart.empty() &&
477 (newsuffixpart[0] < '0' || newsuffixpart[0] > '9'));
478 }
479}
480
481
482// get_siblings returns the siblings array containing all the siblings of the current
483// classification or booksection
484void get_siblings (const text_t &classification, const text_t &booksection,
485 gdbmclass &gdbm, vector<text_t> &siblings) {
486
487 gdbm_info info;
488
489 if (booksection.empty() && classification.size() == 1) {
490 // top level classification has no siblings
491 return;
492
493 } else if (booksection.empty()) {
494 // get classification siblings
495 gdbm.getinfo(classification, info);
496 gdbm.getinfo(info.parent, info); // info is now parent info
497 splitstring(info.contents, siblings);
498 return;
499
500 } else {
501 // get book section siblings
502 if (is_top_level(booksection)) {
503 // top level of book so siblings are children of classification
504 gdbm.getinfo(classification, info);
505 splitstring(info.contents, siblings);
506
507 // add classifications to book sections
508 for (unsigned int i = 0; i < siblings.size(); i++) {
509 if (is_book(siblings[i])) siblings[i] = classification + "." + siblings[i];
510 }
511
512 } else {
513 // siblings come from immediate parent
514 gdbm.getinfo(booksection, info);
515 gdbm.getinfo(info.parent, info); // info is now parent info
516 splitstring(info.contents, siblings);
517
518 // add classifications to book sections
519 for (unsigned int i = 0; i < siblings.size(); i++) {
520 if (is_book(siblings[i])) siblings[i] = classification + "." + siblings[i];
521 }
522 }
523 }
524}
525
526// compares section 1 and section 2 and returns 1 if section2 belongs to
527// the same chapter as section1 (i.e. is sibling of or child of or child of sibling)
528int are_same_chapter(text_t section1, text_t section2)
529{
530 get_parent_section(section1);
531
532 while (!section2.empty()) {
533 get_parent_section(section2);
534 if (section2 == section1) return 1;
535 }
536 return 0;
537}
538
539// get_first_section gets the first section from a colon separated
540// list (instring)
541void get_first_section(const text_t &instring, text_t &returnstring) {
542
543 returnstring.clear();
544
545 text_t::const_iterator here = instring.begin();
546 text_t::const_iterator end = instring.end();
547
548 while (here != end) {
549 if (*here == ':') return;
550 returnstring.push_back(*here);
551 here ++;
552 }
553}
554
555
556// removes html tags from string - everything after < will be removed
557// if < occurs without >
558void remove_tags (text_t &text)
559{
560 text_t::const_iterator here = text.begin ();
561 text_t::const_iterator end = text.end ();
562 int found = 0;
563 text_t tmp;
564
565 while (here != end) {
566 if (*here == '<') {found = 1; here ++; continue;}
567 if (*here == '>') {found = 0; here ++; continue;}
568
569 if (!found) tmp.push_back(*here);
570 here ++;
571 }
572 text = tmp;
573}
574
575// checks text to see if it is a number (i.e. contains only 0-9)
576// returns 1 if true, 0 if false
577int is_number (text_t &text) {
578
579 text_t::const_iterator here = text.begin();
580 text_t::const_iterator end = text.end();
581
582 while (here != end) {
583 if ((*here!='0') && (*here!='1') && (*here!='2') &&
584 (*here!='3') && (*here!='4') && (*here!='5') &&
585 (*here!='6') && (*here!='7') && (*here!='8') &&
586 (*here!='9')) return 0;
587 here ++;
588 }
589 return 1;
590}
591
592// functions related to sorting
593
594// returns whatever comes after ':#:' in str
595// -- this is a nasty hack that I'm sure Rodger will want to change ;-)
596text_t get_section_str(const text_t &str) {
597
598 text_t ret;
599 int found = 0;
600
601 text_t::const_iterator here = str.begin();
602 text_t::const_iterator end = str.end();
603
604 while (here != end) {
605 if (found) {
606 ret.push_back(*here);
607 } else {
608 here = findchar (here, end, ':');
609 if ((*(here+1) == '#') && (*(here+2) == ':')) {
610 found = 1;
611 here = here+2;
612 }
613 }
614 here ++;
615 }
616 return ret;
617}
618
619// removes leading spaces and leading 'the' 'a' and 'an'
620// from string
621void alphabetize_string_english (text_t &text) {
622
623 if (text.empty()) return;
624
625 text_t firstword;
626 char *word;
627
628 text_t::iterator here = text.begin();
629 text_t::const_iterator end = text.end();
630
631 if ((*here != ' ') && (*here != 'a') && (*here != 'A') &&
632 (*here != 't') && (*here != 'T')) return;
633
634 int foundchar = 0;
635 while (here != end) {
636 if (*here == ' ' && !foundchar) {here ++; continue;}
637 if (*here == ' ' && foundchar) {
638 text.erase(text.begin(), (here+1));
639 break;
640 }
641 foundchar ++;
642 if (foundchar == 1) {
643 getdelimitstr (here, end, ' ', firstword);
644 word = firstword.getcstr();
645 if ((compare_str(word, "the") != 0) && (compare_str(word, "a") != 0) &&
646 (compare_str(word, "an") != 0)) break;
647 }
648 here ++;
649 }
650 delete word;
651}
652
653// removes leading space, puts last name before
654// any preceeding names
655void alphabetize_string_name (text_t &text) {
656
657 if (text.empty()) return;
658
659 text_t lastname;
660 char *lname;
661 vector<text_t> words;
662 splitchar (text.begin(), text.end(), ' ', words);
663 lastname = words.back();
664 words.pop_back();
665 lname = lastname.getcstr();
666
667 while ((compare_str(lname, "jnr") == 0) || (compare_str(lname, "snr") == 0) ||
668 (compare_str(lname, "esq") == 0)) {
669 lastname = words.back();
670 words.pop_back();
671 lname = lastname.getcstr();
672 }
673
674 text.clear();
675 joinchar (words, ' ', text);
676 text = lastname + text;
677}
678
// Appends a copy of str to a malloc'ed array of C strings.
//   array - the current array (may be NULL when *len is 0)
//   len   - number of entries in array; incremented when str is added
//   str   - the string to copy (made with strdup, so free() releases it)
// Returns the array to use from now on, which may have moved.
// If str is NULL or memory runs out, nothing is added and *len is left
// unchanged; the returned array is still valid and holds the old entries.
char ** string_add (char **array, int *len, char *str) {
  if (str == NULL) return array;

  char **grown = (char**)realloc(array, (*len+1)*sizeof(char*));
  // a failed realloc leaves the old block untouched, so hand that back
  if (grown == NULL) return array;

  char *copy = (char*)strdup(str);
  // without a copy there is nothing to store; the larger block is kept
  if (copy == NULL) return grown;

  grown[*len] = copy;
  (*len) ++;

  return grown;
}
688
689void string_sort (char **array, int len) {
690 qsort((void*)array, (unsigned int)(len), sizeof(char*), compare_str);
691}
692
// Releases the first len strings of array with free(), then array itself.
void string_free (char **array, int len) {
  char **entry = array;
  int remaining = len;

  while (remaining-- > 0) {
    free (*entry);
    ++entry;
  }
  free (array);
}
698
699// returns a date of form _dec_ 31, 1999
700// input is date of type 19991231
701// at least the year must be present in date
702text_t format_date (const text_t &date) {
703 text_t::const_iterator here = date.begin();
704 text_t::const_iterator end = date.end();
705
706 text_t year, month, day, dreturn;
707 int i;
708
709 for (i = 0; i < 4 && here != end; i++) {
710 year.push_back(*here);
711 here ++;
712 }
713 if (year.empty()) return "";
714
715 for (i = 0; i < 2 && here != end; i++) {
716 month.push_back(*here);
717 here ++;
718 }
719 for (i = 0; i < 2 && here != end; i++) {
720 day.push_back(*here);
721 here ++;
722 }
723
724 if (!month.empty()) format_month(month);
725
726 if (!day.empty()) format_day(day);
727
728 if (!month.empty()) {
729 dreturn += month + " ";
730 if (!day.empty()) {
731 dreturn += day + ", ";
732 }
733 }
734 dreturn += year;
735 return dreturn;
736}
737
738void format_month (text_t &month) {
739 if (month == "01") month = "_jan_";
740 else if (month == "02") month = "_feb_";
741 else if (month == "03") month = "_mar_";
742 else if (month == "04") month = "_apr_";
743 else if (month == "05") month = "_may_";
744 else if (month == "06") month = "_jun_";
745 else if (month == "07") month = "_jul_";
746 else if (month == "08") month = "_aug_";
747 else if (month == "09") month = "_sep_";
748 else if (month == "10") month = "_oct_";
749 else if (month == "11") month = "_nov_";
750 else if (month == "12") month = "_dec_";
751 else month.clear();
752}
753
754void format_day(text_t &day) {
755 if (day[0] == '0') {
756 char tmp = day[1];
757 day.clear();
758 day.push_back(tmp);
759 }
760}
761
Note: See TracBrowser for help on using the repository browser.