Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/src/colservr/gdbmclass.cpp@ 110

Last change on this file since 110 was 110, checked in by rjmcnab, 25 years ago
Moved from src/library.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 18.1 KB

Line
1	/**********************************************************************
2	*
3	* gdbmclass.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* PUT COPYRIGHT NOTICE HERE
7	*
8	* $Id: gdbmclass.cpp 110 1999-01-08 09:02:22Z rjmcnab $
9	*
10	*********************************************************************/
11
12	/*
13	$Log$
14	Revision 1.1 1999/01/08 09:02:15 rjmcnab
15
16	Moved from src/library.
17
18	*/
19
// Revision identification string for this file.  It points at a string
// literal, so the pointee is declared const.
static const char *RCSID = "$Id: gdbmclass.cpp 110 1999-01-08 09:02:22Z rjmcnab $";
21
22
23	#include "text_t.h"
24	#include "gdbmclass.h"
25	#include "unitool.h"
26	#include "gsdlunicode.h"
27	#include "fileutil.h"
28	#include <ctype.h>
29	#include <string.h>
30
31	#ifndef USE_OBJECTSPACE
32	# include <algorithm>
33	#else
34	# include <ospace\std\algorithm>
35	#endif
36
37
// Case-insensitive comparison of two C strings.
//
// returns  0 if the strings are equal when case is ignored
//         -1 if str1 sorts before str2 (a NULL pointer sorts before
//            everything else, a prefix sorts before the longer string)
//          1 otherwise
static int my_stricmp (const char *str1, const char *str2) {
  if ((str1 == NULL) || (str2 == NULL)) {
    if (str1 == str2) return 0;          // both NULL
    return (str1 == NULL) ? -1 : 1;
  }

  // Both characters are fetched on every step; a short-circuiting test
  // in the loop condition would leave the second one stale.
  for (;; ++str1, ++str2) {
    // tolower is only defined for unsigned char values (and EOF)
    const int c1 = tolower (static_cast<unsigned char>(*str1));
    const int c2 = tolower (static_cast<unsigned char>(*str2));

    if (c1 < c2) return -1;
    if (c1 > c2) return 1;
    if (c1 == '\0') return 0;            // reached the end of both strings
  }
}
60
61
62	void gdbm_info::clear () {
63	docnum = 0; // 'd'
64	title.clear(); // 't'
65	parent.clear(); // 'p'
66	classification.clear(); // 'x'
67	contents.clear(); // 'c'
68	jobnum.clear(); // 'j'
69	OID.clear(); // 'o'
70	author.clear(); // 'a'
71	source.clear(); // 's'
72	date.clear(); // 'i'
73	}
74
75
76
77
78
79	// returns 0 if failed, 1 if opened
80	int gdbmclass::opendatabase (const text_t &filename) {
81	text_t data_location;
82	int block_size = 0;
83
84	if (gdbmfile != NULL) {
85	if (openfile == filename) return 1;
86	else closedatabase ();
87	}
88
89	openfile = filename;
90
91	char *namebuffer = filename.getcstr();
92	gdbmfile = gdbm_open (namebuffer, block_size, GDBM_READER, 00664, NULL);
93	delete namebuffer;
94
95	if (gdbmfile == NULL && logout != NULL) {
96	outconvertclass text_t2ascii;
97	(*logout) << text_t2ascii << "database open failed on: " << filename << "\n";
98	}
99
100	return (gdbmfile != NULL);
101	}
102
103
104	void gdbmclass::closedatabase () {
105	if (gdbmfile == NULL) return;
106
107	gdbm_close (gdbmfile);
108	gdbmfile = NULL;
109	openfile.clear();
110	}
111
112
113	// returns 0 on success, -1 on failure
114	// key and collection aren't references as they might be aliases to
115	// something in info
116	int gdbmclass::getinfo (text_t key, gdbm_info &info) {
117	text_t data;
118
119	if (!getkeydata (key, data)) return -1;
120	text_t::iterator here = data.begin ();
121	text_t::iterator end = data.end ();
122
123	text_t ikey, ivalue;
124	info.clear (); // reset info
125
126	while (getinfoline(here, end, ikey, ivalue)) {
127	if (ikey == "d") { info.docnum = ivalue.getint(); }
128	else if (ikey == "t") { info.title = ivalue; }
129	else if (ikey == "p") { info.parent = ivalue; }
130	else if (ikey == "x") { info.classification = ivalue; }
131	else if (ikey == "c") { info.contents = ivalue; }
132	else if (ikey == "j") { info.jobnum = ivalue; }
133	else if (ikey == "o") { info.OID = ivalue; }
134	else if (ikey == "a") { info.author = ivalue; }
135	else if (ikey == "s") { info.source = ivalue; }
136	else if (ikey == "i") { info.date = ivalue; }
137	}
138
139	return 0;
140	}
141
142
143	// returns 1 if the key exists
144	int gdbmclass::exists (text_t key) {
145	text_t data;
146	return getkeydata (key, data);
147	}
148
149
150	// returns 1 if successful
151	int gdbmclass::getkeydata (text_t key, text_t &data) {
152	datum key_data;
153	datum return_data;
154
155	if (gdbmfile == NULL) return 0;
156
157	// get a utf-8 encoded c string of the unicode key
158	key_data.dptr = (to_utf8(key)).getcstr();
159	if (key_data.dptr == NULL) {
160	if (logout != NULL) (*logout) << "gdbmclass: out of memory\n";
161	return 0;
162	}
163	key_data.dsize = strlen (key_data.dptr);
164
165	// fetch the result
166	return_data = gdbm_fetch (gdbmfile, key_data);
167	delete key_data.dptr;
168
169	if (return_data.dptr == NULL) return 0;
170
171	data.setcarr (return_data.dptr, return_data.dsize);
172	free (return_data.dptr);
173	data = to_uni(data); // convert to unicode
174
175	return 1;
176	}
177
178
179	// parses a line of the form <key>value\n
180	// returns 1 if successful
181	int gdbmclass::getinfoline (text_t::iterator &here, text_t::iterator end,
182	text_t &key, text_t &value) {
183	key.clear();
184	value.clear();
185
186	// ignore white space
187	while (here != end && is_unicode_space (*here)) here++;
188
189	// get the '<'
190	if (here == end \|\| *here != '<') return 0;
191	here++;
192
193	// get the key
194	while (here != end && *here != '>') {
195	key.push_back(*here);
196	here++;
197	}
198
199	// get the '>'
200	if (here == end \|\| *here != '>') return 0;
201	here++;
202
203	// get the value
204	while (here != end && *here != '\n') {
205	value.push_back(*here);
206	here++;
207	}
208
209	return 1;
210	}
211
212
213
214
215
216	// a few useful functions
217
218	//////////////////////////////////////////////////////////////////////////////////////////
219	// functions for testing classification strings
220
221
222	// returns 1 if targetdoc is top level of a book (i.e. =~ /B\.\d+$/) - otherwise 0;
223	int is_top_level (const text_t &targetdoc) {
224
225	text_t::const_iterator here = targetdoc.begin();
226	text_t::const_iterator end = targetdoc.end();
227
228	// look for the 'B'
229	here = findchar (here, end, 'B');
230
231	// there must be exactly one dot after the 'B'
232	if ((here != end) && (countchar (here, end, '.') == 1))
233	return 1;
234
235	return 0;
236	}
237
238	// returns 1 if targetdoc is any level of a book (i.e. contains 'B') - otherwise 0
239	int is_book (const text_t &targetdoc) {
240
241	text_t::const_iterator here = targetdoc.begin();
242	text_t::const_iterator end = targetdoc.end();
243
244	here = findchar (here, end, 'B');
245	if (here != end) return 1;
246	return 0;
247	}
248
249	// returns (in book_top) the top level of the book in targetdoc
250	void get_book_top (const text_t &targetdoc, text_t &book_top) {
251
252	text_t::const_iterator here = targetdoc.begin();
253	text_t::const_iterator end = targetdoc.end();
254
255	book_top.clear();
256
257	// look for the 'B'
258	here = findchar (here, end, 'B');
259
260	// copy up to the second '.'
261	int founddot = 0;
262	while (here != end) {
263	if (*here == '.') {
264	if (founddot) return;
265	founddot = 1;
266	}
267	book_top.push_back(*here);
268	here++;
269	}
270	}
271
272	// returns (in book) the book section part of the classification
273	// contained in targetdoc
274	void get_book (const text_t &targetdoc, text_t &book) {
275
276	text_t::const_iterator here = targetdoc.begin();
277	text_t::const_iterator end = targetdoc.end();
278
279	book.clear ();
280
281	// look for the 'B'
282	here = findchar (here, end, 'B');
283
284	// copy the rest of the string
285	while (here != end) {
286	book.push_back(*here);
287	here ++;
288	}
289	}
290
291	// get_parent_section removes the last part from section (i.e.=~ s/\.\d+$//)
292	void get_parent_section (text_t &section) {
293	int founddot = 0;
294	text_t::iterator end;
295	while (!founddot && !section.empty()) {
296	end = section.end();
297	end --;
298	if (*end == '.') founddot = 1;
299	section.pop_back();
300	}
301	}
302
303	// same as above but also returns ths child section that's removed
304	void get_parent_section (text_t &parentsection, text_t &childsection) {
305	int founddot = 0;
306	text_t tmp;
307	childsection.clear();
308	text_t::iterator end;
309	while (!founddot && !parentsection.empty()) {
310	end = parentsection.end();
311	end --;
312	if (*end == '.') founddot = 1;
313	else tmp.push_back(*end); childsection = tmp + childsection; tmp.clear();
314	parentsection.pop_back();
315	}
316	}
317
318	// count_dots returns the number of dots ('.') there are
319	// in a range of a targetdoc string
320	int count_dots(text_t::const_iterator first, text_t::const_iterator last) {
321	return countchar (first, last, '.');
322	}
323
324	int count_dots (const text_t &targetdoc) {
325	return count_dots(targetdoc.begin(), targetdoc.end());
326	}
327
328	// returns 1 if targetdoc is a first level descendant
329	// (i.e. B.n.1, B.n.1.1, B.n.1.1.1 etc.) - otherwise 0
330	int is_section_top(const text_t &targetdoc) {
331	text_t::const_iterator here = targetdoc.begin();
332	text_t::const_iterator end = targetdoc.end();
333
334	// look for the 'B'
335	here = findchar (here, end, 'B');
336	here = findchar (here, end, '.');
337	if (here != end) here++; // skip over the '.'
338	here = findchar (here, end, '.');
339
340	// make sure that all '.' are followed by a '1'
341	while (here != end) {
342	if (*here != '.') return 0;
343	here ++;
344
345	if (here != end) {
346	if (*here != '1') return 0;
347	here ++;
348	}
349	}
350	return 1;
351	}
352
353	// seperate_parts seperates targetdoc into its classification and booksection
354	// if classification isn't supplied it gets the first classification for the
355	// book from the gdbm
356	// if booksection doesn't exist it remains blank
357	void seperate_parts(const text_t &targetdoc, gdbmclass &gdbm, const text_t &collection,
358	text_t &classification, text_t &booksection) {
359
360	split_targetdoc (targetdoc, classification, booksection);
361
362	if (classification.empty()) {
363	// no classification included so get first one for this book
364	gdbm_info info;
365	text_t book_top;
366	vector<text_t> classarray;
367	get_book_top (targetdoc, book_top);
368	gdbm.getinfo(book_top, info);
369	splitstring (info.classification, classarray);
370	if (!classarray.empty()) classification = classarray[0];
371	else classification = "C.1";
372	}
373	}
374
375	// split_targetdoc splits up a string containing a classification
376	// and book (or one or the other)
377	void split_targetdoc(const text_t &targetdoc, text_t &classification,
378	text_t &booksection) {
379
380	classification.clear ();
381	booksection.clear();
382
383	text_t::const_iterator here = targetdoc.begin();
384	text_t::const_iterator end = targetdoc.end();
385
386	// copy everything up to the first 'B'
387	while (here != end) {
388	if (*here == 'B') break;
389	classification.push_back(*here);
390	here++;
391	}
392
393	// remove middle '.'
394	if (!classification.empty() &&
395	classification[classification.size()-1] == '.')
396	classification.pop_back();
397
398	// copy the rest of the string
399	while (here != end) {
400	booksection.push_back(*here);
401	here++;
402	}
403	}
404
405	// splitstring splits a colon seperated string into an array
406	void splitstring (const text_t &string, vector<text_t> &array) {
407	splitchar (string.begin(), string.end(), ':', array);
408	}
409
410	// get_parents returns the parents array containing all the parents of the
411	// document specified by classification and booksection
412	void get_parents (const text_t &targetdoc, vector<text_t> &parents)
413	{
414	text_t::const_iterator here = targetdoc.begin ();
415	text_t::const_iterator end = targetdoc.end ();
416
417	text_t currentparent;
418	text_t newsuffixpart;
419	text_t newsuffix;
420	bool first = true;
421	while (here != end)
422	{
423	// if there is a newsuffix add it to the current parent
424	// and add that parent to the parents vector
425	if (!newsuffix.empty())
426	{
427	currentparent += newsuffix;
428	parents.push_back (currentparent);
429	}
430
431	// keep getting suffixes until one is found which starts with
432	// a number
433	newsuffix.clear();
434	do
435	{
436	here = getdelimitstr (here, end, '.', newsuffixpart);
437	if (!first) newsuffix.push_back ('.');
438	first = false;
439	newsuffix += newsuffixpart;
440	}
441	while ((here != end) && !newsuffixpart.empty() &&
442	(newsuffixpart[0] < '0' \|\| newsuffixpart[0] > '9'));
443	}
444	}
445
446
447	// get_siblings returns the siblings array containing all the siblings of the current
448	// classification or booksection
449	void get_siblings (const text_t &classification, const text_t &booksection,
450	gdbmclass &gdbm, const text_t &collection,
451	vector<text_t> &siblings) {
452
453	gdbm_info info;
454
455	if (booksection.empty() && classification.size() == 1) {
456	// top level classification has no siblings
457	return;
458
459	} else if (booksection.empty()) {
460	// get classification siblings
461	gdbm.getinfo(classification, info);
462	gdbm.getinfo(info.parent, info); // info is now parent info
463	splitstring(info.contents, siblings);
464	return;
465
466	} else {
467	// get book section siblings
468	if (is_top_level(booksection)) {
469	// top level of book so siblings are children of classification
470	gdbm.getinfo(classification, info);
471	splitstring(info.contents, siblings);
472
473	// add classifications to book sections
474	for (unsigned int i = 0; i < siblings.size(); i++) {
475	if (is_book(siblings[i])) siblings[i] = classification + "." + siblings[i];
476	}
477
478	} else {
479	// siblings come from immediate parent
480	gdbm.getinfo(booksection, info);
481	gdbm.getinfo(info.parent, info); // info is now parent info
482	splitstring(info.contents, siblings);
483
484	// add classifications to book sections
485	for (unsigned int i = 0; i < siblings.size(); i++) {
486	if (is_book(siblings[i])) siblings[i] = classification + "." + siblings[i];
487	}
488	}
489	}
490	}
491
492	// compares section 1 and section 2 and returns 1 if section2 belongs to
493	// the same chapter as section1 (i.e. is sibling of or child of or child of sibling)
494	int are_same_chapter(text_t section1, text_t section2)
495	{
496	get_parent_section(section1);
497
498	while (!section2.empty()) {
499	get_parent_section(section2);
500	if (section2 == section1) return 1;
501	}
502	return 0;
503	}
504
505	// get_first_section gets the first section from a colon separated
506	// list (instring)
507	void get_first_section(const text_t &instring, text_t &returnstring) {
508
509	returnstring.clear();
510
511	text_t::const_iterator here = instring.begin();
512	text_t::const_iterator end = instring.end();
513
514	while (here != end) {
515	if (*here == ':') return;
516	returnstring.push_back(*here);
517	here ++;
518	}
519	}
520
521
522	// removes html tags from string - everything after < will be removed
523	// if < occurs without >
524	void remove_tags (text_t &text)
525	{
526	text_t::const_iterator here = text.begin ();
527	text_t::const_iterator end = text.end ();
528	int found = 0;
529	text_t tmp;
530
531	while (here != end) {
532	if (*here == '<') {found = 1; here ++; continue;}
533	if (*here == '>') {found = 0; here ++; continue;}
534
535	if (!found) tmp.push_back(*here);
536	here ++;
537	}
538	text = tmp;
539	}
540
541	// checks text to see if it is a number (i.e. contains only 0-9)
542	// returns 1 if true, 0 if false
543	int is_number (text_t &text) {
544
545	text_t::const_iterator here = text.begin();
546	text_t::const_iterator end = text.end();
547
548	while (here != end) {
549	if ((here!='0') && (here!='1') && (*here!='2') &&
550	(here!='3') && (here!='4') && (*here!='5') &&
551	(here!='6') && (here!='7') && (*here!='8') &&
552	(*here!='9')) return 0;
553	here ++;
554	}
555	return 1;
556	}
557
558	// functions related to sorting
559
560	// returns whatever comes after ':#:' in str
561	// -- this is a nasty hack that I'm sure Rodger will want to change ;-)
562	text_t get_section_str(const text_t &str) {
563
564	text_t ret;
565	int found = 0;
566
567	text_t::const_iterator here = str.begin();
568	text_t::const_iterator end = str.end();
569
570	while (here != end) {
571	if (found) {
572	ret.push_back(*here);
573	} else {
574	here = findchar (here, end, ':');
575	if (((here+1) == '#') && ((here+2) == ':')) {
576	found = 1;
577	here = here+2;
578	}
579	}
580	here ++;
581	}
582	return ret;
583	}
584
585	// removes leading spaces and leading 'the' 'a' and 'an'
586	// from string
587	void alphabetize_string_english (text_t &text) {
588
589	if (text.empty()) return;
590
591	text_t firstword;
592	char *word;
593
594	text_t::iterator here = text.begin();
595	text_t::const_iterator end = text.end();
596
597	if ((here != ' ') && (here != 'a') && (*here != 'A') &&
598	(here != 't') && (here != 'T')) return;
599
600	int foundchar = 0;
601	while (here != end) {
602	if (*here == ' ' && !foundchar) {here ++; continue;}
603	if (*here == ' ' && foundchar) {
604	text.erase(text.begin(), (here+1));
605	break;
606	}
607	foundchar ++;
608	if (foundchar == 1) {
609	getdelimitstr (here, end, ' ', firstword);
610	word = firstword.getcstr();
611	if ((my_stricmp(word, "the") != 0) && (my_stricmp(word, "a") != 0) &&
612	(my_stricmp(word, "an") != 0)) break;
613	}
614	here ++;
615	}
616	delete word;
617	}
618
619	// removes leading space, puts last name before
620	// any preceeding names
621	void alphabetize_string_name (text_t &text) {
622
623	if (text.empty()) return;
624
625	text_t lastname;
626	char *lname;
627	vector<text_t> words;
628	splitchar (text.begin(), text.end(), ' ', words);
629	lastname = words.back();
630	words.pop_back();
631	lname = lastname.getcstr();
632
633	while ((my_stricmp(lname, "jnr") == 0) \|\| (my_stricmp(lname, "snr") == 0) \|\|
634	(my_stricmp(lname, "esq") == 0)) {
635	lastname = words.back();
636	words.pop_back();
637	lname = lastname.getcstr();
638	}
639
640	text.clear();
641	joinchar (words, ' ', text);
642	text = lastname + text;
643	}
644
// Appends a copy of str to a heap-allocated array of C strings.
//
// array  the array to extend (may be NULL when *len is 0); it is
//        resized with realloc, so the caller must use the returned
//        pointer from then on
// len    the number of strings in array; increased by one on success
// str    the string to add; it is duplicated with strdup
//
// returns the (possibly moved) array.  If str is NULL or memory runs
// out, array is returned as it was and *len is not changed.
char **string_add (char **array, int *len, char *str) {
  if (str == NULL) return array;

  // duplicate first so that the array is only touched once the copy exists
  char *copy = strdup(str);
  if (copy == NULL) return array;

  char **ret = (char **)realloc(array, (*len+1)*sizeof(char *));
  if (ret == NULL) {
    // realloc leaves the old array intact when it fails
    free(copy);
    return array;
  }

  ret[*len] = copy;
  (*len) ++;

  return ret;
}
654
// qsort callback used by string_sort: compares two array elements
// without regard to case.  e1 and e2 each point at an element of the
// array, i.e. at a pointer to a C string.
static int compare_str (const void *e1, const void *e2) {
  const char *s1 = *(const char * const *)e1;
  const char *s2 = *(const char * const *)e2;
#ifdef __WIN32__
  return _stricmp(s1, s2);
#else
  return strcasecmp(s1, s2);
#endif
}

// Sorts the len strings of array in place, ignoring case.
// Nothing is done for a NULL array or for fewer than two strings.
void string_sort (char **array, int len) {
  if (array == NULL || len < 2) return;
  qsort((void *)array, (size_t)len, sizeof(char *), compare_str);
}
666
// Releases, with free, each of the first len strings held in array
// and then the array itself.
void string_free (char **array, int len) {
  int i = 0;
  while (i < len) {
    free (array[i]);
    ++i;
  }
  free (array);
}
672
673	// returns a date of form _dec_ 31, 1999
674	// input is date of type 19991231
675	// at least the year must be present in date
676	text_t format_date (const text_t &date) {
677	text_t::const_iterator here = date.begin();
678	text_t::const_iterator end = date.end();
679
680	text_t year, month, day, dreturn;
681	int i;
682
683	for (i = 0; i < 4 && here != end; i++) {
684	year.push_back(*here);
685	here ++;
686	}
687	if (year.empty()) return "";
688
689	for (i = 0; i < 2 && here != end; i++) {
690	month.push_back(*here);
691	here ++;
692	}
693	for (i = 0; i < 2 && here != end; i++) {
694	day.push_back(*here);
695	here ++;
696	}
697
698	if (!month.empty()) format_month(month);
699
700	if (!day.empty()) format_day(day);
701
702	if (!month.empty()) {
703	dreturn += month + " ";
704	if (!day.empty()) {
705	dreturn += day + ", ";
706	}
707	}
708	dreturn += year;
709	return dreturn;
710	}
711
712	void format_month (text_t &month) {
713	if (month == "01") month = "_jan_";
714	else if (month == "02") month = "_feb_";
715	else if (month == "03") month = "_mar_";
716	else if (month == "04") month = "_apr_";
717	else if (month == "05") month = "_may_";
718	else if (month == "06") month = "_jun_";
719	else if (month == "07") month = "_jul_";
720	else if (month == "08") month = "_aug_";
721	else if (month == "09") month = "_sep_";
722	else if (month == "10") month = "_oct_";
723	else if (month == "11") month = "_nov_";
724	else if (month == "12") month = "_dec_";
725	else month.clear();
726	}
727
728	void format_day(text_t &day) {
729	if (day[0] == '0') {
730	char tmp = day[1];
731	day.clear();
732	day.push_back(tmp);
733	}
734	}
735

Note: See TracBrowser for help on using the repository browser.

Download in other formats: