Context Navigation

source: trunk/gsdl/src/colservr/gdbmclass.cpp@ 543

Last change on this file since 543 was 534, checked in by sjboddie, 25 years ago
added gpl notice
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 19.0 KB

Line
1	/**********************************************************************
2	*
3	* gdbmclass.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	* $Id: gdbmclass.cpp 534 1999-09-07 04:57:43Z sjboddie $
25	*
26	*********************************************************************/
27
28	/*
29	$Log$
30	Revision 1.6 1999/09/07 04:57:21 sjboddie
31	added gpl notice
32
33	Revision 1.5 1999/01/25 03:59:40 sjboddie
34	fixed a bug in sorting code
35
36	Revision 1.4 1999/01/21 21:20:08 sjboddie
37	removed unused collection parameter from several functions
38
39	Revision 1.3 1999/01/19 01:38:15 rjmcnab
40
41	Made the source more portable.
42
43	Revision 1.2 1999/01/12 01:51:01 rjmcnab
44
45	Standard header.
46
47	Revision 1.1 1999/01/08 09:02:15 rjmcnab
48
49	Moved from src/library.
50
51	*/
52
53
54	#include "text_t.h"
55	#include "gdbmclass.h"
56	#include "unitool.h"
57	#include "gsdlunicode.h"
58	#include "fileutil.h"
59	#include <ctype.h>
60	#include <string.h>
61
62	#if defined(GSDL_NEED_STRINGS_H)
63	#include <strings.h>
64	#endif
65
66	#if defined(GSDL_USE_OBJECTSPACE)
67	# include <ospace\std\algorithm>
68	#elif defined(GSDL_USE_STL_H)
69	# if defined(GSDL_USE_ALGO_H)
70	# include <algo.h>
71	# else
72	# include <algorithm.h>
73	# endif
74	#else
75	# include <algorithm>
76	#endif
77
78
// Case-insensitive comparison of two NUL-terminated C strings.
// Returns a value <0, ==0 or >0 when e1 sorts before, equal to, or
// after e2 (the result of _stricmp on Windows, strcasecmp elsewhere).
// The parameters must be pointers: both library routines take C strings.
static int compare_str (const char *e1, const char *e2) {
#ifdef __WIN32__
  return _stricmp(e1, e2);
#else
  return strcasecmp(e1, e2);
#endif
}
86
// qsort-style comparator for an array of C strings.
// e1 and e2 each point at an array element (i.e. at a char *), so they
// are dereferenced once before the case-insensitive comparison.
// Returns <0, 0 or >0 like strcasecmp.
static int compare_str (const void *e1, const void *e2) {
  const char *s1 = *((const char * const *)e1);
  const char *s2 = *((const char * const *)e2);
#ifdef __WIN32__
  return _stricmp(s1, s2);
#else
  return strcasecmp(s1, s2);
#endif
}
94
95
96
97	void gdbm_info::clear () {
98	docnum = 0; // 'd'
99	title.clear(); // 't'
100	parent.clear(); // 'p'
101	classification.clear(); // 'x'
102	contents.clear(); // 'c'
103	jobnum.clear(); // 'j'
104	OID.clear(); // 'o'
105	author.clear(); // 'a'
106	source.clear(); // 's'
107	date.clear(); // 'i'
108	}
109
110
111
112
113
114	// returns 0 if failed, 1 if opened
115	int gdbmclass::opendatabase (const text_t &filename) {
116	text_t data_location;
117	int block_size = 0;
118
119	if (gdbmfile != NULL) {
120	if (openfile == filename) return 1;
121	else closedatabase ();
122	}
123
124	openfile = filename;
125
126	char *namebuffer = filename.getcstr();
127	gdbmfile = gdbm_open (namebuffer, block_size, GDBM_READER, 00664, NULL);
128	delete namebuffer;
129
130	if (gdbmfile == NULL && logout != NULL) {
131	outconvertclass text_t2ascii;
132	(*logout) << text_t2ascii << "database open failed on: " << filename << "\n";
133	}
134
135	return (gdbmfile != NULL);
136	}
137
138
139	void gdbmclass::closedatabase () {
140	if (gdbmfile == NULL) return;
141
142	gdbm_close (gdbmfile);
143	gdbmfile = NULL;
144	openfile.clear();
145	}
146
147
148	// returns 0 on success, -1 on failure
149	// key and collection aren't references as they might be aliases to
150	// something in info
151	int gdbmclass::getinfo (text_t key, gdbm_info &info) {
152	text_t data;
153
154	if (!getkeydata (key, data)) return -1;
155	text_t::iterator here = data.begin ();
156	text_t::iterator end = data.end ();
157
158	text_t ikey, ivalue;
159	info.clear (); // reset info
160
161	while (getinfoline(here, end, ikey, ivalue)) {
162	if (ikey == "d") { info.docnum = ivalue.getint(); }
163	else if (ikey == "t") { info.title = ivalue; }
164	else if (ikey == "p") { info.parent = ivalue; }
165	else if (ikey == "x") { info.classification = ivalue; }
166	else if (ikey == "c") { info.contents = ivalue; }
167	else if (ikey == "j") { info.jobnum = ivalue; }
168	else if (ikey == "o") { info.OID = ivalue; }
169	else if (ikey == "a") { info.author = ivalue; }
170	else if (ikey == "s") { info.source = ivalue; }
171	else if (ikey == "i") { info.date = ivalue; }
172	}
173
174	return 0;
175	}
176
177
178	// returns 1 if the key exists
179	int gdbmclass::exists (text_t key) {
180	text_t data;
181	return getkeydata (key, data);
182	}
183
184
185	// returns 1 if successful
186	int gdbmclass::getkeydata (text_t key, text_t &data) {
187	datum key_data;
188	datum return_data;
189
190	if (gdbmfile == NULL) return 0;
191
192	// get a utf-8 encoded c string of the unicode key
193	key_data.dptr = (to_utf8(key)).getcstr();
194	if (key_data.dptr == NULL) {
195	if (logout != NULL) (*logout) << "gdbmclass: out of memory\n";
196	return 0;
197	}
198	key_data.dsize = strlen (key_data.dptr);
199
200	// fetch the result
201	return_data = gdbm_fetch (gdbmfile, key_data);
202	delete key_data.dptr;
203
204	if (return_data.dptr == NULL) return 0;
205
206	data.setcarr (return_data.dptr, return_data.dsize);
207	free (return_data.dptr);
208	data = to_uni(data); // convert to unicode
209
210	return 1;
211	}
212
213
214	// parses a line of the form <key>value\n
215	// returns 1 if successful
216	int gdbmclass::getinfoline (text_t::iterator &here, text_t::iterator end,
217	text_t &key, text_t &value) {
218	key.clear();
219	value.clear();
220
221	// ignore white space
222	while (here != end && is_unicode_space (*here)) here++;
223
224	// get the '<'
225	if (here == end \|\| *here != '<') return 0;
226	here++;
227
228	// get the key
229	while (here != end && *here != '>') {
230	key.push_back(*here);
231	here++;
232	}
233
234	// get the '>'
235	if (here == end \|\| *here != '>') return 0;
236	here++;
237
238	// get the value
239	while (here != end && *here != '\n') {
240	value.push_back(*here);
241	here++;
242	}
243
244	return 1;
245	}
246
247
248
249
250
251	// a few useful functions
252
253	//////////////////////////////////////////////////////////////////////////////////////////
254	// functions for testing classification strings
255
256
257	// returns 1 if targetdoc is top level of a book (i.e. =~ /B\.\d+$/) - otherwise 0;
258	int is_top_level (const text_t &targetdoc) {
259
260	text_t::const_iterator here = targetdoc.begin();
261	text_t::const_iterator end = targetdoc.end();
262
263	// look for the 'B'
264	here = findchar (here, end, 'B');
265
266	// there must be exactly one dot after the 'B'
267	if ((here != end) && (countchar (here, end, '.') == 1))
268	return 1;
269
270	return 0;
271	}
272
273	// returns 1 if targetdoc is any level of a book (i.e. contains 'B') - otherwise 0
274	int is_book (const text_t &targetdoc) {
275
276	text_t::const_iterator here = targetdoc.begin();
277	text_t::const_iterator end = targetdoc.end();
278
279	here = findchar (here, end, 'B');
280	if (here != end) return 1;
281	return 0;
282	}
283
284	// returns (in book_top) the top level of the book in targetdoc
285	void get_book_top (const text_t &targetdoc, text_t &book_top) {
286
287	text_t::const_iterator here = targetdoc.begin();
288	text_t::const_iterator end = targetdoc.end();
289
290	book_top.clear();
291
292	// look for the 'B'
293	here = findchar (here, end, 'B');
294
295	// copy up to the second '.'
296	int founddot = 0;
297	while (here != end) {
298	if (*here == '.') {
299	if (founddot) return;
300	founddot = 1;
301	}
302	book_top.push_back(*here);
303	here++;
304	}
305	}
306
307	// returns (in book) the book section part of the classification
308	// contained in targetdoc
309	void get_book (const text_t &targetdoc, text_t &book) {
310
311	text_t::const_iterator here = targetdoc.begin();
312	text_t::const_iterator end = targetdoc.end();
313
314	book.clear ();
315
316	// look for the 'B'
317	here = findchar (here, end, 'B');
318
319	// copy the rest of the string
320	while (here != end) {
321	book.push_back(*here);
322	here ++;
323	}
324	}
325
326	// get_parent_section removes the last part from section (i.e.=~ s/\.\d+$//)
327	void get_parent_section (text_t &section) {
328	int founddot = 0;
329	text_t::iterator end;
330	while (!founddot && !section.empty()) {
331	end = section.end();
332	end --;
333	if (*end == '.') founddot = 1;
334	section.pop_back();
335	}
336	}
337
338	// same as above but also returns ths child section that's removed
339	void get_parent_section (text_t &parentsection, text_t &childsection) {
340	int founddot = 0;
341	text_t tmp;
342	childsection.clear();
343	text_t::iterator end;
344	while (!founddot && !parentsection.empty()) {
345	end = parentsection.end();
346	end --;
347	if (*end == '.') founddot = 1;
348	else tmp.push_back(*end); childsection = tmp + childsection; tmp.clear();
349	parentsection.pop_back();
350	}
351	}
352
353	// count_dots returns the number of dots ('.') there are
354	// in a range of a targetdoc string
355	int count_dots(text_t::const_iterator first, text_t::const_iterator last) {
356	return countchar (first, last, '.');
357	}
358
359	int count_dots (const text_t &targetdoc) {
360	return count_dots(targetdoc.begin(), targetdoc.end());
361	}
362
363	// returns 1 if targetdoc is a first level descendant
364	// (i.e. B.n.1, B.n.1.1, B.n.1.1.1 etc.) - otherwise 0
365	int is_section_top(const text_t &targetdoc) {
366	text_t::const_iterator here = targetdoc.begin();
367	text_t::const_iterator end = targetdoc.end();
368
369	// look for the 'B'
370	here = findchar (here, end, 'B');
371	here = findchar (here, end, '.');
372	if (here != end) here++; // skip over the '.'
373	here = findchar (here, end, '.');
374
375	// make sure that all '.' are followed by a '1'
376	while (here != end) {
377	if (*here != '.') return 0;
378	here ++;
379
380	if (here != end) {
381	if (*here != '1') return 0;
382	here ++;
383	}
384	}
385	return 1;
386	}
387
388	// seperate_parts seperates targetdoc into its classification and booksection
389	// if classification isn't supplied it gets the first classification for the
390	// book from the gdbm
391	// if booksection doesn't exist it remains blank
392	void separate_parts(const text_t &targetdoc, gdbmclass &gdbm,
393	text_t &classification, text_t &booksection) {
394
395	split_targetdoc (targetdoc, classification, booksection);
396
397	if (classification.empty()) {
398	// no classification included so get first one for this book
399	gdbm_info info;
400	text_t book_top;
401	vector<text_t> classarray;
402	get_book_top (targetdoc, book_top);
403	gdbm.getinfo(book_top, info);
404	splitstring (info.classification, classarray);
405	if (!classarray.empty()) classification = classarray[0];
406	else classification = "C.1";
407	}
408	}
409
410	// split_targetdoc splits up a string containing a classification
411	// and book (or one or the other)
412	void split_targetdoc(const text_t &targetdoc, text_t &classification,
413	text_t &booksection) {
414
415	classification.clear ();
416	booksection.clear();
417
418	text_t::const_iterator here = targetdoc.begin();
419	text_t::const_iterator end = targetdoc.end();
420
421	// copy everything up to the first 'B'
422	while (here != end) {
423	if (*here == 'B') break;
424	classification.push_back(*here);
425	here++;
426	}
427
428	// remove middle '.'
429	if (!classification.empty() &&
430	classification[classification.size()-1] == '.')
431	classification.pop_back();
432
433	// copy the rest of the string
434	while (here != end) {
435	booksection.push_back(*here);
436	here++;
437	}
438	}
439
440	// splitstring splits a colon seperated string into an array
441	void splitstring (const text_t &string, vector<text_t> &array) {
442	splitchar (string.begin(), string.end(), ':', array);
443	}
444
445	// get_parents returns the parents array containing all the parents of the
446	// document specified by classification and booksection
447	void get_parents (const text_t &targetdoc, vector<text_t> &parents)
448	{
449	text_t::const_iterator here = targetdoc.begin ();
450	text_t::const_iterator end = targetdoc.end ();
451
452	text_t currentparent;
453	text_t newsuffixpart;
454	text_t newsuffix;
455	bool first = true;
456	while (here != end)
457	{
458	// if there is a newsuffix add it to the current parent
459	// and add that parent to the parents vector
460	if (!newsuffix.empty())
461	{
462	currentparent += newsuffix;
463	parents.push_back (currentparent);
464	}
465
466	// keep getting suffixes until one is found which starts with
467	// a number
468	newsuffix.clear();
469	do
470	{
471	here = getdelimitstr (here, end, '.', newsuffixpart);
472	if (!first) newsuffix.push_back ('.');
473	first = false;
474	newsuffix += newsuffixpart;
475	}
476	while ((here != end) && !newsuffixpart.empty() &&
477	(newsuffixpart[0] < '0' \|\| newsuffixpart[0] > '9'));
478	}
479	}
480
481
482	// get_siblings returns the siblings array containing all the siblings of the current
483	// classification or booksection
484	void get_siblings (const text_t &classification, const text_t &booksection,
485	gdbmclass &gdbm, vector<text_t> &siblings) {
486
487	gdbm_info info;
488
489	if (booksection.empty() && classification.size() == 1) {
490	// top level classification has no siblings
491	return;
492
493	} else if (booksection.empty()) {
494	// get classification siblings
495	gdbm.getinfo(classification, info);
496	gdbm.getinfo(info.parent, info); // info is now parent info
497	splitstring(info.contents, siblings);
498	return;
499
500	} else {
501	// get book section siblings
502	if (is_top_level(booksection)) {
503	// top level of book so siblings are children of classification
504	gdbm.getinfo(classification, info);
505	splitstring(info.contents, siblings);
506
507	// add classifications to book sections
508	for (unsigned int i = 0; i < siblings.size(); i++) {
509	if (is_book(siblings[i])) siblings[i] = classification + "." + siblings[i];
510	}
511
512	} else {
513	// siblings come from immediate parent
514	gdbm.getinfo(booksection, info);
515	gdbm.getinfo(info.parent, info); // info is now parent info
516	splitstring(info.contents, siblings);
517
518	// add classifications to book sections
519	for (unsigned int i = 0; i < siblings.size(); i++) {
520	if (is_book(siblings[i])) siblings[i] = classification + "." + siblings[i];
521	}
522	}
523	}
524	}
525
526	// compares section 1 and section 2 and returns 1 if section2 belongs to
527	// the same chapter as section1 (i.e. is sibling of or child of or child of sibling)
528	int are_same_chapter(text_t section1, text_t section2)
529	{
530	get_parent_section(section1);
531
532	while (!section2.empty()) {
533	get_parent_section(section2);
534	if (section2 == section1) return 1;
535	}
536	return 0;
537	}
538
539	// get_first_section gets the first section from a colon separated
540	// list (instring)
541	void get_first_section(const text_t &instring, text_t &returnstring) {
542
543	returnstring.clear();
544
545	text_t::const_iterator here = instring.begin();
546	text_t::const_iterator end = instring.end();
547
548	while (here != end) {
549	if (*here == ':') return;
550	returnstring.push_back(*here);
551	here ++;
552	}
553	}
554
555
556	// removes html tags from string - everything after < will be removed
557	// if < occurs without >
558	void remove_tags (text_t &text)
559	{
560	text_t::const_iterator here = text.begin ();
561	text_t::const_iterator end = text.end ();
562	int found = 0;
563	text_t tmp;
564
565	while (here != end) {
566	if (*here == '<') {found = 1; here ++; continue;}
567	if (*here == '>') {found = 0; here ++; continue;}
568
569	if (!found) tmp.push_back(*here);
570	here ++;
571	}
572	text = tmp;
573	}
574
575	// checks text to see if it is a number (i.e. contains only 0-9)
576	// returns 1 if true, 0 if false
577	int is_number (text_t &text) {
578
579	text_t::const_iterator here = text.begin();
580	text_t::const_iterator end = text.end();
581
582	while (here != end) {
583	if ((here!='0') && (here!='1') && (*here!='2') &&
584	(here!='3') && (here!='4') && (*here!='5') &&
585	(here!='6') && (here!='7') && (*here!='8') &&
586	(*here!='9')) return 0;
587	here ++;
588	}
589	return 1;
590	}
591
592	// functions related to sorting
593
594	// returns whatever comes after ':#:' in str
595	// -- this is a nasty hack that I'm sure Rodger will want to change ;-)
596	text_t get_section_str(const text_t &str) {
597
598	text_t ret;
599	int found = 0;
600
601	text_t::const_iterator here = str.begin();
602	text_t::const_iterator end = str.end();
603
604	while (here != end) {
605	if (found) {
606	ret.push_back(*here);
607	} else {
608	here = findchar (here, end, ':');
609	if (((here+1) == '#') && ((here+2) == ':')) {
610	found = 1;
611	here = here+2;
612	}
613	}
614	here ++;
615	}
616	return ret;
617	}
618
619	// removes leading spaces and leading 'the' 'a' and 'an'
620	// from string
621	void alphabetize_string_english (text_t &text) {
622
623	if (text.empty()) return;
624
625	text_t firstword;
626	char *word;
627
628	text_t::iterator here = text.begin();
629	text_t::const_iterator end = text.end();
630
631	if ((here != ' ') && (here != 'a') && (*here != 'A') &&
632	(here != 't') && (here != 'T')) return;
633
634	int foundchar = 0;
635	while (here != end) {
636	if (*here == ' ' && !foundchar) {here ++; continue;}
637	if (*here == ' ' && foundchar) {
638	text.erase(text.begin(), (here+1));
639	break;
640	}
641	foundchar ++;
642	if (foundchar == 1) {
643	getdelimitstr (here, end, ' ', firstword);
644	word = firstword.getcstr();
645	if ((compare_str(word, "the") != 0) && (compare_str(word, "a") != 0) &&
646	(compare_str(word, "an") != 0)) break;
647	}
648	here ++;
649	}
650	delete word;
651	}
652
653	// removes leading space, puts last name before
654	// any preceeding names
655	void alphabetize_string_name (text_t &text) {
656
657	if (text.empty()) return;
658
659	text_t lastname;
660	char *lname;
661	vector<text_t> words;
662	splitchar (text.begin(), text.end(), ' ', words);
663	lastname = words.back();
664	words.pop_back();
665	lname = lastname.getcstr();
666
667	while ((compare_str(lname, "jnr") == 0) \|\| (compare_str(lname, "snr") == 0) \|\|
668	(compare_str(lname, "esq") == 0)) {
669	lastname = words.back();
670	words.pop_back();
671	lname = lastname.getcstr();
672	}
673
674	text.clear();
675	joinchar (words, ' ', text);
676	text = lastname + text;
677	}
678
// Appends a copy of str to a heap-allocated array of C strings.
//   array - current array (may be NULL when *len is 0); it is resized
//           with realloc, so the caller must use the returned pointer
//   len   - in/out: number of entries; incremented when str was added
//   str   - string to copy (duplicated with strdup)
// Returns the (possibly moved) array.  If memory cannot be obtained,
// *len is left unchanged and a still-valid array pointer is returned.
// Entries and the array itself are released with free().
char **string_add (char **array, int *len, char *str) {
  char **ret;

  ret = (char **)realloc(array, (*len+1)*sizeof(char *));
  if (ret == NULL) return array; // realloc failed: old block is intact

  ret[*len] = (char *)strdup(str);
  if (ret[*len] != NULL) (*len) ++; // only count entries that exist

  return ret;
}
688
689	void string_sort (char **array, int len) {
690	qsort((void)array, (unsigned int)(len), sizeof(char), compare_str);
691	}
692
// Releases an array of len C strings: each entry is passed to free(),
// then the array itself.
void string_free (char **array, int len) {
  int i = 0;
  while (i < len) {
    free (array[i]);
    ++i;
  }
  free (array);
}
698
699	// returns a date of form _dec_ 31, 1999
700	// input is date of type 19991231
701	// at least the year must be present in date
702	text_t format_date (const text_t &date) {
703	text_t::const_iterator here = date.begin();
704	text_t::const_iterator end = date.end();
705
706	text_t year, month, day, dreturn;
707	int i;
708
709	for (i = 0; i < 4 && here != end; i++) {
710	year.push_back(*here);
711	here ++;
712	}
713	if (year.empty()) return "";
714
715	for (i = 0; i < 2 && here != end; i++) {
716	month.push_back(*here);
717	here ++;
718	}
719	for (i = 0; i < 2 && here != end; i++) {
720	day.push_back(*here);
721	here ++;
722	}
723
724	if (!month.empty()) format_month(month);
725
726	if (!day.empty()) format_day(day);
727
728	if (!month.empty()) {
729	dreturn += month + " ";
730	if (!day.empty()) {
731	dreturn += day + ", ";
732	}
733	}
734	dreturn += year;
735	return dreturn;
736	}
737
738	void format_month (text_t &month) {
739	if (month == "01") month = "_jan_";
740	else if (month == "02") month = "_feb_";
741	else if (month == "03") month = "_mar_";
742	else if (month == "04") month = "_apr_";
743	else if (month == "05") month = "_may_";
744	else if (month == "06") month = "_jun_";
745	else if (month == "07") month = "_jul_";
746	else if (month == "08") month = "_aug_";
747	else if (month == "09") month = "_sep_";
748	else if (month == "10") month = "_oct_";
749	else if (month == "11") month = "_nov_";
750	else if (month == "12") month = "_dec_";
751	else month.clear();
752	}
753
754	void format_day(text_t &day) {
755	if (day[0] == '0') {
756	char tmp = day[1];
757	day.clear();
758	day.push_back(tmp);
759	}
760	}
761

Note: See TracBrowser for help on using the repository browser.

Download in other formats: