Context Navigation

source: trunk/gsdl/src/colservr/gdbmclass.cpp@ 308

Last change on this file since 308 was 125, checked in by sjboddie, 25 years ago
fixed a bug in sorting code
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 18.2 KB

Line
1	/**********************************************************************
2	*
3	* gdbmclass.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* PUT COPYRIGHT NOTICE HERE
7	*
8	* $Id: gdbmclass.cpp 125 1999-01-25 03:59:40Z sjboddie $
9	*
10	*********************************************************************/
11
12	/*
13	$Log$
14	Revision 1.5 1999/01/25 03:59:40 sjboddie
15	fixed a bug in sorting code
16
17	Revision 1.4 1999/01/21 21:20:08 sjboddie
18	removed unused collection parameter from several functions
19
20	Revision 1.3 1999/01/19 01:38:15 rjmcnab
21
22	Made the source more portable.
23
24	Revision 1.2 1999/01/12 01:51:01 rjmcnab
25
26	Standard header.
27
28	Revision 1.1 1999/01/08 09:02:15 rjmcnab
29
30	Moved from src/library.
31
32	*/
33
34
35	#include "text_t.h"
36	#include "gdbmclass.h"
37	#include "unitool.h"
38	#include "gsdlunicode.h"
39	#include "fileutil.h"
40	#include <ctype.h>
41	#include <string.h>
42
43	#if defined(GSDL_NEED_STRINGS_H)
44	#include <strings.h>
45	#endif
46
47	#if defined(GSDL_USE_OBJECTSPACE)
48	# include <ospace\std\algorithm>
49	#elif defined(GSDL_USE_STL_H)
50	# if defined(GSDL_USE_ALGO_H)
51	# include <algo.h>
52	# else
53	# include <algorithm.h>
54	# endif
55	#else
56	# include <algorithm>
57	#endif
58
59
// Case-insensitive comparison of two NUL-terminated C strings.
// Returns a value less than, equal to, or greater than zero, following
// the convention of the underlying _stricmp / strcasecmp call.
static int compare_str (const char *e1, const char *e2) {
#ifdef __WIN32__
  return _stricmp(e1, e2);
#else
  return strcasecmp(e1, e2);
#endif
}
67
// qsort-style comparator: e1 and e2 each point at a `char *' element
// of the array being sorted.  The two strings are compared without
// regard to case.
static int compare_str (const void *e1, const void *e2) {
  const char *lhs = *static_cast<const char * const *>(e1);
  const char *rhs = *static_cast<const char * const *>(e2);
#ifdef __WIN32__
  return _stricmp(lhs, rhs);
#else
  return strcasecmp(lhs, rhs);
#endif
}
75
76
77
78	void gdbm_info::clear () {
79	docnum = 0; // 'd'
80	title.clear(); // 't'
81	parent.clear(); // 'p'
82	classification.clear(); // 'x'
83	contents.clear(); // 'c'
84	jobnum.clear(); // 'j'
85	OID.clear(); // 'o'
86	author.clear(); // 'a'
87	source.clear(); // 's'
88	date.clear(); // 'i'
89	}
90
91
92
93
94
95	// returns 0 if failed, 1 if opened
96	int gdbmclass::opendatabase (const text_t &filename) {
97	text_t data_location;
98	int block_size = 0;
99
100	if (gdbmfile != NULL) {
101	if (openfile == filename) return 1;
102	else closedatabase ();
103	}
104
105	openfile = filename;
106
107	char *namebuffer = filename.getcstr();
108	gdbmfile = gdbm_open (namebuffer, block_size, GDBM_READER, 00664, NULL);
109	delete namebuffer;
110
111	if (gdbmfile == NULL && logout != NULL) {
112	outconvertclass text_t2ascii;
113	(*logout) << text_t2ascii << "database open failed on: " << filename << "\n";
114	}
115
116	return (gdbmfile != NULL);
117	}
118
119
120	void gdbmclass::closedatabase () {
121	if (gdbmfile == NULL) return;
122
123	gdbm_close (gdbmfile);
124	gdbmfile = NULL;
125	openfile.clear();
126	}
127
128
129	// returns 0 on success, -1 on failure
130	// key and collection aren't references as they might be aliases to
131	// something in info
132	int gdbmclass::getinfo (text_t key, gdbm_info &info) {
133	text_t data;
134
135	if (!getkeydata (key, data)) return -1;
136	text_t::iterator here = data.begin ();
137	text_t::iterator end = data.end ();
138
139	text_t ikey, ivalue;
140	info.clear (); // reset info
141
142	while (getinfoline(here, end, ikey, ivalue)) {
143	if (ikey == "d") { info.docnum = ivalue.getint(); }
144	else if (ikey == "t") { info.title = ivalue; }
145	else if (ikey == "p") { info.parent = ivalue; }
146	else if (ikey == "x") { info.classification = ivalue; }
147	else if (ikey == "c") { info.contents = ivalue; }
148	else if (ikey == "j") { info.jobnum = ivalue; }
149	else if (ikey == "o") { info.OID = ivalue; }
150	else if (ikey == "a") { info.author = ivalue; }
151	else if (ikey == "s") { info.source = ivalue; }
152	else if (ikey == "i") { info.date = ivalue; }
153	}
154
155	return 0;
156	}
157
158
159	// returns 1 if the key exists
160	int gdbmclass::exists (text_t key) {
161	text_t data;
162	return getkeydata (key, data);
163	}
164
165
166	// returns 1 if successful
167	int gdbmclass::getkeydata (text_t key, text_t &data) {
168	datum key_data;
169	datum return_data;
170
171	if (gdbmfile == NULL) return 0;
172
173	// get a utf-8 encoded c string of the unicode key
174	key_data.dptr = (to_utf8(key)).getcstr();
175	if (key_data.dptr == NULL) {
176	if (logout != NULL) (*logout) << "gdbmclass: out of memory\n";
177	return 0;
178	}
179	key_data.dsize = strlen (key_data.dptr);
180
181	// fetch the result
182	return_data = gdbm_fetch (gdbmfile, key_data);
183	delete key_data.dptr;
184
185	if (return_data.dptr == NULL) return 0;
186
187	data.setcarr (return_data.dptr, return_data.dsize);
188	free (return_data.dptr);
189	data = to_uni(data); // convert to unicode
190
191	return 1;
192	}
193
194
195	// parses a line of the form <key>value\n
196	// returns 1 if successful
197	int gdbmclass::getinfoline (text_t::iterator &here, text_t::iterator end,
198	text_t &key, text_t &value) {
199	key.clear();
200	value.clear();
201
202	// ignore white space
203	while (here != end && is_unicode_space (*here)) here++;
204
205	// get the '<'
206	if (here == end \|\| *here != '<') return 0;
207	here++;
208
209	// get the key
210	while (here != end && *here != '>') {
211	key.push_back(*here);
212	here++;
213	}
214
215	// get the '>'
216	if (here == end \|\| *here != '>') return 0;
217	here++;
218
219	// get the value
220	while (here != end && *here != '\n') {
221	value.push_back(*here);
222	here++;
223	}
224
225	return 1;
226	}
227
228
229
230
231
232	// a few useful functions
233
234	//////////////////////////////////////////////////////////////////////////////////////////
235	// functions for testing classification strings
236
237
238	// returns 1 if targetdoc is top level of a book (i.e. =~ /B\.\d+$/) - otherwise 0;
239	int is_top_level (const text_t &targetdoc) {
240
241	text_t::const_iterator here = targetdoc.begin();
242	text_t::const_iterator end = targetdoc.end();
243
244	// look for the 'B'
245	here = findchar (here, end, 'B');
246
247	// there must be exactly one dot after the 'B'
248	if ((here != end) && (countchar (here, end, '.') == 1))
249	return 1;
250
251	return 0;
252	}
253
254	// returns 1 if targetdoc is any level of a book (i.e. contains 'B') - otherwise 0
255	int is_book (const text_t &targetdoc) {
256
257	text_t::const_iterator here = targetdoc.begin();
258	text_t::const_iterator end = targetdoc.end();
259
260	here = findchar (here, end, 'B');
261	if (here != end) return 1;
262	return 0;
263	}
264
265	// returns (in book_top) the top level of the book in targetdoc
266	void get_book_top (const text_t &targetdoc, text_t &book_top) {
267
268	text_t::const_iterator here = targetdoc.begin();
269	text_t::const_iterator end = targetdoc.end();
270
271	book_top.clear();
272
273	// look for the 'B'
274	here = findchar (here, end, 'B');
275
276	// copy up to the second '.'
277	int founddot = 0;
278	while (here != end) {
279	if (*here == '.') {
280	if (founddot) return;
281	founddot = 1;
282	}
283	book_top.push_back(*here);
284	here++;
285	}
286	}
287
288	// returns (in book) the book section part of the classification
289	// contained in targetdoc
290	void get_book (const text_t &targetdoc, text_t &book) {
291
292	text_t::const_iterator here = targetdoc.begin();
293	text_t::const_iterator end = targetdoc.end();
294
295	book.clear ();
296
297	// look for the 'B'
298	here = findchar (here, end, 'B');
299
300	// copy the rest of the string
301	while (here != end) {
302	book.push_back(*here);
303	here ++;
304	}
305	}
306
307	// get_parent_section removes the last part from section (i.e.=~ s/\.\d+$//)
308	void get_parent_section (text_t &section) {
309	int founddot = 0;
310	text_t::iterator end;
311	while (!founddot && !section.empty()) {
312	end = section.end();
313	end --;
314	if (*end == '.') founddot = 1;
315	section.pop_back();
316	}
317	}
318
319	// same as above but also returns ths child section that's removed
320	void get_parent_section (text_t &parentsection, text_t &childsection) {
321	int founddot = 0;
322	text_t tmp;
323	childsection.clear();
324	text_t::iterator end;
325	while (!founddot && !parentsection.empty()) {
326	end = parentsection.end();
327	end --;
328	if (*end == '.') founddot = 1;
329	else tmp.push_back(*end); childsection = tmp + childsection; tmp.clear();
330	parentsection.pop_back();
331	}
332	}
333
334	// count_dots returns the number of dots ('.') there are
335	// in a range of a targetdoc string
336	int count_dots(text_t::const_iterator first, text_t::const_iterator last) {
337	return countchar (first, last, '.');
338	}
339
340	int count_dots (const text_t &targetdoc) {
341	return count_dots(targetdoc.begin(), targetdoc.end());
342	}
343
344	// returns 1 if targetdoc is a first level descendant
345	// (i.e. B.n.1, B.n.1.1, B.n.1.1.1 etc.) - otherwise 0
346	int is_section_top(const text_t &targetdoc) {
347	text_t::const_iterator here = targetdoc.begin();
348	text_t::const_iterator end = targetdoc.end();
349
350	// look for the 'B'
351	here = findchar (here, end, 'B');
352	here = findchar (here, end, '.');
353	if (here != end) here++; // skip over the '.'
354	here = findchar (here, end, '.');
355
356	// make sure that all '.' are followed by a '1'
357	while (here != end) {
358	if (*here != '.') return 0;
359	here ++;
360
361	if (here != end) {
362	if (*here != '1') return 0;
363	here ++;
364	}
365	}
366	return 1;
367	}
368
369	// seperate_parts seperates targetdoc into its classification and booksection
370	// if classification isn't supplied it gets the first classification for the
371	// book from the gdbm
372	// if booksection doesn't exist it remains blank
373	void separate_parts(const text_t &targetdoc, gdbmclass &gdbm,
374	text_t &classification, text_t &booksection) {
375
376	split_targetdoc (targetdoc, classification, booksection);
377
378	if (classification.empty()) {
379	// no classification included so get first one for this book
380	gdbm_info info;
381	text_t book_top;
382	vector<text_t> classarray;
383	get_book_top (targetdoc, book_top);
384	gdbm.getinfo(book_top, info);
385	splitstring (info.classification, classarray);
386	if (!classarray.empty()) classification = classarray[0];
387	else classification = "C.1";
388	}
389	}
390
391	// split_targetdoc splits up a string containing a classification
392	// and book (or one or the other)
393	void split_targetdoc(const text_t &targetdoc, text_t &classification,
394	text_t &booksection) {
395
396	classification.clear ();
397	booksection.clear();
398
399	text_t::const_iterator here = targetdoc.begin();
400	text_t::const_iterator end = targetdoc.end();
401
402	// copy everything up to the first 'B'
403	while (here != end) {
404	if (*here == 'B') break;
405	classification.push_back(*here);
406	here++;
407	}
408
409	// remove middle '.'
410	if (!classification.empty() &&
411	classification[classification.size()-1] == '.')
412	classification.pop_back();
413
414	// copy the rest of the string
415	while (here != end) {
416	booksection.push_back(*here);
417	here++;
418	}
419	}
420
421	// splitstring splits a colon seperated string into an array
422	void splitstring (const text_t &string, vector<text_t> &array) {
423	splitchar (string.begin(), string.end(), ':', array);
424	}
425
426	// get_parents returns the parents array containing all the parents of the
427	// document specified by classification and booksection
428	void get_parents (const text_t &targetdoc, vector<text_t> &parents)
429	{
430	text_t::const_iterator here = targetdoc.begin ();
431	text_t::const_iterator end = targetdoc.end ();
432
433	text_t currentparent;
434	text_t newsuffixpart;
435	text_t newsuffix;
436	bool first = true;
437	while (here != end)
438	{
439	// if there is a newsuffix add it to the current parent
440	// and add that parent to the parents vector
441	if (!newsuffix.empty())
442	{
443	currentparent += newsuffix;
444	parents.push_back (currentparent);
445	}
446
447	// keep getting suffixes until one is found which starts with
448	// a number
449	newsuffix.clear();
450	do
451	{
452	here = getdelimitstr (here, end, '.', newsuffixpart);
453	if (!first) newsuffix.push_back ('.');
454	first = false;
455	newsuffix += newsuffixpart;
456	}
457	while ((here != end) && !newsuffixpart.empty() &&
458	(newsuffixpart[0] < '0' \|\| newsuffixpart[0] > '9'));
459	}
460	}
461
462
463	// get_siblings returns the siblings array containing all the siblings of the current
464	// classification or booksection
465	void get_siblings (const text_t &classification, const text_t &booksection,
466	gdbmclass &gdbm, vector<text_t> &siblings) {
467
468	gdbm_info info;
469
470	if (booksection.empty() && classification.size() == 1) {
471	// top level classification has no siblings
472	return;
473
474	} else if (booksection.empty()) {
475	// get classification siblings
476	gdbm.getinfo(classification, info);
477	gdbm.getinfo(info.parent, info); // info is now parent info
478	splitstring(info.contents, siblings);
479	return;
480
481	} else {
482	// get book section siblings
483	if (is_top_level(booksection)) {
484	// top level of book so siblings are children of classification
485	gdbm.getinfo(classification, info);
486	splitstring(info.contents, siblings);
487
488	// add classifications to book sections
489	for (unsigned int i = 0; i < siblings.size(); i++) {
490	if (is_book(siblings[i])) siblings[i] = classification + "." + siblings[i];
491	}
492
493	} else {
494	// siblings come from immediate parent
495	gdbm.getinfo(booksection, info);
496	gdbm.getinfo(info.parent, info); // info is now parent info
497	splitstring(info.contents, siblings);
498
499	// add classifications to book sections
500	for (unsigned int i = 0; i < siblings.size(); i++) {
501	if (is_book(siblings[i])) siblings[i] = classification + "." + siblings[i];
502	}
503	}
504	}
505	}
506
507	// compares section 1 and section 2 and returns 1 if section2 belongs to
508	// the same chapter as section1 (i.e. is sibling of or child of or child of sibling)
509	int are_same_chapter(text_t section1, text_t section2)
510	{
511	get_parent_section(section1);
512
513	while (!section2.empty()) {
514	get_parent_section(section2);
515	if (section2 == section1) return 1;
516	}
517	return 0;
518	}
519
520	// get_first_section gets the first section from a colon separated
521	// list (instring)
522	void get_first_section(const text_t &instring, text_t &returnstring) {
523
524	returnstring.clear();
525
526	text_t::const_iterator here = instring.begin();
527	text_t::const_iterator end = instring.end();
528
529	while (here != end) {
530	if (*here == ':') return;
531	returnstring.push_back(*here);
532	here ++;
533	}
534	}
535
536
537	// removes html tags from string - everything after < will be removed
538	// if < occurs without >
539	void remove_tags (text_t &text)
540	{
541	text_t::const_iterator here = text.begin ();
542	text_t::const_iterator end = text.end ();
543	int found = 0;
544	text_t tmp;
545
546	while (here != end) {
547	if (*here == '<') {found = 1; here ++; continue;}
548	if (*here == '>') {found = 0; here ++; continue;}
549
550	if (!found) tmp.push_back(*here);
551	here ++;
552	}
553	text = tmp;
554	}
555
556	// checks text to see if it is a number (i.e. contains only 0-9)
557	// returns 1 if true, 0 if false
558	int is_number (text_t &text) {
559
560	text_t::const_iterator here = text.begin();
561	text_t::const_iterator end = text.end();
562
563	while (here != end) {
564	if ((here!='0') && (here!='1') && (*here!='2') &&
565	(here!='3') && (here!='4') && (*here!='5') &&
566	(here!='6') && (here!='7') && (*here!='8') &&
567	(*here!='9')) return 0;
568	here ++;
569	}
570	return 1;
571	}
572
573	// functions related to sorting
574
575	// returns whatever comes after ':#:' in str
576	// -- this is a nasty hack that I'm sure Rodger will want to change ;-)
577	text_t get_section_str(const text_t &str) {
578
579	text_t ret;
580	int found = 0;
581
582	text_t::const_iterator here = str.begin();
583	text_t::const_iterator end = str.end();
584
585	while (here != end) {
586	if (found) {
587	ret.push_back(*here);
588	} else {
589	here = findchar (here, end, ':');
590	if (((here+1) == '#') && ((here+2) == ':')) {
591	found = 1;
592	here = here+2;
593	}
594	}
595	here ++;
596	}
597	return ret;
598	}
599
600	// removes leading spaces and leading 'the' 'a' and 'an'
601	// from string
602	void alphabetize_string_english (text_t &text) {
603
604	if (text.empty()) return;
605
606	text_t firstword;
607	char *word;
608
609	text_t::iterator here = text.begin();
610	text_t::const_iterator end = text.end();
611
612	if ((here != ' ') && (here != 'a') && (*here != 'A') &&
613	(here != 't') && (here != 'T')) return;
614
615	int foundchar = 0;
616	while (here != end) {
617	if (*here == ' ' && !foundchar) {here ++; continue;}
618	if (*here == ' ' && foundchar) {
619	text.erase(text.begin(), (here+1));
620	break;
621	}
622	foundchar ++;
623	if (foundchar == 1) {
624	getdelimitstr (here, end, ' ', firstword);
625	word = firstword.getcstr();
626	if ((compare_str(word, "the") != 0) && (compare_str(word, "a") != 0) &&
627	(compare_str(word, "an") != 0)) break;
628	}
629	here ++;
630	}
631	delete word;
632	}
633
634	// removes leading space, puts last name before
635	// any preceeding names
636	void alphabetize_string_name (text_t &text) {
637
638	if (text.empty()) return;
639
640	text_t lastname;
641	char *lname;
642	vector<text_t> words;
643	splitchar (text.begin(), text.end(), ' ', words);
644	lastname = words.back();
645	words.pop_back();
646	lname = lastname.getcstr();
647
648	while ((compare_str(lname, "jnr") == 0) \|\| (compare_str(lname, "snr") == 0) \|\|
649	(compare_str(lname, "esq") == 0)) {
650	lastname = words.back();
651	words.pop_back();
652	lname = lastname.getcstr();
653	}
654
655	text.clear();
656	joinchar (words, ' ', text);
657	text = lastname + text;
658	}
659
// Appends a heap-allocated copy of str to the string array `array',
// which currently holds *len entries (array may be NULL when *len is 0).
// Returns the (possibly moved) array and increments *len.  If memory
// runs out the string is not added and *len is left unchanged; the
// returned array is still valid.  Release with string_free.
char **string_add (char **array, int *len, char *str) {
  char **ret = (char **)realloc(array, (*len+1)*sizeof(char *));
  if (ret == NULL) return array; // original block is untouched on failure

  char *copy = strdup(str);
  if (copy == NULL) return ret;

  ret[*len] = copy;
  (*len) ++;

  return ret;
}
669
670	void string_sort (char **array, int len) {
671	qsort((void)array, (unsigned int)(len), sizeof(char), compare_str);
672	}
673
// Releases a string array: each of the len strings is freed, then the
// array itself.
void string_free (char **array, int len) {
  int remaining = len;
  char **slot = array;
  while (remaining > 0) {
    free (*slot);
    ++slot;
    --remaining;
  }
  free (array);
}
679
680	// returns a date of form _dec_ 31, 1999
681	// input is date of type 19991231
682	// at least the year must be present in date
683	text_t format_date (const text_t &date) {
684	text_t::const_iterator here = date.begin();
685	text_t::const_iterator end = date.end();
686
687	text_t year, month, day, dreturn;
688	int i;
689
690	for (i = 0; i < 4 && here != end; i++) {
691	year.push_back(*here);
692	here ++;
693	}
694	if (year.empty()) return "";
695
696	for (i = 0; i < 2 && here != end; i++) {
697	month.push_back(*here);
698	here ++;
699	}
700	for (i = 0; i < 2 && here != end; i++) {
701	day.push_back(*here);
702	here ++;
703	}
704
705	if (!month.empty()) format_month(month);
706
707	if (!day.empty()) format_day(day);
708
709	if (!month.empty()) {
710	dreturn += month + " ";
711	if (!day.empty()) {
712	dreturn += day + ", ";
713	}
714	}
715	dreturn += year;
716	return dreturn;
717	}
718
719	void format_month (text_t &month) {
720	if (month == "01") month = "_jan_";
721	else if (month == "02") month = "_feb_";
722	else if (month == "03") month = "_mar_";
723	else if (month == "04") month = "_apr_";
724	else if (month == "05") month = "_may_";
725	else if (month == "06") month = "_jun_";
726	else if (month == "07") month = "_jul_";
727	else if (month == "08") month = "_aug_";
728	else if (month == "09") month = "_sep_";
729	else if (month == "10") month = "_oct_";
730	else if (month == "11") month = "_nov_";
731	else if (month == "12") month = "_dec_";
732	else month.clear();
733	}
734
735	void format_day(text_t &day) {
736	if (day[0] == '0') {
737	char tmp = day[1];
738	day.clear();
739	day.push_back(tmp);
740	}
741	}
742

Note: See TracBrowser for help on using the repository browser.

Download in other formats: