Context Navigation

source: trunk/gsdl/src/colservr/phrasesearch.cpp@ 500

Last change on this file since 500 was 500, checked in by rjmcnab, 25 years ago
fixed small problem
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 5.2 KB

Line
1	/**********************************************************************
2	*
3	* phrasesearch.cpp -- tools to search for a phrase in a larger text
4	* Copyright (C) 1999 DigiLib Systems Limited
5	*
6	* PUT COPYRIGHT NOTICE HERE
7	*
8	* $Id: phrasesearch.cpp 500 1999-08-31 22:45:12Z rjmcnab $
9	*
10	*********************************************************************/
11
12	/*
13	$Log$
14	Revision 1.5 1999/08/31 22:45:12 rjmcnab
15	fixed small problem
16
17	Revision 1.4 1999/07/16 00:15:48 sjboddie
18	changed to use termfreqclassarray type
19
20	Revision 1.3 1999/07/07 06:19:45 rjmcnab
21	Added ability to combine two or more independent queries.
22
23	Revision 1.2 1999/07/01 09:25:54 rjmcnab
24	fixed bug :-^
25
26	Revision 1.1 1999/07/01 04:01:46 rjmcnab
27	Initial revision.
28
29	*/
30
31
32	#include "phrasesearch.h"
33	#include "gsdlunicode.h"
34
35	inline unsigned char parse_nonword_word (unsigned char here, unsigned char *end,
36	text_t &word) {
37	int c_len = 0;
38	unsigned short c = 0;
39
40	word.clear();
41
42	// parse non word
43	while (here <= end) {
44	c_len = parse_utf8_char (here, end, &c);
45	if (c == '(') {
46	// found a note, look for '}'
47	while (here <= end && c != ')') {
48	c_len = parse_utf8_char (here, end, &c);
49	here += c_len;
50	}
51	}
52	if (c == '{') {
53	// found a composite character, look for '}'
54	while (here <= end && c != '}') {
55	c_len = parse_utf8_char (here, end, &c);
56	here += c_len;
57	}
58	}
59	if (is_unicode_letdig(c)) {
60	while (c_len > 0) {
61	// this is in a word
62	word.push_back(*here);
63	here++; c_len--;
64	}
65	break;
66	}
67	here += c_len;
68	}
69
70	// parse word
71	while (here <= end) {
72	c_len = parse_utf8_char (here, end, &c);
73	if (!is_unicode_letdig(c)) {
74	here += c_len; // it is ok to skip a nonword character
75	break;
76	}
77	while (c_len > 0) {
78	word.push_back(*here);
79	here++; c_len--;
80	}
81	}
82
83	return here;
84	}
85
86
87	bool doc_phrase_search (unsigned char *doc, int doclen,
88	const termfreqclassarray &phrase) {
89	// note: this uses the most braindead search routine :-)
90	// however its not so bad as there shouldn't be many partial
91	// matches
92
93	// a null phrase matches anything
94	if (phrase.empty()) return true;
95
96	// if there is nothing then there can't be a match
97	if (doc == NULL \|\| doclen == 0) return false;
98
99	text_t doc_word;
100	doc_word.reserve (16);
101
102	bool first = true;
103
104	unsigned char *doc_here = doc;
105	unsigned char *doc_herefirstword = doc;
106	unsigned char *doc_end = doc+doclen-1; // unitool conventions :-/
107
108	while (doc_here <= doc_end) {
109	first = true;
110
111	// there will be at least one member of phrase (see above)
112	termfreqclassarray::const_iterator phrase_here = phrase.begin();
113	termfreqclassarray::const_iterator phrase_end = phrase.end();
114	do {
115	// get the next non-word ... and ignore it, then get the next word
116	doc_here = parse_nonword_word (doc_here, doc_end, doc_word);
117	if (first) {doc_herefirstword = doc_here; first = false;}
118
119	// break if this word is not the next in the phrase
120	if ((*phrase_here).utf8equivterms.find (doc_word) ==
121	(*phrase_here).utf8equivterms.end()) break;
122
123	phrase_here++;
124	} while (doc_here <= doc_end && phrase_here != phrase_end);
125
126	// see if we found a phrase
127	if (phrase_here == phrase_end) return true;
128
129	doc_here = doc_herefirstword; // set the counter back
130	}
131
132	return false;
133	}
134
135
136	// looks for the stemmed phrase in the metadata or text associated with
137	// an OID. This function has not been coded with all situations in mind
138	bool OID_phrase_search (mgsearchclass &mgsearch,
139	gdbmclass &gdbm,
140	const text_t &index,
141	const text_t &subcollection,
142	const text_t &language,
143	const text_t &longindex,
144	const text_t &collection,
145	const termfreqclassarray &phrase,
146	int docnum) {
147	// disect the long index to find out where the text should come from
148	text_t level, gran;
149	text_t::const_iterator longindex_here = longindex.begin();
150	text_t::const_iterator longindex_end = longindex.end();
151	longindex_here = getdelimitstr (longindex_here, longindex_end, ':', level);
152	longindex_here = getdelimitstr (longindex_here, longindex_end, ':', gran);
153
154	if (gran.empty()) return false;
155
156	if (gran == "text") {
157	char *doc = NULL;
158	int doclen = 0;
159
160	// get text from mg.
161	if (!mgsearch.mgdocument (index, subcollection, language, collection,
162	docnum, doc, doclen)) return false;
163	return doc_phrase_search ((unsigned char *)doc, doclen, phrase);
164	}
165
166	// get OID
167	char *metadata = NULL;
168	text_t::size_type metadata_len = 0;
169	infodbclass docnum_info;
170	infodbclass OID_info;
171
172	if (!gdbm.getinfo (docnum, docnum_info)) return false;
173	text_t &OID = docnum_info["section"];
174	if (OID.empty()) return false;
175
176	// get field
177	if (!gdbm.getinfo (OID, OID_info)) return false;
178
179	bool result = false;
180	text_tarray *tarr_ptr = OID_info.getmultinfo (gran);
181	if (tarr_ptr != NULL ) {
182	text_tarray::const_iterator subvalue_here = (*tarr_ptr).begin();
183	text_tarray::const_iterator subvalue_end = (*tarr_ptr).end();
184	while (subvalue_here != subvalue_end) {
185	if (subvalue_here != NULL) {
186	metadata = (to_utf8(*subvalue_here)).getcarr(metadata_len);
187	result = doc_phrase_search ((unsigned char *)metadata, metadata_len, phrase);
188	delete [] metadata;
189
190	if (result) return true;
191	}
192
193	subvalue_here++;
194	}
195	}
196
197	return result;
198	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: