Context Navigation

source: trunk/gsdl/src/colservr/phrasesearch.cpp@ 2146

Last change on this file since 2146 was 2146, checked in by sjboddie, 23 years ago
Fixed a bug that was preventing phrase searching from working on document level indexes
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 7.9 KB

Line
1	/**********************************************************************
2	*
3	* phrasesearch.cpp -- tools to search for a phrase in a larger text
4	* Copyright (C) 1999 DigiLib Systems Limited
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26	#include "phrasesearch.h"
27	#include "gsdlunicode.h"
28
29	inline unsigned char parse_nonword_word (unsigned char here, unsigned char *end,
30	text_t &word) {
31	int c_len = 0;
32	unsigned short c = 0;
33
34	word.clear();
35
36	// parse non word
37	while (here <= end) {
38	c_len = parse_utf8_char (here, end, &c);
39	if (c == '(') {
40	// found a note, look for '}'
41	while (here <= end && c != ')') {
42	c_len = parse_utf8_char (here, end, &c);
43	here += c_len;
44	}
45	}
46	if (c == '{') {
47	// found a composite character, look for '}'
48	while (here <= end && c != '}') {
49	c_len = parse_utf8_char (here, end, &c);
50	here += c_len;
51	}
52	}
53	if (is_unicode_letdig(c)) {
54	while (c_len > 0) {
55	// this is in a word
56	word.push_back(*here);
57	here++; c_len--;
58	}
59	break;
60	}
61	here += c_len;
62	}
63
64	// parse word
65	while (here <= end) {
66	c_len = parse_utf8_char (here, end, &c);
67	if (!is_unicode_letdig(c)) {
68	here += c_len; // it is ok to skip a nonword character
69	break;
70	}
71	while (c_len > 0) {
72	word.push_back(*here);
73	here++; c_len--;
74	}
75	}
76
77	return here;
78	}
79
80	static void get_all_docnums (gdbmclass &gdbm, text_t OID, vector<int> &docnum_list) {
81
82	infodbclass OID_info;
83
84	// get OID
85	if (!gdbm.getinfo (OID, OID_info)) return;
86	if (OID_info["hastxt"] == "1" && !OID_info["docnum"].empty()) {
87	docnum_list.push_back (OID_info["docnum"].getint());
88	}
89
90	// get contents set
91	if (OID_info["contains"].empty()) return;
92	text_tarray contains; text_t tmptext;
93	text_t::iterator contains_here = OID_info["contains"].begin();
94	text_t::iterator contains_end = OID_info["contains"].end();
95	while (contains_here != contains_end) {
96	if (*contains_here == '"') tmptext += OID;
97	else if (*contains_here == ';') {
98	if (!tmptext.empty()) contains.push_back (tmptext);
99	tmptext.clear();
100	} else tmptext.push_back(*contains_here);
101	contains_here++;
102	}
103	if (!tmptext.empty()) contains.push_back (tmptext);
104
105	text_tarray::const_iterator here = contains.begin();
106	text_tarray::const_iterator end = contains.end();
107	while (here != end) {
108	get_all_docnums (gdbm, *here, docnum_list);
109	here ++;
110	}
111	}
112
113	bool doc_phrase_search (unsigned char *doc, int doclen,
114	const termfreqclassarray &phrase) {
115	// note: this uses the most braindead search routine :-)
116	// however its not so bad as there shouldn't be many partial
117	// matches
118
119	// a null phrase matches anything
120	if (phrase.empty()) return true;
121
122	// if there is nothing then there can't be a match
123	if (doc == NULL \|\| doclen == 0) return false;
124
125	text_t doc_word;
126	doc_word.reserve (16);
127
128	bool first = true;
129
130	unsigned char *doc_here = doc;
131	unsigned char *doc_herefirstword = doc;
132	unsigned char *doc_end = doc+doclen-1; // unitool conventions :-/
133
134	while (doc_here <= doc_end) {
135	first = true;
136
137	// there will be at least one member of phrase (see above)
138	termfreqclassarray::const_iterator phrase_here = phrase.begin();
139	termfreqclassarray::const_iterator phrase_end = phrase.end();
140	do {
141	// get the next non-word ... and ignore it, then get the next word
142	doc_here = parse_nonword_word (doc_here, doc_end, doc_word);
143	if (first) {doc_herefirstword = doc_here; first = false;}
144
145	// break if this word is not the next in the phrase
146	if ((*phrase_here).utf8equivterms.find (doc_word) ==
147	(*phrase_here).utf8equivterms.end()) break;
148
149	phrase_here++;
150	} while (doc_here <= doc_end && phrase_here != phrase_end);
151
152	// see if we found a phrase
153	if (phrase_here == phrase_end) return true;
154
155	doc_here = doc_herefirstword; // set the counter back
156	}
157
158	return false;
159	}
160
161	// looks for the stemmed phrase in the metadata or text associated with
162	// an OID. This function has not been coded with all situations in mind
163	bool OID_phrase_search (mgsearchclass &mgsearch,
164	gdbmclass &gdbm,
165	const text_t &index,
166	const text_t &subcollection,
167	const text_t &language,
168	const text_t &longindex,
169	const text_t &collection,
170	const termfreqclassarray &phrase,
171	int docnum) {
172
173	// get OID
174	infodbclass docnum_info;
175	if (!gdbm.getinfo (docnum, docnum_info)) return false;
176	text_t &OID = docnum_info["section"];
177	if (OID.empty()) return false;
178
179	// disect the long index to find out where the text should come from
180	text_t gran, type;
181	text_t::const_iterator longindex_here = longindex.begin();
182	text_t::const_iterator longindex_end = longindex.end();
183	longindex_here = getdelimitstr (longindex_here, longindex_end, ':', gran);
184	longindex_here = getdelimitstr (longindex_here, longindex_end, ':', type);
185
186	if (gran.empty()) return false;
187
188	// note that we're treating indexes of type 'all' (i.e. text,Title,Creator)
189	// or other composite indexes that contain "text" as if they were simply 'text' indexes
190	if (type == "text" \|\| type == "all" \|\| findword(type.begin(),type.end(),"text")) {
191	char *doc = NULL;
192	int doclen = 0;
193
194	// get text from mg.
195	if (gran == "document") {
196
197	// if this is a document level index (which should only happen if
198	// there are no matching indexes with a finer granularity -- see
199	// mgqueryfilterclass::mg_parse_query_params) then we must do the
200	// phrase search on the entire document (i.e. all the sections)
201	// -- this is going to make a slow process even slower
202	vector<int> docnum_list; text_t fulldoc;
203	get_all_docnums (gdbm, OID, docnum_list);
204	vector<int>::const_iterator this_docnum = docnum_list.begin();
205	vector<int>::const_iterator end_docnum = docnum_list.end();
206	while (this_docnum != end_docnum) {
207	if (mgsearch.mgdocument (index, subcollection, language, collection,
208	*this_docnum, doc, doclen)) {
209	fulldoc.appendcstr (doc);
210	}
211	this_docnum ++;
212	}
213	doc = fulldoc.getcstr();
214	doclen = fulldoc.size();
215	bool rv = doc_phrase_search ((unsigned char *)doc, doclen, phrase);
216	delete doc;
217	return rv;
218
219	} else {
220
221	if (!mgsearch.mgdocument (index, subcollection, language, collection,
222	docnum, doc, doclen)) return false;
223	return doc_phrase_search ((unsigned char *)doc, doclen, phrase);
224	}
225	}
226
227	char *metadata = NULL;
228	text_t::size_type metadata_len = 0;
229	infodbclass OID_info;
230
231	// get field
232	if (!gdbm.getinfo (OID, OID_info)) return false;
233
234	bool result = false;
235	text_tarray *tarr_ptr = OID_info.getmultinfo (type);
236	if (tarr_ptr != NULL ) {
237	text_tarray::const_iterator subvalue_here = (*tarr_ptr).begin();
238	text_tarray::const_iterator subvalue_end = (*tarr_ptr).end();
239	while (subvalue_here != subvalue_end) {
240	if (subvalue_here != NULL) {
241	metadata = (to_utf8(*subvalue_here)).getcarr(metadata_len);
242	result = doc_phrase_search ((unsigned char *)metadata, metadata_len, phrase);
243	delete [] metadata;
244
245	if (result) return true;
246	}
247
248	subvalue_here++;
249	}
250	}
251
252	return result;
253	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: