Context Navigation

source: gsdl/trunk/src/colservr/phrasesearch.cpp@ 15757

Last change on this file since 15757 was 15558, checked in by mdewsnip, 16 years ago
(Adding new DB support) Changed lots of "gdbm"s to "db"s, in preparation for adding new DB types.
Property svn:executable set to `*`; Property svn:keywords set to `Author Date Id Revision`
File size: 8.1 KB

Line
1	/**********************************************************************
2	*
3	* phrasesearch.cpp -- tools to search for a phrase in a larger text
4	* Copyright (C) 1999 DigiLib Systems Limited
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26	#include "phrasesearch.h"
27	#include "gsdlunicode.h"
28
29	inline unsigned char parse_nonword_word (unsigned char here, unsigned char *end,
30	text_t &word) {
31	int c_len = 0;
32	unsigned short c = 0;
33
34	word.clear();
35
36	// parse non word
37	while (here <= end) {
38	c_len = parse_utf8_char (here, end, &c);
39	if (c == '(') {
40	// found a note, look for '}'
41	while (here <= end && c != ')') {
42	c_len = parse_utf8_char (here, end, &c);
43	here += c_len;
44	}
45	}
46	if (c == '{') {
47	// found a composite character, look for '}'
48	while (here <= end && c != '}') {
49	c_len = parse_utf8_char (here, end, &c);
50	here += c_len;
51	}
52	}
53	if (is_unicode_letdig(c)) {
54	while (c_len > 0) {
55	// this is in a word
56	word.push_back(*here);
57	++here; --c_len;
58	}
59	break;
60	}
61	here += c_len;
62	}
63
64	// parse word
65	while (here <= end) {
66	c_len = parse_utf8_char (here, end, &c);
67	if (!is_unicode_letdig(c)) {
68	here += c_len; // it is ok to skip a nonword character
69	break;
70	}
71	while (c_len > 0) {
72	word.push_back(*here);
73	++here; --c_len;
74	}
75	}
76
77	return here;
78	}
79
80	static void get_all_docnums (dbclass &db, text_t OID, vector<int> &docnum_list) {
81
82	infodbclass OID_info;
83
84	// get OID
85	if (!db.getinfo (OID, OID_info)) return;
86	if (OID_info["hastxt"] == "1" && !OID_info["docnum"].empty()) {
87	docnum_list.push_back (OID_info["docnum"].getint());
88	}
89
90	// get contents set
91	if (OID_info["contains"].empty()) return;
92	text_tarray contains; text_t tmptext;
93	text_t::iterator contains_here = OID_info["contains"].begin();
94	text_t::iterator contains_end = OID_info["contains"].end();
95	while (contains_here != contains_end) {
96	if (*contains_here == '"') tmptext += OID;
97	else if (*contains_here == ';') {
98	if (!tmptext.empty()) contains.push_back (tmptext);
99	tmptext.clear();
100	} else tmptext.push_back(*contains_here);
101	++contains_here;
102	}
103	if (!tmptext.empty()) contains.push_back (tmptext);
104
105	text_tarray::const_iterator here = contains.begin();
106	text_tarray::const_iterator end = contains.end();
107	while (here != end) {
108	get_all_docnums (db, *here, docnum_list);
109	++here;
110	}
111	}
112
113	bool doc_phrase_search (unsigned char *doc, int doclen,
114	const termfreqclassarray &phrase) {
115	// note: this uses the most braindead search routine :-)
116	// however its not so bad as there shouldn't be many partial
117	// matches
118
119	// a null phrase matches anything
120	if (phrase.empty()) return true;
121
122	// if there is nothing then there can't be a match
123	if (doc == NULL \|\| doclen == 0) return false;
124
125	text_t doc_word;
126	doc_word.reserve (16);
127
128	bool first = true;
129
130	unsigned char *doc_here = doc;
131	unsigned char *doc_herefirstword = doc;
132	unsigned char *doc_end = doc+doclen-1; // unitool conventions :-/
133
134	while (doc_here <= doc_end) {
135	first = true;
136
137	// there will be at least one member of phrase (see above)
138	termfreqclassarray::const_iterator phrase_here = phrase.begin();
139	termfreqclassarray::const_iterator phrase_end = phrase.end();
140	do {
141	// get the next non-word ... and ignore it, then get the next word
142	doc_here = parse_nonword_word (doc_here, doc_end, doc_word);
143	if (first) {doc_herefirstword = doc_here; first = false;}
144
145	// break if this word is not the next in the phrase
146	if ((*phrase_here).utf8equivterms.find (doc_word) ==
147	(*phrase_here).utf8equivterms.end()) break;
148
149	++phrase_here;
150	} while (doc_here <= doc_end && phrase_here != phrase_end);
151
152	// see if we found a phrase
153	if (phrase_here == phrase_end) return true;
154
155	doc_here = doc_herefirstword; // set the counter back
156	}
157
158	return false;
159	}
160
161	// looks for the stemmed phrase in the metadata or text associated with
162	// an OID. This function has not been coded with all situations in mind
163	bool OID_phrase_search (mgsearchclass &mgsearch,
164	dbclass &db,
165	const text_t &index,
166	const text_t &subcollection,
167	const text_t &language,
168	const text_t &longindex,
169	const text_t &collection,
170	const termfreqclassarray &phrase,
171	int docnum) {
172
173	// get OID
174	infodbclass docnum_info;
175	if (!db.getinfo (docnum, docnum_info)) return false;
176	text_t &OID = docnum_info["section"];
177	if (OID.empty()) return false;
178
179	// disect the long index to find out where the text should come from
180	text_t gran, type;
181	text_t::const_iterator longindex_here = longindex.begin();
182	text_t::const_iterator longindex_end = longindex.end();
183	longindex_here = getdelimitstr (longindex_here, longindex_end, ':', gran);
184	longindex_here = getdelimitstr (longindex_here, longindex_end, ':', type);
185
186	if (gran.empty()) return false;
187
188	// note that we're treating indexes of type 'all' (i.e. text,Title,Creator)
189	// or other composite indexes that contain "text" as if they were simply 'text' indexes
190	if ((type == "text") \|\| (type == "all") \|\| (findword(type.begin(),type.end(),"text") != type.end())) {
191	char *doc = NULL;
192	int doclen = 0;
193
194	// get text from mg.
195	if (gran == "document") {
196
197	// if this is a document level index (which should only happen if
198	// there are no matching indexes with a finer granularity -- see
199	// mgqueryfilterclass::mg_parse_query_params) then we must do the
200	// phrase search on the entire document (i.e. all the sections)
201	// -- this is going to make a slow process even slower
202	vector<int> docnum_list; text_t fulldoc;
203	get_all_docnums (db, OID, docnum_list);
204	vector<int>::const_iterator this_docnum = docnum_list.begin();
205	vector<int>::const_iterator end_docnum = docnum_list.end();
206	while (this_docnum != end_docnum) {
207	if (mgsearch.mgdocument (index, subcollection, language, collection,
208	*this_docnum, doc, doclen)) {
209	fulldoc.appendcstr (doc);
210	}
211	++this_docnum;
212	}
213	doc = fulldoc.getcstr();
214	doclen = fulldoc.size();
215	bool rv = doc_phrase_search ((unsigned char *)doc, doclen, phrase);
216	delete []doc;
217	return rv;
218
219	} else {
220
221	if (!mgsearch.mgdocument (index, subcollection, language, collection,
222	docnum, doc, doclen)) return false;
223	return doc_phrase_search ((unsigned char *)doc, doclen, phrase);
224	}
225	}
226
227	char *metadata = NULL;
228	text_t::size_type metadata_len = 0;
229	infodbclass OID_info;
230
231	// get field
232	if (!db.getinfo (OID, OID_info)) return false;
233
234	bool result = false;
235
236	// need to look through all the metadata values in the index
237	text_tarray keys;
238	splitchar(type.begin(), type.end(), ',', keys);
239
240	text_tarray::const_iterator keyhere = keys.begin();
241	text_tarray::const_iterator keyend = keys.end();
242	while (keyhere != keyend) {
243	text_tarray tarr_ptr = OID_info.getmultinfo (keyhere);
244	if (tarr_ptr != NULL ) {
245	text_tarray::const_iterator subvalue_here = (*tarr_ptr).begin();
246	text_tarray::const_iterator subvalue_end = (*tarr_ptr).end();
247	while (subvalue_here != subvalue_end) {
248	metadata = (to_utf8(*subvalue_here)).getcarr(metadata_len);
249	result = doc_phrase_search ((unsigned char *)metadata, metadata_len, phrase);
250	delete [] metadata;
251
252	if (result) return true;
253	++subvalue_here;
254	}
255	}
256	++keyhere;
257	}
258	return result;
259	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: