source: trunk/gsdl/src/colservr/phrasesearch.cpp@ 2146

Last change on this file since 2146 was 2146, checked in by sjboddie, 23 years ago

Fixed a bug that was preventing phrase searching from working on
document level indexes

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.9 KB
Line 
1/**********************************************************************
2 *
3 * phrasesearch.cpp -- tools to search for a phrase in a larger text
4 * Copyright (C) 1999 DigiLib Systems Limited
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "phrasesearch.h"
27#include "gsdlunicode.h"
28
29inline unsigned char *parse_nonword_word (unsigned char *here, unsigned char *end,
30 text_t &word) {
31 int c_len = 0;
32 unsigned short c = 0;
33
34 word.clear();
35
36 // parse non word
37 while (here <= end) {
38 c_len = parse_utf8_char (here, end, &c);
39 if (c == '(') {
40 // found a note, look for '}'
41 while (here <= end && c != ')') {
42 c_len = parse_utf8_char (here, end, &c);
43 here += c_len;
44 }
45 }
46 if (c == '{') {
47 // found a composite character, look for '}'
48 while (here <= end && c != '}') {
49 c_len = parse_utf8_char (here, end, &c);
50 here += c_len;
51 }
52 }
53 if (is_unicode_letdig(c)) {
54 while (c_len > 0) {
55 // this is in a word
56 word.push_back(*here);
57 here++; c_len--;
58 }
59 break;
60 }
61 here += c_len;
62 }
63
64 // parse word
65 while (here <= end) {
66 c_len = parse_utf8_char (here, end, &c);
67 if (!is_unicode_letdig(c)) {
68 here += c_len; // it is ok to skip a nonword character
69 break;
70 }
71 while (c_len > 0) {
72 word.push_back(*here);
73 here++; c_len--;
74 }
75 }
76
77 return here;
78}
79
80static void get_all_docnums (gdbmclass &gdbm, text_t OID, vector<int> &docnum_list) {
81
82 infodbclass OID_info;
83
84 // get OID
85 if (!gdbm.getinfo (OID, OID_info)) return;
86 if (OID_info["hastxt"] == "1" && !OID_info["docnum"].empty()) {
87 docnum_list.push_back (OID_info["docnum"].getint());
88 }
89
90 // get contents set
91 if (OID_info["contains"].empty()) return;
92 text_tarray contains; text_t tmptext;
93 text_t::iterator contains_here = OID_info["contains"].begin();
94 text_t::iterator contains_end = OID_info["contains"].end();
95 while (contains_here != contains_end) {
96 if (*contains_here == '"') tmptext += OID;
97 else if (*contains_here == ';') {
98 if (!tmptext.empty()) contains.push_back (tmptext);
99 tmptext.clear();
100 } else tmptext.push_back(*contains_here);
101 contains_here++;
102 }
103 if (!tmptext.empty()) contains.push_back (tmptext);
104
105 text_tarray::const_iterator here = contains.begin();
106 text_tarray::const_iterator end = contains.end();
107 while (here != end) {
108 get_all_docnums (gdbm, *here, docnum_list);
109 here ++;
110 }
111}
112
113bool doc_phrase_search (unsigned char *doc, int doclen,
114 const termfreqclassarray &phrase) {
115 // note: this uses the most braindead search routine :-)
116 // however its not so bad as there shouldn't be many partial
117 // matches
118
119 // a null phrase matches anything
120 if (phrase.empty()) return true;
121
122 // if there is nothing then there can't be a match
123 if (doc == NULL || doclen == 0) return false;
124
125 text_t doc_word;
126 doc_word.reserve (16);
127
128 bool first = true;
129
130 unsigned char *doc_here = doc;
131 unsigned char *doc_herefirstword = doc;
132 unsigned char *doc_end = doc+doclen-1; // unitool conventions :-/
133
134 while (doc_here <= doc_end) {
135 first = true;
136
137 // there will be at least one member of phrase (see above)
138 termfreqclassarray::const_iterator phrase_here = phrase.begin();
139 termfreqclassarray::const_iterator phrase_end = phrase.end();
140 do {
141 // get the next non-word ... and ignore it, then get the next word
142 doc_here = parse_nonword_word (doc_here, doc_end, doc_word);
143 if (first) {doc_herefirstword = doc_here; first = false;}
144
145 // break if this word is not the next in the phrase
146 if ((*phrase_here).utf8equivterms.find (doc_word) ==
147 (*phrase_here).utf8equivterms.end()) break;
148
149 phrase_here++;
150 } while (doc_here <= doc_end && phrase_here != phrase_end);
151
152 // see if we found a phrase
153 if (phrase_here == phrase_end) return true;
154
155 doc_here = doc_herefirstword; // set the counter back
156 }
157
158 return false;
159}
160
161// looks for the stemmed phrase in the metadata or text associated with
162// an OID. This function has not been coded with all situations in mind
163bool OID_phrase_search (mgsearchclass &mgsearch,
164 gdbmclass &gdbm,
165 const text_t &index,
166 const text_t &subcollection,
167 const text_t &language,
168 const text_t &longindex,
169 const text_t &collection,
170 const termfreqclassarray &phrase,
171 int docnum) {
172
173 // get OID
174 infodbclass docnum_info;
175 if (!gdbm.getinfo (docnum, docnum_info)) return false;
176 text_t &OID = docnum_info["section"];
177 if (OID.empty()) return false;
178
179 // disect the long index to find out where the text should come from
180 text_t gran, type;
181 text_t::const_iterator longindex_here = longindex.begin();
182 text_t::const_iterator longindex_end = longindex.end();
183 longindex_here = getdelimitstr (longindex_here, longindex_end, ':', gran);
184 longindex_here = getdelimitstr (longindex_here, longindex_end, ':', type);
185
186 if (gran.empty()) return false;
187
188 // note that we're treating indexes of type 'all' (i.e. text,Title,Creator)
189 // or other composite indexes that contain "text" as if they were simply 'text' indexes
190 if (type == "text" || type == "all" || findword(type.begin(),type.end(),"text")) {
191 char *doc = NULL;
192 int doclen = 0;
193
194 // get text from mg.
195 if (gran == "document") {
196
197 // if this is a document level index (which should only happen if
198 // there are no matching indexes with a finer granularity -- see
199 // mgqueryfilterclass::mg_parse_query_params) then we must do the
200 // phrase search on the entire document (i.e. all the sections)
201 // -- this is going to make a slow process even slower
202 vector<int> docnum_list; text_t fulldoc;
203 get_all_docnums (gdbm, OID, docnum_list);
204 vector<int>::const_iterator this_docnum = docnum_list.begin();
205 vector<int>::const_iterator end_docnum = docnum_list.end();
206 while (this_docnum != end_docnum) {
207 if (mgsearch.mgdocument (index, subcollection, language, collection,
208 *this_docnum, doc, doclen)) {
209 fulldoc.appendcstr (doc);
210 }
211 this_docnum ++;
212 }
213 doc = fulldoc.getcstr();
214 doclen = fulldoc.size();
215 bool rv = doc_phrase_search ((unsigned char *)doc, doclen, phrase);
216 delete doc;
217 return rv;
218
219 } else {
220
221 if (!mgsearch.mgdocument (index, subcollection, language, collection,
222 docnum, doc, doclen)) return false;
223 return doc_phrase_search ((unsigned char *)doc, doclen, phrase);
224 }
225 }
226
227 char *metadata = NULL;
228 text_t::size_type metadata_len = 0;
229 infodbclass OID_info;
230
231 // get field
232 if (!gdbm.getinfo (OID, OID_info)) return false;
233
234 bool result = false;
235 text_tarray *tarr_ptr = OID_info.getmultinfo (type);
236 if (tarr_ptr != NULL ) {
237 text_tarray::const_iterator subvalue_here = (*tarr_ptr).begin();
238 text_tarray::const_iterator subvalue_end = (*tarr_ptr).end();
239 while (subvalue_here != subvalue_end) {
240 if (subvalue_here != NULL) {
241 metadata = (to_utf8(*subvalue_here)).getcarr(metadata_len);
242 result = doc_phrase_search ((unsigned char *)metadata, metadata_len, phrase);
243 delete [] metadata;
244
245 if (result) return true;
246 }
247
248 subvalue_here++;
249 }
250 }
251
252 return result;
253}
Note: See TracBrowser for help on using the repository browser.