source: main/trunk/greenstone2/runtime-src/src/colservr/phrasesearch.cpp@ 27065

Last change on this file since 27065 was 21324, checked in by ak19, 14 years ago

Changes to makefiles, configure files, and source code to work with the new configure flags that allow indexers to be individually compiled up by setting each indexer to be enabled or disabled (enable-mg, enable-mgpp, enable-lucene)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.1 KB
RevLine 
[328]1/**********************************************************************
2 *
3 * phrasesearch.cpp -- tools to search for a phrase in a larger text
4 * Copyright (C) 1999 DigiLib Systems Limited
5 *
[534]6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
[328]9 *
[534]10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
[328]24 *********************************************************************/
25
26#include "phrasesearch.h"
27#include "gsdlunicode.h"
28
29inline unsigned char *parse_nonword_word (unsigned char *here, unsigned char *end,
30 text_t &word) {
31 int c_len = 0;
32 unsigned short c = 0;
33
34 word.clear();
35
36 // parse non word
37 while (here <= end) {
38 c_len = parse_utf8_char (here, end, &c);
[500]39 if (c == '(') {
40 // found a note, look for '}'
41 while (here <= end && c != ')') {
42 c_len = parse_utf8_char (here, end, &c);
43 here += c_len;
44 }
45 }
46 if (c == '{') {
47 // found a composite character, look for '}'
48 while (here <= end && c != '}') {
49 c_len = parse_utf8_char (here, end, &c);
50 here += c_len;
51 }
52 }
[328]53 if (is_unicode_letdig(c)) {
[333]54 while (c_len > 0) {
[500]55 // this is in a word
[333]56 word.push_back(*here);
[9620]57 ++here; --c_len;
[333]58 }
[328]59 break;
60 }
[333]61 here += c_len;
[328]62 }
63
64 // parse word
65 while (here <= end) {
66 c_len = parse_utf8_char (here, end, &c);
[333]67 if (!is_unicode_letdig(c)) {
68 here += c_len; // it is ok to skip a nonword character
69 break;
70 }
71 while (c_len > 0) {
72 word.push_back(*here);
[9620]73 ++here; --c_len;
[333]74 }
[328]75 }
76
77 return here;
78}
79
[15558]80static void get_all_docnums (dbclass &db, text_t OID, vector<int> &docnum_list) {
[328]81
[2146]82 infodbclass OID_info;
83
84 // get OID
[15558]85 if (!db.getinfo (OID, OID_info)) return;
[2146]86 if (OID_info["hastxt"] == "1" && !OID_info["docnum"].empty()) {
87 docnum_list.push_back (OID_info["docnum"].getint());
88 }
89
90 // get contents set
91 if (OID_info["contains"].empty()) return;
92 text_tarray contains; text_t tmptext;
93 text_t::iterator contains_here = OID_info["contains"].begin();
94 text_t::iterator contains_end = OID_info["contains"].end();
95 while (contains_here != contains_end) {
96 if (*contains_here == '"') tmptext += OID;
97 else if (*contains_here == ';') {
98 if (!tmptext.empty()) contains.push_back (tmptext);
99 tmptext.clear();
100 } else tmptext.push_back(*contains_here);
[9620]101 ++contains_here;
[2146]102 }
103 if (!tmptext.empty()) contains.push_back (tmptext);
104
105 text_tarray::const_iterator here = contains.begin();
106 text_tarray::const_iterator end = contains.end();
107 while (here != end) {
[15558]108 get_all_docnums (db, *here, docnum_list);
[9620]109 ++here;
[2146]110 }
111}
112
[21324]113#ifdef ENABLE_MG
[328]114bool doc_phrase_search (unsigned char *doc, int doclen,
[395]115 const termfreqclassarray &phrase) {
[328]116 // note: this uses the most braindead search routine :-)
117 // however its not so bad as there shouldn't be many partial
118 // matches
119
120 // a null phrase matches anything
121 if (phrase.empty()) return true;
122
123 // if there is nothing then there can't be a match
124 if (doc == NULL || doclen == 0) return false;
125
126 text_t doc_word;
127 doc_word.reserve (16);
128
129 bool first = true;
130
131 unsigned char *doc_here = doc;
132 unsigned char *doc_herefirstword = doc;
133 unsigned char *doc_end = doc+doclen-1; // unitool conventions :-/
134
135 while (doc_here <= doc_end) {
136 first = true;
[2146]137
[328]138 // there will be at least one member of phrase (see above)
[395]139 termfreqclassarray::const_iterator phrase_here = phrase.begin();
140 termfreqclassarray::const_iterator phrase_end = phrase.end();
[328]141 do {
142 // get the next non-word ... and ignore it, then get the next word
143 doc_here = parse_nonword_word (doc_here, doc_end, doc_word);
144 if (first) {doc_herefirstword = doc_here; first = false;}
145
146 // break if this word is not the next in the phrase
147 if ((*phrase_here).utf8equivterms.find (doc_word) ==
148 (*phrase_here).utf8equivterms.end()) break;
149
[9620]150 ++phrase_here;
[328]151 } while (doc_here <= doc_end && phrase_here != phrase_end);
152
153 // see if we found a phrase
154 if (phrase_here == phrase_end) return true;
155
156 doc_here = doc_herefirstword; // set the counter back
157 }
158
159 return false;
160}
161
162// looks for the stemmed phrase in the metadata or text associated with
163// an OID. This function has not been coded with all situations in mind
164bool OID_phrase_search (mgsearchclass &mgsearch,
[15558]165 dbclass &db,
[328]166 const text_t &index,
[351]167 const text_t &subcollection,
168 const text_t &language,
[328]169 const text_t &longindex,
170 const text_t &collection,
[395]171 const termfreqclassarray &phrase,
[328]172 int docnum) {
[2146]173
174 // get OID
175 infodbclass docnum_info;
[15558]176 if (!db.getinfo (docnum, docnum_info)) return false;
[2146]177 text_t &OID = docnum_info["section"];
178 if (OID.empty()) return false;
179
[328]180 // disect the long index to find out where the text should come from
[2146]181 text_t gran, type;
[328]182 text_t::const_iterator longindex_here = longindex.begin();
183 text_t::const_iterator longindex_end = longindex.end();
184 longindex_here = getdelimitstr (longindex_here, longindex_end, ':', gran);
[2146]185 longindex_here = getdelimitstr (longindex_here, longindex_end, ':', type);
[328]186
187 if (gran.empty()) return false;
188
[2146]189 // note that we're treating indexes of type 'all' (i.e. text,Title,Creator)
190 // or other composite indexes that contain "text" as if they were simply 'text' indexes
[2937]191 if ((type == "text") || (type == "all") || (findword(type.begin(),type.end(),"text") != type.end())) {
[328]192 char *doc = NULL;
193 int doclen = 0;
194
195 // get text from mg.
[2146]196 if (gran == "document") {
[328]197
[2146]198 // if this is a document level index (which should only happen if
199 // there are no matching indexes with a finer granularity -- see
200 // mgqueryfilterclass::mg_parse_query_params) then we must do the
201 // phrase search on the entire document (i.e. all the sections)
202 // -- this is going to make a slow process even slower
203 vector<int> docnum_list; text_t fulldoc;
[15558]204 get_all_docnums (db, OID, docnum_list);
[2146]205 vector<int>::const_iterator this_docnum = docnum_list.begin();
206 vector<int>::const_iterator end_docnum = docnum_list.end();
207 while (this_docnum != end_docnum) {
208 if (mgsearch.mgdocument (index, subcollection, language, collection,
209 *this_docnum, doc, doclen)) {
210 fulldoc.appendcstr (doc);
211 }
[9620]212 ++this_docnum;
[2146]213 }
214 doc = fulldoc.getcstr();
215 doclen = fulldoc.size();
216 bool rv = doc_phrase_search ((unsigned char *)doc, doclen, phrase);
[9631]217 delete []doc;
[2146]218 return rv;
219
220 } else {
221
222 if (!mgsearch.mgdocument (index, subcollection, language, collection,
223 docnum, doc, doclen)) return false;
224 return doc_phrase_search ((unsigned char *)doc, doclen, phrase);
225 }
226 }
227
[328]228 char *metadata = NULL;
229 text_t::size_type metadata_len = 0;
230 infodbclass OID_info;
231
232 // get field
[15558]233 if (!db.getinfo (OID, OID_info)) return false;
[328]234
[500]235 bool result = false;
236
[5141]237 // need to look through all the metadata values in the index
238 text_tarray keys;
239 splitchar(type.begin(), type.end(), ',', keys);
240
241 text_tarray::const_iterator keyhere = keys.begin();
242 text_tarray::const_iterator keyend = keys.end();
243 while (keyhere != keyend) {
244 text_tarray *tarr_ptr = OID_info.getmultinfo (*keyhere);
245 if (tarr_ptr != NULL ) {
246 text_tarray::const_iterator subvalue_here = (*tarr_ptr).begin();
247 text_tarray::const_iterator subvalue_end = (*tarr_ptr).end();
248 while (subvalue_here != subvalue_end) {
249 metadata = (to_utf8(*subvalue_here)).getcarr(metadata_len);
250 result = doc_phrase_search ((unsigned char *)metadata, metadata_len, phrase);
251 delete [] metadata;
252
253 if (result) return true;
[9620]254 ++subvalue_here;
[5141]255 }
[500]256 }
[9620]257 ++keyhere;
[500]258 }
[328]259 return result;
260}
[21324]261#endif
Note: See TracBrowser for help on using the repository browser.