source: trunk/gsdl/src/colservr/phrasesearch.cpp@ 1661

Last change on this file since 1661 was 1661, checked in by nzdl, 24 years ago

Fixed a couple of minor bugs in phrase searching code

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.8 KB
Line 
1/**********************************************************************
2 *
3 * phrasesearch.cpp -- tools to search for a phrase in a larger text
4 * Copyright (C) 1999 DigiLib Systems Limited
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "phrasesearch.h"
27#include "gsdlunicode.h"
28
29inline unsigned char *parse_nonword_word (unsigned char *here, unsigned char *end,
30 text_t &word) {
31 int c_len = 0;
32 unsigned short c = 0;
33
34 word.clear();
35
36 // parse non word
37 while (here <= end) {
38 c_len = parse_utf8_char (here, end, &c);
39 if (c == '(') {
40 // found a note, look for '}'
41 while (here <= end && c != ')') {
42 c_len = parse_utf8_char (here, end, &c);
43 here += c_len;
44 }
45 }
46 if (c == '{') {
47 // found a composite character, look for '}'
48 while (here <= end && c != '}') {
49 c_len = parse_utf8_char (here, end, &c);
50 here += c_len;
51 }
52 }
53 if (is_unicode_letdig(c)) {
54 while (c_len > 0) {
55 // this is in a word
56 word.push_back(*here);
57 here++; c_len--;
58 }
59 break;
60 }
61 here += c_len;
62 }
63
64 // parse word
65 while (here <= end) {
66 c_len = parse_utf8_char (here, end, &c);
67 if (!is_unicode_letdig(c)) {
68 here += c_len; // it is ok to skip a nonword character
69 break;
70 }
71 while (c_len > 0) {
72 word.push_back(*here);
73 here++; c_len--;
74 }
75 }
76
77 return here;
78}
79
80
81bool doc_phrase_search (unsigned char *doc, int doclen,
82 const termfreqclassarray &phrase) {
83 // note: this uses the most braindead search routine :-)
84 // however its not so bad as there shouldn't be many partial
85 // matches
86
87 // a null phrase matches anything
88 if (phrase.empty()) return true;
89
90 // if there is nothing then there can't be a match
91 if (doc == NULL || doclen == 0) return false;
92
93 text_t doc_word;
94 doc_word.reserve (16);
95
96 bool first = true;
97
98 unsigned char *doc_here = doc;
99 unsigned char *doc_herefirstword = doc;
100 unsigned char *doc_end = doc+doclen-1; // unitool conventions :-/
101
102 while (doc_here <= doc_end) {
103 first = true;
104
105 // there will be at least one member of phrase (see above)
106 termfreqclassarray::const_iterator phrase_here = phrase.begin();
107 termfreqclassarray::const_iterator phrase_end = phrase.end();
108 do {
109 // get the next non-word ... and ignore it, then get the next word
110 doc_here = parse_nonword_word (doc_here, doc_end, doc_word);
111 if (first) {doc_herefirstword = doc_here; first = false;}
112
113 // break if this word is not the next in the phrase
114 if ((*phrase_here).utf8equivterms.find (doc_word) ==
115 (*phrase_here).utf8equivterms.end()) break;
116
117 phrase_here++;
118 } while (doc_here <= doc_end && phrase_here != phrase_end);
119
120 // see if we found a phrase
121 if (phrase_here == phrase_end) return true;
122
123 doc_here = doc_herefirstword; // set the counter back
124 }
125
126 return false;
127}
128
129
130// looks for the stemmed phrase in the metadata or text associated with
131// an OID. This function has not been coded with all situations in mind
132bool OID_phrase_search (mgsearchclass &mgsearch,
133 gdbmclass &gdbm,
134 const text_t &index,
135 const text_t &subcollection,
136 const text_t &language,
137 const text_t &longindex,
138 const text_t &collection,
139 const termfreqclassarray &phrase,
140 int docnum) {
141 // disect the long index to find out where the text should come from
142 text_t level, gran;
143 text_t::const_iterator longindex_here = longindex.begin();
144 text_t::const_iterator longindex_end = longindex.end();
145 longindex_here = getdelimitstr (longindex_here, longindex_end, ':', level);
146 longindex_here = getdelimitstr (longindex_here, longindex_end, ':', gran);
147
148 if (gran.empty()) return false;
149
150 // note that we're treating indexes with granularity of 'all' (i.e. text,Title,Creator)
151 // as if they were simply 'text' indexes
152 if (gran == "text" || gran == "all") {
153 char *doc = NULL;
154 int doclen = 0;
155
156 // get text from mg.
157 if (!mgsearch.mgdocument (index, subcollection, language, collection,
158 docnum, doc, doclen)) return false;
159 return doc_phrase_search ((unsigned char *)doc, doclen, phrase);
160 }
161
162 // get OID
163 char *metadata = NULL;
164 text_t::size_type metadata_len = 0;
165 infodbclass docnum_info;
166 infodbclass OID_info;
167
168 if (!gdbm.getinfo (docnum, docnum_info)) return false;
169 text_t &OID = docnum_info["section"];
170 if (OID.empty()) return false;
171
172 // get field
173 if (!gdbm.getinfo (OID, OID_info)) return false;
174
175 bool result = false;
176 text_tarray *tarr_ptr = OID_info.getmultinfo (gran);
177 if (tarr_ptr != NULL ) {
178 text_tarray::const_iterator subvalue_here = (*tarr_ptr).begin();
179 text_tarray::const_iterator subvalue_end = (*tarr_ptr).end();
180 while (subvalue_here != subvalue_end) {
181 if (subvalue_here != NULL) {
182 metadata = (to_utf8(*subvalue_here)).getcarr(metadata_len);
183 result = doc_phrase_search ((unsigned char *)metadata, metadata_len, phrase);
184 delete [] metadata;
185
186 if (result) return true;
187 }
188
189 subvalue_here++;
190 }
191 }
192
193 return result;
194}
Note: See TracBrowser for help on using the repository browser.