source: main/tags/2.10/gsdl/src/colservr/phrasesearch.cpp@ 32704

Last change on this file since 32704 was 534, checked in by sjboddie, 25 years ago

added gpl notice

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.1 KB
Line 
1/**********************************************************************
2 *
3 * phrasesearch.cpp -- tools to search for a phrase in a larger text
4 * Copyright (C) 1999 DigiLib Systems Limited
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: phrasesearch.cpp 534 1999-09-07 04:57:43Z sjboddie $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.6 1999/09/07 04:57:23 sjboddie
31 added gpl notice
32
33 Revision 1.5 1999/08/31 22:45:12 rjmcnab
34 fixed small problem
35
36 Revision 1.4 1999/07/16 00:15:48 sjboddie
37 changed to use termfreqclassarray type
38
39 Revision 1.3 1999/07/07 06:19:45 rjmcnab
40 Added ability to combine two or more independent queries.
41
42 Revision 1.2 1999/07/01 09:25:54 rjmcnab
43 fixed bug :-^
44
45 Revision 1.1 1999/07/01 04:01:46 rjmcnab
46 Initial revision.
47
48 */
49
50
51#include "phrasesearch.h"
52#include "gsdlunicode.h"
53
54inline unsigned char *parse_nonword_word (unsigned char *here, unsigned char *end,
55 text_t &word) {
56 int c_len = 0;
57 unsigned short c = 0;
58
59 word.clear();
60
61 // parse non word
62 while (here <= end) {
63 c_len = parse_utf8_char (here, end, &c);
64 if (c == '(') {
65 // found a note, look for '}'
66 while (here <= end && c != ')') {
67 c_len = parse_utf8_char (here, end, &c);
68 here += c_len;
69 }
70 }
71 if (c == '{') {
72 // found a composite character, look for '}'
73 while (here <= end && c != '}') {
74 c_len = parse_utf8_char (here, end, &c);
75 here += c_len;
76 }
77 }
78 if (is_unicode_letdig(c)) {
79 while (c_len > 0) {
80 // this is in a word
81 word.push_back(*here);
82 here++; c_len--;
83 }
84 break;
85 }
86 here += c_len;
87 }
88
89 // parse word
90 while (here <= end) {
91 c_len = parse_utf8_char (here, end, &c);
92 if (!is_unicode_letdig(c)) {
93 here += c_len; // it is ok to skip a nonword character
94 break;
95 }
96 while (c_len > 0) {
97 word.push_back(*here);
98 here++; c_len--;
99 }
100 }
101
102 return here;
103}
104
105
106bool doc_phrase_search (unsigned char *doc, int doclen,
107 const termfreqclassarray &phrase) {
108 // note: this uses the most braindead search routine :-)
109 // however its not so bad as there shouldn't be many partial
110 // matches
111
112 // a null phrase matches anything
113 if (phrase.empty()) return true;
114
115 // if there is nothing then there can't be a match
116 if (doc == NULL || doclen == 0) return false;
117
118 text_t doc_word;
119 doc_word.reserve (16);
120
121 bool first = true;
122
123 unsigned char *doc_here = doc;
124 unsigned char *doc_herefirstword = doc;
125 unsigned char *doc_end = doc+doclen-1; // unitool conventions :-/
126
127 while (doc_here <= doc_end) {
128 first = true;
129
130 // there will be at least one member of phrase (see above)
131 termfreqclassarray::const_iterator phrase_here = phrase.begin();
132 termfreqclassarray::const_iterator phrase_end = phrase.end();
133 do {
134 // get the next non-word ... and ignore it, then get the next word
135 doc_here = parse_nonword_word (doc_here, doc_end, doc_word);
136 if (first) {doc_herefirstword = doc_here; first = false;}
137
138 // break if this word is not the next in the phrase
139 if ((*phrase_here).utf8equivterms.find (doc_word) ==
140 (*phrase_here).utf8equivterms.end()) break;
141
142 phrase_here++;
143 } while (doc_here <= doc_end && phrase_here != phrase_end);
144
145 // see if we found a phrase
146 if (phrase_here == phrase_end) return true;
147
148 doc_here = doc_herefirstword; // set the counter back
149 }
150
151 return false;
152}
153
154
155// looks for the stemmed phrase in the metadata or text associated with
156// an OID. This function has not been coded with all situations in mind
157bool OID_phrase_search (mgsearchclass &mgsearch,
158 gdbmclass &gdbm,
159 const text_t &index,
160 const text_t &subcollection,
161 const text_t &language,
162 const text_t &longindex,
163 const text_t &collection,
164 const termfreqclassarray &phrase,
165 int docnum) {
166 // disect the long index to find out where the text should come from
167 text_t level, gran;
168 text_t::const_iterator longindex_here = longindex.begin();
169 text_t::const_iterator longindex_end = longindex.end();
170 longindex_here = getdelimitstr (longindex_here, longindex_end, ':', level);
171 longindex_here = getdelimitstr (longindex_here, longindex_end, ':', gran);
172
173 if (gran.empty()) return false;
174
175 if (gran == "text") {
176 char *doc = NULL;
177 int doclen = 0;
178
179 // get text from mg.
180 if (!mgsearch.mgdocument (index, subcollection, language, collection,
181 docnum, doc, doclen)) return false;
182 return doc_phrase_search ((unsigned char *)doc, doclen, phrase);
183 }
184
185 // get OID
186 char *metadata = NULL;
187 text_t::size_type metadata_len = 0;
188 infodbclass docnum_info;
189 infodbclass OID_info;
190
191 if (!gdbm.getinfo (docnum, docnum_info)) return false;
192 text_t &OID = docnum_info["section"];
193 if (OID.empty()) return false;
194
195 // get field
196 if (!gdbm.getinfo (OID, OID_info)) return false;
197
198 bool result = false;
199 text_tarray *tarr_ptr = OID_info.getmultinfo (gran);
200 if (tarr_ptr != NULL ) {
201 text_tarray::const_iterator subvalue_here = (*tarr_ptr).begin();
202 text_tarray::const_iterator subvalue_end = (*tarr_ptr).end();
203 while (subvalue_here != subvalue_end) {
204 if (subvalue_here != NULL) {
205 metadata = (to_utf8(*subvalue_here)).getcarr(metadata_len);
206 result = doc_phrase_search ((unsigned char *)metadata, metadata_len, phrase);
207 delete [] metadata;
208
209 if (result) return true;
210 }
211
212 subvalue_here++;
213 }
214 }
215
216 return result;
217}
Note: See TracBrowser for help on using the repository browser.