source: trunk/gsdl/src/colservr/phrasesearch.cpp@ 500

Last change on this file since 500 was 500, checked in by rjmcnab, 25 years ago

fixed small problem

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.2 KB
Line 
1/**********************************************************************
2 *
3 * phrasesearch.cpp -- tools to search for a phrase in a larger text
4 * Copyright (C) 1999 DigiLib Systems Limited
5 *
6 * PUT COPYRIGHT NOTICE HERE
7 *
8 * $Id: phrasesearch.cpp 500 1999-08-31 22:45:12Z rjmcnab $
9 *
10 *********************************************************************/
11
12/*
13 $Log$
14 Revision 1.5 1999/08/31 22:45:12 rjmcnab
15 fixed small problem
16
17 Revision 1.4 1999/07/16 00:15:48 sjboddie
18 changed to use termfreqclassarray type
19
20 Revision 1.3 1999/07/07 06:19:45 rjmcnab
20 Added ability to combine two or more independent queries.
22
23 Revision 1.2 1999/07/01 09:25:54 rjmcnab
24 fixed bug :-^
25
26 Revision 1.1 1999/07/01 04:01:46 rjmcnab
27 Initial revision.
28
29 */
30
31
32#include "phrasesearch.h"
33#include "gsdlunicode.h"
34
35inline unsigned char *parse_nonword_word (unsigned char *here, unsigned char *end,
36 text_t &word) {
37 int c_len = 0;
38 unsigned short c = 0;
39
40 word.clear();
41
42 // parse non word
43 while (here <= end) {
44 c_len = parse_utf8_char (here, end, &c);
45 if (c == '(') {
46 // found a note, look for '}'
47 while (here <= end && c != ')') {
48 c_len = parse_utf8_char (here, end, &c);
49 here += c_len;
50 }
51 }
52 if (c == '{') {
53 // found a composite character, look for '}'
54 while (here <= end && c != '}') {
55 c_len = parse_utf8_char (here, end, &c);
56 here += c_len;
57 }
58 }
59 if (is_unicode_letdig(c)) {
60 while (c_len > 0) {
61 // this is in a word
62 word.push_back(*here);
63 here++; c_len--;
64 }
65 break;
66 }
67 here += c_len;
68 }
69
70 // parse word
71 while (here <= end) {
72 c_len = parse_utf8_char (here, end, &c);
73 if (!is_unicode_letdig(c)) {
74 here += c_len; // it is ok to skip a nonword character
75 break;
76 }
77 while (c_len > 0) {
78 word.push_back(*here);
79 here++; c_len--;
80 }
81 }
82
83 return here;
84}
85
86
87bool doc_phrase_search (unsigned char *doc, int doclen,
88 const termfreqclassarray &phrase) {
89 // note: this uses the most braindead search routine :-)
90 // however its not so bad as there shouldn't be many partial
91 // matches
92
93 // a null phrase matches anything
94 if (phrase.empty()) return true;
95
96 // if there is nothing then there can't be a match
97 if (doc == NULL || doclen == 0) return false;
98
99 text_t doc_word;
100 doc_word.reserve (16);
101
102 bool first = true;
103
104 unsigned char *doc_here = doc;
105 unsigned char *doc_herefirstword = doc;
106 unsigned char *doc_end = doc+doclen-1; // unitool conventions :-/
107
108 while (doc_here <= doc_end) {
109 first = true;
110
111 // there will be at least one member of phrase (see above)
112 termfreqclassarray::const_iterator phrase_here = phrase.begin();
113 termfreqclassarray::const_iterator phrase_end = phrase.end();
114 do {
115 // get the next non-word ... and ignore it, then get the next word
116 doc_here = parse_nonword_word (doc_here, doc_end, doc_word);
117 if (first) {doc_herefirstword = doc_here; first = false;}
118
119 // break if this word is not the next in the phrase
120 if ((*phrase_here).utf8equivterms.find (doc_word) ==
121 (*phrase_here).utf8equivterms.end()) break;
122
123 phrase_here++;
124 } while (doc_here <= doc_end && phrase_here != phrase_end);
125
126 // see if we found a phrase
127 if (phrase_here == phrase_end) return true;
128
129 doc_here = doc_herefirstword; // set the counter back
130 }
131
132 return false;
133}
134
135
136// looks for the stemmed phrase in the metadata or text associated with
137// an OID. This function has not been coded with all situations in mind
138bool OID_phrase_search (mgsearchclass &mgsearch,
139 gdbmclass &gdbm,
140 const text_t &index,
141 const text_t &subcollection,
142 const text_t &language,
143 const text_t &longindex,
144 const text_t &collection,
145 const termfreqclassarray &phrase,
146 int docnum) {
147 // disect the long index to find out where the text should come from
148 text_t level, gran;
149 text_t::const_iterator longindex_here = longindex.begin();
150 text_t::const_iterator longindex_end = longindex.end();
151 longindex_here = getdelimitstr (longindex_here, longindex_end, ':', level);
152 longindex_here = getdelimitstr (longindex_here, longindex_end, ':', gran);
153
154 if (gran.empty()) return false;
155
156 if (gran == "text") {
157 char *doc = NULL;
158 int doclen = 0;
159
160 // get text from mg.
161 if (!mgsearch.mgdocument (index, subcollection, language, collection,
162 docnum, doc, doclen)) return false;
163 return doc_phrase_search ((unsigned char *)doc, doclen, phrase);
164 }
165
166 // get OID
167 char *metadata = NULL;
168 text_t::size_type metadata_len = 0;
169 infodbclass docnum_info;
170 infodbclass OID_info;
171
172 if (!gdbm.getinfo (docnum, docnum_info)) return false;
173 text_t &OID = docnum_info["section"];
174 if (OID.empty()) return false;
175
176 // get field
177 if (!gdbm.getinfo (OID, OID_info)) return false;
178
179 bool result = false;
180 text_tarray *tarr_ptr = OID_info.getmultinfo (gran);
181 if (tarr_ptr != NULL ) {
182 text_tarray::const_iterator subvalue_here = (*tarr_ptr).begin();
183 text_tarray::const_iterator subvalue_end = (*tarr_ptr).end();
184 while (subvalue_here != subvalue_end) {
185 if (subvalue_here != NULL) {
186 metadata = (to_utf8(*subvalue_here)).getcarr(metadata_len);
187 result = doc_phrase_search ((unsigned char *)metadata, metadata_len, phrase);
188 delete [] metadata;
189
190 if (result) return true;
191 }
192
193 subvalue_here++;
194 }
195 }
196
197 return result;
198}
Note: See TracBrowser for help on using the repository browser.