source: trunk/gsdl/src/colservr/phrasesearch.cpp@ 351

Last change on this file since 351 was 351, checked in by rjmcnab, 25 years ago

Added ability to combine two or more independent queries.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 4.4 KB
Line 
1/**********************************************************************
2 *
3 * phrasesearch.cpp -- tools to search for a phrase in a larger text
4 * Copyright (C) 1999 DigiLib Systems Limited
5 *
6 * PUT COPYRIGHT NOTICE HERE
7 *
8 * $Id: phrasesearch.cpp 351 1999-07-07 06:19:47Z rjmcnab $
9 *
10 *********************************************************************/
11
12/*
13 $Log$
14 Revision 1.3 1999/07/07 06:19:45 rjmcnab
15 Added ability to combine two or more independent queries.
16
17 Revision 1.2 1999/07/01 09:25:54 rjmcnab
18 fixed bug :-^
19
20 Revision 1.1 1999/07/01 04:01:46 rjmcnab
21 Initial revision.
22
23 */
24
25
26#include "phrasesearch.h"
27#include "gsdlunicode.h"
28
29inline unsigned char *parse_nonword_word (unsigned char *here, unsigned char *end,
30 text_t &word) {
31 int c_len = 0;
32 unsigned short c = 0;
33
34 word.clear();
35
36 // parse non word
37 while (here <= end) {
38 c_len = parse_utf8_char (here, end, &c);
39 if (is_unicode_letdig(c)) {
40 while (c_len > 0) {
41 word.push_back(*here);
42 here++; c_len--;
43 }
44 break;
45 }
46 here += c_len;
47 }
48
49 // parse word
50 while (here <= end) {
51 c_len = parse_utf8_char (here, end, &c);
52 if (!is_unicode_letdig(c)) {
53 here += c_len; // it is ok to skip a nonword character
54 break;
55 }
56 while (c_len > 0) {
57 word.push_back(*here);
58 here++; c_len--;
59 }
60 }
61
62 return here;
63}
64
65
66bool doc_phrase_search (unsigned char *doc, int doclen,
67 const vector<termfreqclass> &phrase) {
68 // note: this uses the most braindead search routine :-)
69 // however its not so bad as there shouldn't be many partial
70 // matches
71
72 // a null phrase matches anything
73 if (phrase.empty()) return true;
74
75 // if there is nothing then there can't be a match
76 if (doc == NULL || doclen == 0) return false;
77
78 text_t doc_word;
79 doc_word.reserve (16);
80
81 bool first = true;
82
83 unsigned char *doc_here = doc;
84 unsigned char *doc_herefirstword = doc;
85 unsigned char *doc_end = doc+doclen-1; // unitool conventions :-/
86
87 while (doc_here <= doc_end) {
88 first = true;
89
90 // there will be at least one member of phrase (see above)
91 vector<termfreqclass>::const_iterator phrase_here = phrase.begin();
92 vector<termfreqclass>::const_iterator phrase_end = phrase.end();
93 do {
94 // get the next non-word ... and ignore it, then get the next word
95 doc_here = parse_nonword_word (doc_here, doc_end, doc_word);
96 if (first) {doc_herefirstword = doc_here; first = false;}
97
98 // break if this word is not the next in the phrase
99 if ((*phrase_here).utf8equivterms.find (doc_word) ==
100 (*phrase_here).utf8equivterms.end()) break;
101
102 phrase_here++;
103 } while (doc_here <= doc_end && phrase_here != phrase_end);
104
105 // see if we found a phrase
106 if (phrase_here == phrase_end) return true;
107
108 doc_here = doc_herefirstword; // set the counter back
109 }
110
111 return false;
112}
113
114
115// looks for the stemmed phrase in the metadata or text associated with
116// an OID. This function has not been coded with all situations in mind
117bool OID_phrase_search (mgsearchclass &mgsearch,
118 gdbmclass &gdbm,
119 const text_t &index,
120 const text_t &subcollection,
121 const text_t &language,
122 const text_t &longindex,
123 const text_t &collection,
124 const vector<termfreqclass> &phrase,
125 int docnum) {
126 // disect the long index to find out where the text should come from
127 text_t level, gran;
128 text_t::const_iterator longindex_here = longindex.begin();
129 text_t::const_iterator longindex_end = longindex.end();
130 longindex_here = getdelimitstr (longindex_here, longindex_end, ':', level);
131 longindex_here = getdelimitstr (longindex_here, longindex_end, ':', gran);
132
133 if (gran.empty()) return false;
134
135 if (gran == "text") {
136 char *doc = NULL;
137 int doclen = 0;
138
139 // get text from mg.
140 if (!mgsearch.mgdocument (index, subcollection, language, collection,
141 docnum, doc, doclen)) return false;
142 return doc_phrase_search ((unsigned char *)doc, doclen, phrase);
143 }
144
145 // get OID
146 char *metadata = NULL;
147 text_t::size_type metadata_len = 0;
148 infodbclass docnum_info;
149 infodbclass OID_info;
150
151 if (!gdbm.getinfo (docnum, docnum_info)) return false;
152 text_t &OID = docnum_info["section"];
153 if (OID.empty()) return false;
154
155 // get field
156 if (!gdbm.getinfo (OID, OID_info)) return false;
157
158 metadata = (to_utf8(OID_info[gran])).getcarr(metadata_len);
159 bool result = doc_phrase_search ((unsigned char *)metadata, metadata_len, phrase);
160 delete [] metadata;
161
162 return result;
163}
Note: See TracBrowser for help on using the repository browser.