source: trunk/gsdl/src/colservr/phrasesearch.cpp@ 333

Last change on this file since 333 was 333, checked in by rjmcnab, 25 years ago

fixed bug :-

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 4.2 KB
Line 
1/**********************************************************************
2 *
3 * phrasesearch.cpp -- tools to search for a phrase in a larger text
4 * Copyright (C) 1999 DigiLib Systems Limited
5 *
6 * PUT COPYRIGHT NOTICE HERE
7 *
8 * $Id: phrasesearch.cpp 333 1999-07-01 09:25:54Z rjmcnab $
9 *
10 *********************************************************************/
11
12/*
13 $Log$
14 Revision 1.2 1999/07/01 09:25:54 rjmcnab
15 fixed bug :-^
16
17 Revision 1.1 1999/07/01 04:01:46 rjmcnab
18 Initial revision.
19
20 */
21
22
23#include "phrasesearch.h"
24#include "gsdlunicode.h"
25
26inline unsigned char *parse_nonword_word (unsigned char *here, unsigned char *end,
27 text_t &word) {
28 int c_len = 0;
29 unsigned short c = 0;
30
31 word.clear();
32
33 // parse non word
34 while (here <= end) {
35 c_len = parse_utf8_char (here, end, &c);
36 if (is_unicode_letdig(c)) {
37 while (c_len > 0) {
38 word.push_back(*here);
39 here++; c_len--;
40 }
41 break;
42 }
43 here += c_len;
44 }
45
46 // parse word
47 while (here <= end) {
48 c_len = parse_utf8_char (here, end, &c);
49 if (!is_unicode_letdig(c)) {
50 here += c_len; // it is ok to skip a nonword character
51 break;
52 }
53 while (c_len > 0) {
54 word.push_back(*here);
55 here++; c_len--;
56 }
57 }
58
59 return here;
60}
61
62
63bool doc_phrase_search (unsigned char *doc, int doclen,
64 const vector<termfreqclass> &phrase) {
65 // note: this uses the most braindead search routine :-)
66 // however its not so bad as there shouldn't be many partial
67 // matches
68
69 // a null phrase matches anything
70 if (phrase.empty()) return true;
71
72 // if there is nothing then there can't be a match
73 if (doc == NULL || doclen == 0) return false;
74
75 text_t doc_word;
76 doc_word.reserve (16);
77
78 bool first = true;
79
80 unsigned char *doc_here = doc;
81 unsigned char *doc_herefirstword = doc;
82 unsigned char *doc_end = doc+doclen-1; // unitool conventions :-/
83
84 while (doc_here <= doc_end) {
85 first = true;
86
87 // there will be at least one member of phrase (see above)
88 vector<termfreqclass>::const_iterator phrase_here = phrase.begin();
89 vector<termfreqclass>::const_iterator phrase_end = phrase.end();
90 do {
91 // get the next non-word ... and ignore it, then get the next word
92 doc_here = parse_nonword_word (doc_here, doc_end, doc_word);
93 if (first) {doc_herefirstword = doc_here; first = false;}
94
95 // break if this word is not the next in the phrase
96 if ((*phrase_here).utf8equivterms.find (doc_word) ==
97 (*phrase_here).utf8equivterms.end()) break;
98
99 phrase_here++;
100 } while (doc_here <= doc_end && phrase_here != phrase_end);
101
102 // see if we found a phrase
103 if (phrase_here == phrase_end) return true;
104
105 doc_here = doc_herefirstword; // set the counter back
106 }
107
108 return false;
109}
110
111
112// looks for the stemmed phrase in the metadata or text associated with
113// an OID. This function has not been coded with all situations in mind
114bool OID_phrase_search (mgsearchclass &mgsearch,
115 gdbmclass &gdbm,
116 const text_t &index,
117 const text_t &longindex,
118 const text_t &collection,
119 const vector<termfreqclass> &phrase,
120 int docnum) {
121 // disect the long index to find out where the text should come from
122 text_t level, gran;
123 text_t::const_iterator longindex_here = longindex.begin();
124 text_t::const_iterator longindex_end = longindex.end();
125 longindex_here = getdelimitstr (longindex_here, longindex_end, ':', level);
126 longindex_here = getdelimitstr (longindex_here, longindex_end, ':', gran);
127
128 if (gran.empty()) return false;
129
130 if (gran == "text") {
131 char *doc = NULL;
132 int doclen = 0;
133
134 // get text from mg.
135 if (!mgsearch.mgdocument (index, collection, docnum, doc, doclen)) return false;
136 return doc_phrase_search ((unsigned char *)doc, doclen, phrase);
137 }
138
139 // get OID
140 char *metadata = NULL;
141 text_t::size_type metadata_len = 0;
142 infodbclass docnum_info;
143 infodbclass OID_info;
144
145 if (!gdbm.getinfo (docnum, docnum_info)) return false;
146 text_t &OID = docnum_info["section"];
147 if (OID.empty()) return false;
148
149 // get field
150 if (!gdbm.getinfo (OID, OID_info)) return false;
151
152 metadata = (to_utf8(OID_info[gran])).getcarr(metadata_len);
153 bool result = doc_phrase_search ((unsigned char *)metadata, metadata_len, phrase);
154 delete [] metadata;
155
156 return result;
157}
Note: See TracBrowser for help on using the repository browser.