source: trunk/gsdl/src/colservr/phrasesearch.cpp@ 395

Last change on this file since 395 was 395, checked in by sjboddie, 25 years ago

changed to use termfreqclassarray type

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 4.4 KB
Line 
1/**********************************************************************
2 *
3 * phrasesearch.cpp -- tools to search for a phrase in a larger text
4 * Copyright (C) 1999 DigiLib Systems Limited
5 *
6 * PUT COPYRIGHT NOTICE HERE
7 *
8 * $Id: phrasesearch.cpp 395 1999-07-16 00:15:49Z sjboddie $
9 *
10 *********************************************************************/
11
12/*
13 $Log$
14 Revision 1.4 1999/07/16 00:15:48 sjboddie
15 changed to use termfreqclassarray type
16
17 Revision 1.3 1999/07/07 06:19:45 rjmcnab
18 Added ability to combine two or more independent queries.
19
20 Revision 1.2 1999/07/01 09:25:54 rjmcnab
21 fixed bug :-^
22
23 Revision 1.1 1999/07/01 04:01:46 rjmcnab
24 Initial revision.
25
26 */
27
28
29#include "phrasesearch.h"
30#include "gsdlunicode.h"
31
32inline unsigned char *parse_nonword_word (unsigned char *here, unsigned char *end,
33 text_t &word) {
34 int c_len = 0;
35 unsigned short c = 0;
36
37 word.clear();
38
39 // parse non word
40 while (here <= end) {
41 c_len = parse_utf8_char (here, end, &c);
42 if (is_unicode_letdig(c)) {
43 while (c_len > 0) {
44 word.push_back(*here);
45 here++; c_len--;
46 }
47 break;
48 }
49 here += c_len;
50 }
51
52 // parse word
53 while (here <= end) {
54 c_len = parse_utf8_char (here, end, &c);
55 if (!is_unicode_letdig(c)) {
56 here += c_len; // it is ok to skip a nonword character
57 break;
58 }
59 while (c_len > 0) {
60 word.push_back(*here);
61 here++; c_len--;
62 }
63 }
64
65 return here;
66}
67
68
69bool doc_phrase_search (unsigned char *doc, int doclen,
70 const termfreqclassarray &phrase) {
71 // note: this uses the most braindead search routine :-)
72 // however its not so bad as there shouldn't be many partial
73 // matches
74
75 // a null phrase matches anything
76 if (phrase.empty()) return true;
77
78 // if there is nothing then there can't be a match
79 if (doc == NULL || doclen == 0) return false;
80
81 text_t doc_word;
82 doc_word.reserve (16);
83
84 bool first = true;
85
86 unsigned char *doc_here = doc;
87 unsigned char *doc_herefirstword = doc;
88 unsigned char *doc_end = doc+doclen-1; // unitool conventions :-/
89
90 while (doc_here <= doc_end) {
91 first = true;
92
93 // there will be at least one member of phrase (see above)
94 termfreqclassarray::const_iterator phrase_here = phrase.begin();
95 termfreqclassarray::const_iterator phrase_end = phrase.end();
96 do {
97 // get the next non-word ... and ignore it, then get the next word
98 doc_here = parse_nonword_word (doc_here, doc_end, doc_word);
99 if (first) {doc_herefirstword = doc_here; first = false;}
100
101 // break if this word is not the next in the phrase
102 if ((*phrase_here).utf8equivterms.find (doc_word) ==
103 (*phrase_here).utf8equivterms.end()) break;
104
105 phrase_here++;
106 } while (doc_here <= doc_end && phrase_here != phrase_end);
107
108 // see if we found a phrase
109 if (phrase_here == phrase_end) return true;
110
111 doc_here = doc_herefirstword; // set the counter back
112 }
113
114 return false;
115}
116
117
118// looks for the stemmed phrase in the metadata or text associated with
119// an OID. This function has not been coded with all situations in mind
120bool OID_phrase_search (mgsearchclass &mgsearch,
121 gdbmclass &gdbm,
122 const text_t &index,
123 const text_t &subcollection,
124 const text_t &language,
125 const text_t &longindex,
126 const text_t &collection,
127 const termfreqclassarray &phrase,
128 int docnum) {
129 // disect the long index to find out where the text should come from
130 text_t level, gran;
131 text_t::const_iterator longindex_here = longindex.begin();
132 text_t::const_iterator longindex_end = longindex.end();
133 longindex_here = getdelimitstr (longindex_here, longindex_end, ':', level);
134 longindex_here = getdelimitstr (longindex_here, longindex_end, ':', gran);
135
136 if (gran.empty()) return false;
137
138 if (gran == "text") {
139 char *doc = NULL;
140 int doclen = 0;
141
142 // get text from mg.
143 if (!mgsearch.mgdocument (index, subcollection, language, collection,
144 docnum, doc, doclen)) return false;
145 return doc_phrase_search ((unsigned char *)doc, doclen, phrase);
146 }
147
148 // get OID
149 char *metadata = NULL;
150 text_t::size_type metadata_len = 0;
151 infodbclass docnum_info;
152 infodbclass OID_info;
153
154 if (!gdbm.getinfo (docnum, docnum_info)) return false;
155 text_t &OID = docnum_info["section"];
156 if (OID.empty()) return false;
157
158 // get field
159 if (!gdbm.getinfo (OID, OID_info)) return false;
160
161 metadata = (to_utf8(OID_info[gran])).getcarr(metadata_len);
162 bool result = doc_phrase_search ((unsigned char *)metadata, metadata_len, phrase);
163 delete [] metadata;
164
165 return result;
166}
Note: See TracBrowser for help on using the repository browser.