source: trunk/gsdl/src/colservr/phrasesearch.cpp@ 328

Last change on this file since 328 was 328, checked in by rjmcnab, 25 years ago

Initial revision.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 4.0 KB
Line 
1/**********************************************************************
2 *
3 * phrasesearch.cpp -- tools to search for a phrase in a larger text
4 * Copyright (C) 1999 DigiLib Systems Limited
5 *
6 * PUT COPYRIGHT NOTICE HERE
7 *
8 * $Id: phrasesearch.cpp 328 1999-07-01 04:01:47Z rjmcnab $
9 *
10 *********************************************************************/
11
12/*
13 $Log$
14 Revision 1.1 1999/07/01 04:01:46 rjmcnab
15 Initial revision.
16
17 */
18
19
20#include "phrasesearch.h"
21#include "gsdlunicode.h"
22
23inline unsigned char *parse_nonword_word (unsigned char *here, unsigned char *end,
24 text_t &word) {
25 int c_len = 0;
26 unsigned short c = 0;
27
28 word.clear();
29
30 // parse non word
31 while (here <= end) {
32 c_len = parse_utf8_char (here, end, &c);
33 here += c_len;
34 if (is_unicode_letdig(c)) {
35 word.push_back(c);
36 break;
37 }
38 }
39
40 // parse word
41 while (here <= end) {
42 c_len = parse_utf8_char (here, end, &c);
43 here += c_len; // it is ok to skip a nonword character
44 if (!is_unicode_letdig(c)) break;
45 word.push_back(c);
46 }
47
48 return here;
49}
50
51
52bool doc_phrase_search (unsigned char *doc, int doclen,
53 const vector<termfreqclass> &phrase) {
54 // note: this uses the most braindead search routine :-)
55 // however its not so bad as there shouldn't be many partial
56 // matches
57
58 // a null phrase matches anything
59 if (phrase.empty()) return true;
60
61 // if there is nothing then there can't be a match
62 if (doc == NULL || doclen == 0) return false;
63
64 text_t doc_word;
65 doc_word.reserve (16);
66
67 bool first = true;
68
69 unsigned char *doc_here = doc;
70 unsigned char *doc_herefirstword = doc;
71 unsigned char *doc_end = doc+doclen-1; // unitool conventions :-/
72
73 while (doc_here <= doc_end) {
74 first = true;
75
76 // there will be at least one member of phrase (see above)
77 vector<termfreqclass>::const_iterator phrase_here = phrase.begin();
78 vector<termfreqclass>::const_iterator phrase_end = phrase.end();
79 do {
80 // get the next non-word ... and ignore it, then get the next word
81 doc_here = parse_nonword_word (doc_here, doc_end, doc_word);
82 if (first) {doc_herefirstword = doc_here; first = false;}
83
84 // break if this word is not the next in the phrase
85 if ((*phrase_here).utf8equivterms.find (doc_word) ==
86 (*phrase_here).utf8equivterms.end()) break;
87
88 phrase_here++;
89 } while (doc_here <= doc_end && phrase_here != phrase_end);
90
91 // see if we found a phrase
92 if (phrase_here == phrase_end) return true;
93
94 doc_here = doc_herefirstword; // set the counter back
95 }
96
97 return false;
98}
99
100
101// looks for the stemmed phrase in the metadata or text associated with
102// an OID. This function has not been coded with all situations in mind
103bool OID_phrase_search (mgsearchclass &mgsearch,
104 gdbmclass &gdbm,
105 const text_t &index,
106 const text_t &longindex,
107 const text_t &collection,
108 const vector<termfreqclass> &phrase,
109 int docnum) {
110 // disect the long index to find out where the text should come from
111 text_t level, gran;
112 text_t::const_iterator longindex_here = longindex.begin();
113 text_t::const_iterator longindex_end = longindex.end();
114 longindex_here = getdelimitstr (longindex_here, longindex_end, ':', level);
115 longindex_here = getdelimitstr (longindex_here, longindex_end, ':', gran);
116
117 if (gran.empty()) return false;
118
119 if (gran == "text") {
120 char *doc = NULL;
121 int doclen = 0;
122
123 // get text from mg.
124 if (!mgsearch.mgdocument (index, collection, docnum, doc, doclen)) return false;
125 return doc_phrase_search ((unsigned char *)doc, doclen, phrase);
126 }
127
128 // get OID
129 char *metadata = NULL;
130 text_t::size_type metadata_len = 0;
131 infodbclass docnum_info;
132 infodbclass OID_info;
133
134 if (!gdbm.getinfo (docnum, docnum_info)) return false;
135 text_t &OID = docnum_info["section"];
136 if (OID.empty()) return false;
137
138 // get field
139 if (!gdbm.getinfo (OID, OID_info)) return false;
140
141 metadata = (to_utf8(OID_info[gran])).getcarr(metadata_len);
142 bool result = doc_phrase_search ((unsigned char *)metadata, metadata_len, phrase);
143 delete [] metadata;
144
145 return result;
146}
Note: See TracBrowser for help on using the repository browser.