[328] | 1 | /**********************************************************************
|
---|
| 2 | *
|
---|
| 3 | * phrasesearch.cpp -- tools to search for a phrase in a larger text
|
---|
| 4 | * Copyright (C) 1999 DigiLib Systems Limited
|
---|
| 5 | *
|
---|
[534] | 6 | * A component of the Greenstone digital library software
|
---|
| 7 | * from the New Zealand Digital Library Project at the
|
---|
| 8 | * University of Waikato, New Zealand.
|
---|
[328] | 9 | *
|
---|
[534] | 10 | * This program is free software; you can redistribute it and/or modify
|
---|
| 11 | * it under the terms of the GNU General Public License as published by
|
---|
| 12 | * the Free Software Foundation; either version 2 of the License, or
|
---|
| 13 | * (at your option) any later version.
|
---|
| 14 | *
|
---|
| 15 | * This program is distributed in the hope that it will be useful,
|
---|
| 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 18 | * GNU General Public License for more details.
|
---|
| 19 | *
|
---|
| 20 | * You should have received a copy of the GNU General Public License
|
---|
| 21 | * along with this program; if not, write to the Free Software
|
---|
| 22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 23 | *
|
---|
[328] | 24 | *********************************************************************/
|
---|
| 25 |
|
---|
| 26 | #include "phrasesearch.h"
|
---|
| 27 | #include "gsdlunicode.h"
|
---|
| 28 |
|
---|
| 29 | inline unsigned char *parse_nonword_word (unsigned char *here, unsigned char *end,
|
---|
| 30 | text_t &word) {
|
---|
| 31 | int c_len = 0;
|
---|
| 32 | unsigned short c = 0;
|
---|
| 33 |
|
---|
| 34 | word.clear();
|
---|
| 35 |
|
---|
| 36 | // parse non word
|
---|
| 37 | while (here <= end) {
|
---|
| 38 | c_len = parse_utf8_char (here, end, &c);
|
---|
[500] | 39 | if (c == '(') {
|
---|
| 40 | // found a note, look for '}'
|
---|
| 41 | while (here <= end && c != ')') {
|
---|
| 42 | c_len = parse_utf8_char (here, end, &c);
|
---|
| 43 | here += c_len;
|
---|
| 44 | }
|
---|
| 45 | }
|
---|
| 46 | if (c == '{') {
|
---|
| 47 | // found a composite character, look for '}'
|
---|
| 48 | while (here <= end && c != '}') {
|
---|
| 49 | c_len = parse_utf8_char (here, end, &c);
|
---|
| 50 | here += c_len;
|
---|
| 51 | }
|
---|
| 52 | }
|
---|
[328] | 53 | if (is_unicode_letdig(c)) {
|
---|
[333] | 54 | while (c_len > 0) {
|
---|
[500] | 55 | // this is in a word
|
---|
[333] | 56 | word.push_back(*here);
|
---|
[9620] | 57 | ++here; --c_len;
|
---|
[333] | 58 | }
|
---|
[328] | 59 | break;
|
---|
| 60 | }
|
---|
[333] | 61 | here += c_len;
|
---|
[328] | 62 | }
|
---|
| 63 |
|
---|
| 64 | // parse word
|
---|
| 65 | while (here <= end) {
|
---|
| 66 | c_len = parse_utf8_char (here, end, &c);
|
---|
[333] | 67 | if (!is_unicode_letdig(c)) {
|
---|
| 68 | here += c_len; // it is ok to skip a nonword character
|
---|
| 69 | break;
|
---|
| 70 | }
|
---|
| 71 | while (c_len > 0) {
|
---|
| 72 | word.push_back(*here);
|
---|
[9620] | 73 | ++here; --c_len;
|
---|
[333] | 74 | }
|
---|
[328] | 75 | }
|
---|
| 76 |
|
---|
| 77 | return here;
|
---|
| 78 | }
|
---|
| 79 |
|
---|
[15558] | 80 | static void get_all_docnums (dbclass &db, text_t OID, vector<int> &docnum_list) {
|
---|
[328] | 81 |
|
---|
[2146] | 82 | infodbclass OID_info;
|
---|
| 83 |
|
---|
| 84 | // get OID
|
---|
[15558] | 85 | if (!db.getinfo (OID, OID_info)) return;
|
---|
[2146] | 86 | if (OID_info["hastxt"] == "1" && !OID_info["docnum"].empty()) {
|
---|
| 87 | docnum_list.push_back (OID_info["docnum"].getint());
|
---|
| 88 | }
|
---|
| 89 |
|
---|
| 90 | // get contents set
|
---|
| 91 | if (OID_info["contains"].empty()) return;
|
---|
| 92 | text_tarray contains; text_t tmptext;
|
---|
| 93 | text_t::iterator contains_here = OID_info["contains"].begin();
|
---|
| 94 | text_t::iterator contains_end = OID_info["contains"].end();
|
---|
| 95 | while (contains_here != contains_end) {
|
---|
| 96 | if (*contains_here == '"') tmptext += OID;
|
---|
| 97 | else if (*contains_here == ';') {
|
---|
| 98 | if (!tmptext.empty()) contains.push_back (tmptext);
|
---|
| 99 | tmptext.clear();
|
---|
| 100 | } else tmptext.push_back(*contains_here);
|
---|
[9620] | 101 | ++contains_here;
|
---|
[2146] | 102 | }
|
---|
| 103 | if (!tmptext.empty()) contains.push_back (tmptext);
|
---|
| 104 |
|
---|
| 105 | text_tarray::const_iterator here = contains.begin();
|
---|
| 106 | text_tarray::const_iterator end = contains.end();
|
---|
| 107 | while (here != end) {
|
---|
[15558] | 108 | get_all_docnums (db, *here, docnum_list);
|
---|
[9620] | 109 | ++here;
|
---|
[2146] | 110 | }
|
---|
| 111 | }
|
---|
| 112 |
|
---|
[21324] | 113 | #ifdef ENABLE_MG
|
---|
[328] | 114 | bool doc_phrase_search (unsigned char *doc, int doclen,
|
---|
[395] | 115 | const termfreqclassarray &phrase) {
|
---|
[328] | 116 | // note: this uses the most braindead search routine :-)
|
---|
| 117 | // however its not so bad as there shouldn't be many partial
|
---|
| 118 | // matches
|
---|
| 119 |
|
---|
| 120 | // a null phrase matches anything
|
---|
| 121 | if (phrase.empty()) return true;
|
---|
| 122 |
|
---|
| 123 | // if there is nothing then there can't be a match
|
---|
| 124 | if (doc == NULL || doclen == 0) return false;
|
---|
| 125 |
|
---|
| 126 | text_t doc_word;
|
---|
| 127 | doc_word.reserve (16);
|
---|
| 128 |
|
---|
| 129 | bool first = true;
|
---|
| 130 |
|
---|
| 131 | unsigned char *doc_here = doc;
|
---|
| 132 | unsigned char *doc_herefirstword = doc;
|
---|
| 133 | unsigned char *doc_end = doc+doclen-1; // unitool conventions :-/
|
---|
| 134 |
|
---|
| 135 | while (doc_here <= doc_end) {
|
---|
| 136 | first = true;
|
---|
[2146] | 137 |
|
---|
[328] | 138 | // there will be at least one member of phrase (see above)
|
---|
[395] | 139 | termfreqclassarray::const_iterator phrase_here = phrase.begin();
|
---|
| 140 | termfreqclassarray::const_iterator phrase_end = phrase.end();
|
---|
[328] | 141 | do {
|
---|
| 142 | // get the next non-word ... and ignore it, then get the next word
|
---|
| 143 | doc_here = parse_nonword_word (doc_here, doc_end, doc_word);
|
---|
| 144 | if (first) {doc_herefirstword = doc_here; first = false;}
|
---|
| 145 |
|
---|
| 146 | // break if this word is not the next in the phrase
|
---|
| 147 | if ((*phrase_here).utf8equivterms.find (doc_word) ==
|
---|
| 148 | (*phrase_here).utf8equivterms.end()) break;
|
---|
| 149 |
|
---|
[9620] | 150 | ++phrase_here;
|
---|
[328] | 151 | } while (doc_here <= doc_end && phrase_here != phrase_end);
|
---|
| 152 |
|
---|
| 153 | // see if we found a phrase
|
---|
| 154 | if (phrase_here == phrase_end) return true;
|
---|
| 155 |
|
---|
| 156 | doc_here = doc_herefirstword; // set the counter back
|
---|
| 157 | }
|
---|
| 158 |
|
---|
| 159 | return false;
|
---|
| 160 | }
|
---|
| 161 |
|
---|
| 162 | // looks for the stemmed phrase in the metadata or text associated with
|
---|
| 163 | // an OID. This function has not been coded with all situations in mind
|
---|
| 164 | bool OID_phrase_search (mgsearchclass &mgsearch,
|
---|
[15558] | 165 | dbclass &db,
|
---|
[328] | 166 | const text_t &index,
|
---|
[351] | 167 | const text_t &subcollection,
|
---|
| 168 | const text_t &language,
|
---|
[328] | 169 | const text_t &longindex,
|
---|
| 170 | const text_t &collection,
|
---|
[395] | 171 | const termfreqclassarray &phrase,
|
---|
[328] | 172 | int docnum) {
|
---|
[2146] | 173 |
|
---|
| 174 | // get OID
|
---|
| 175 | infodbclass docnum_info;
|
---|
[15558] | 176 | if (!db.getinfo (docnum, docnum_info)) return false;
|
---|
[2146] | 177 | text_t &OID = docnum_info["section"];
|
---|
| 178 | if (OID.empty()) return false;
|
---|
| 179 |
|
---|
[328] | 180 | // disect the long index to find out where the text should come from
|
---|
[2146] | 181 | text_t gran, type;
|
---|
[328] | 182 | text_t::const_iterator longindex_here = longindex.begin();
|
---|
| 183 | text_t::const_iterator longindex_end = longindex.end();
|
---|
| 184 | longindex_here = getdelimitstr (longindex_here, longindex_end, ':', gran);
|
---|
[2146] | 185 | longindex_here = getdelimitstr (longindex_here, longindex_end, ':', type);
|
---|
[328] | 186 |
|
---|
| 187 | if (gran.empty()) return false;
|
---|
| 188 |
|
---|
[2146] | 189 | // note that we're treating indexes of type 'all' (i.e. text,Title,Creator)
|
---|
| 190 | // or other composite indexes that contain "text" as if they were simply 'text' indexes
|
---|
[2937] | 191 | if ((type == "text") || (type == "all") || (findword(type.begin(),type.end(),"text") != type.end())) {
|
---|
[328] | 192 | char *doc = NULL;
|
---|
| 193 | int doclen = 0;
|
---|
| 194 |
|
---|
| 195 | // get text from mg.
|
---|
[2146] | 196 | if (gran == "document") {
|
---|
[328] | 197 |
|
---|
[2146] | 198 | // if this is a document level index (which should only happen if
|
---|
| 199 | // there are no matching indexes with a finer granularity -- see
|
---|
| 200 | // mgqueryfilterclass::mg_parse_query_params) then we must do the
|
---|
| 201 | // phrase search on the entire document (i.e. all the sections)
|
---|
| 202 | // -- this is going to make a slow process even slower
|
---|
| 203 | vector<int> docnum_list; text_t fulldoc;
|
---|
[15558] | 204 | get_all_docnums (db, OID, docnum_list);
|
---|
[2146] | 205 | vector<int>::const_iterator this_docnum = docnum_list.begin();
|
---|
| 206 | vector<int>::const_iterator end_docnum = docnum_list.end();
|
---|
| 207 | while (this_docnum != end_docnum) {
|
---|
| 208 | if (mgsearch.mgdocument (index, subcollection, language, collection,
|
---|
| 209 | *this_docnum, doc, doclen)) {
|
---|
| 210 | fulldoc.appendcstr (doc);
|
---|
| 211 | }
|
---|
[9620] | 212 | ++this_docnum;
|
---|
[2146] | 213 | }
|
---|
| 214 | doc = fulldoc.getcstr();
|
---|
| 215 | doclen = fulldoc.size();
|
---|
| 216 | bool rv = doc_phrase_search ((unsigned char *)doc, doclen, phrase);
|
---|
[9631] | 217 | delete []doc;
|
---|
[2146] | 218 | return rv;
|
---|
| 219 |
|
---|
| 220 | } else {
|
---|
| 221 |
|
---|
| 222 | if (!mgsearch.mgdocument (index, subcollection, language, collection,
|
---|
| 223 | docnum, doc, doclen)) return false;
|
---|
| 224 | return doc_phrase_search ((unsigned char *)doc, doclen, phrase);
|
---|
| 225 | }
|
---|
| 226 | }
|
---|
| 227 |
|
---|
[328] | 228 | char *metadata = NULL;
|
---|
| 229 | text_t::size_type metadata_len = 0;
|
---|
| 230 | infodbclass OID_info;
|
---|
| 231 |
|
---|
| 232 | // get field
|
---|
[15558] | 233 | if (!db.getinfo (OID, OID_info)) return false;
|
---|
[328] | 234 |
|
---|
[500] | 235 | bool result = false;
|
---|
| 236 |
|
---|
[5141] | 237 | // need to look through all the metadata values in the index
|
---|
| 238 | text_tarray keys;
|
---|
| 239 | splitchar(type.begin(), type.end(), ',', keys);
|
---|
| 240 |
|
---|
| 241 | text_tarray::const_iterator keyhere = keys.begin();
|
---|
| 242 | text_tarray::const_iterator keyend = keys.end();
|
---|
| 243 | while (keyhere != keyend) {
|
---|
| 244 | text_tarray *tarr_ptr = OID_info.getmultinfo (*keyhere);
|
---|
| 245 | if (tarr_ptr != NULL ) {
|
---|
| 246 | text_tarray::const_iterator subvalue_here = (*tarr_ptr).begin();
|
---|
| 247 | text_tarray::const_iterator subvalue_end = (*tarr_ptr).end();
|
---|
| 248 | while (subvalue_here != subvalue_end) {
|
---|
| 249 | metadata = (to_utf8(*subvalue_here)).getcarr(metadata_len);
|
---|
| 250 | result = doc_phrase_search ((unsigned char *)metadata, metadata_len, phrase);
|
---|
| 251 | delete [] metadata;
|
---|
| 252 |
|
---|
| 253 | if (result) return true;
|
---|
[9620] | 254 | ++subvalue_here;
|
---|
[5141] | 255 | }
|
---|
[500] | 256 | }
|
---|
[9620] | 257 | ++keyhere;
|
---|
[500] | 258 | }
|
---|
[328] | 259 | return result;
|
---|
| 260 | }
|
---|
[21324] | 261 | #endif
|
---|