/**********************************************************************
 *
 * phrasesearch.cpp -- tools to search for a phrase in a larger text
 * Copyright (C) 1999 DigiLib Systems Limited
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

#include "phrasesearch.h"
#include "gsdlunicode.h"

// Extracts the next word from a UTF-8 buffer.
//
//   here -- position to start scanning from
//   end  -- the LAST valid byte of the buffer (inclusive, not one-past)
//   word -- cleared, then filled with the raw UTF-8 bytes of the word
//
// Everything that is not a letter or digit (as decided by
// is_unicode_letdig) is skipped first. While skipping, a "(...)" note or
// a "{...}" composite character is consumed as a whole. The run of
// letters/digits that follows is then copied into word.
//
// Returns the position scanning should resume from: just past the word
// and, if the word was ended by a non-word character, just past that
// character as well. word is left empty when no word was found.
//
// NOTE(review): because the character that ends a word is consumed
// unconditionally, a '(' or '{' directly after a word is not seen as the
// start of a note/composite by the next call -- confirm whether that is
// intended.
inline unsigned char *parse_nonword_word (unsigned char *here, unsigned char *end,
                                          text_t &word) {
  int c_len = 0;
  unsigned short c = 0;

  word.clear();

  // parse non word
  while (here <= end) {
    c_len = parse_utf8_char (here, end, &c);

    if (c == '(' || c == '{') {
      // found a note "(...)" or a composite character "{...}":
      // consume everything up to and including the closing character
      const unsigned short closer = (c == '(') ? ')' : '}';
      while (here <= end && c != closer) {
        c_len = parse_utf8_char (here, end, &c);
        here += c_len;
      }
      // here is now just past the closer, or past end if the closer was
      // missing. Restart the scan from here: falling through would skip
      // one extra character after the closer and, for an unterminated
      // note ending in a letter or digit, would read past end.
      continue;
    }

    if (is_unicode_letdig(c)) {
      // this is in a word -- copy the bytes of its first character
      while (c_len > 0) {
        word.push_back(*here);
        ++here; --c_len;
      }
      break;
    }

    here += c_len;
  }

  // parse word
  while (here <= end) {
    c_len = parse_utf8_char (here, end, &c);
    if (!is_unicode_letdig(c)) {
      here += c_len; // it is ok to skip a nonword character
      break;
    }
    while (c_len > 0) {
      word.push_back(*here);
      ++here; --c_len;
    }
  }

  return here;
}

// Appends to docnum_list the docnum of OID and of everything reachable
// from it through the "contains" field of its database record.
//
// A record contributes its "docnum" only when its "hastxt" field is "1"
// and "docnum" is not empty. The "contains" field is a ';' separated
// list of child OIDs in which every '"' is replaced by OID itself.
// Children are visited recursively, in the order they are listed.
// Nothing is appended for an OID that has no database record.
static void get_all_docnums (dbclass &db, text_t OID, vector<int> &docnum_list) {
  infodbclass OID_info;

  // get OID
  if (!db.getinfo (OID, OID_info)) return;

  if (OID_info["hastxt"] == "1" && !OID_info["docnum"].empty()) {
    docnum_list.push_back (OID_info["docnum"].getint());
  }

  // get contents set
  text_t &contains_spec = OID_info["contains"];
  if (contains_spec.empty()) return;

  text_tarray contains;
  text_t tmptext;
  text_t::iterator contains_here = contains_spec.begin();
  text_t::iterator contains_end = contains_spec.end();
  while (contains_here != contains_end) {
    if (*contains_here == '"') {
      // a double quote stands for the OID of this (the parent) record
      tmptext += OID;
    } else if (*contains_here == ';') {
      // end of one entry; empty entries are dropped
      if (!tmptext.empty()) contains.push_back (tmptext);
      tmptext.clear();
    } else {
      tmptext.push_back(*contains_here);
    }
    ++contains_here;
  }
  // the last entry need not be followed by a ';'
  if (!tmptext.empty()) contains.push_back (tmptext);

  text_tarray::const_iterator here = contains.begin();
  text_tarray::const_iterator end = contains.end();
  while (here != end) {
    get_all_docnums (db, *here, docnum_list);
    ++here;
  }
}

#ifdef ENABLE_MG

// Returns true if the words of phrase occur consecutively, in order, in
// the UTF-8 text doc[0 .. doclen-1].
//
// A document word matches a phrase term when it is a member of that
// term's utf8equivterms. Words are extracted with parse_nonword_word,
// so whatever lies between two words is ignored.
//
// An empty phrase matches anything; a NULL or zero length document
// matches nothing (unless the phrase is empty).
bool doc_phrase_search (unsigned char *doc, int doclen,
                        const termfreqclassarray &phrase) {
  // note: this uses the most braindead search routine :-)
  // however its not so bad as there shouldn't be many partial
  // matches

  // a null phrase matches anything
  if (phrase.empty()) return true;

  // if there is nothing then there can't be a match
  if (doc == NULL || doclen == 0) return false;

  text_t doc_word;
  doc_word.reserve (16);

  bool first = true;

  unsigned char *doc_here = doc;
  unsigned char *doc_herefirstword = doc;
  unsigned char *doc_end = doc+doclen-1; // unitool conventions :-/

  while (doc_here <= doc_end) {
    first = true;

    // there will be at least one member of phrase (see above)
    termfreqclassarray::const_iterator phrase_here = phrase.begin();
    termfreqclassarray::const_iterator phrase_end = phrase.end();
    do {
      // get the next non-word ... and ignore it, then get the next word
      doc_here = parse_nonword_word (doc_here, doc_end, doc_word);

      // remember where the first word of this attempt ended; a failed
      // attempt is retried from there, i.e. one word further on
      if (first) {doc_herefirstword = doc_here; first = false;}

      // break if this word is not the next in the phrase
      if ((*phrase_here).utf8equivterms.find (doc_word) ==
          (*phrase_here).utf8equivterms.end()) break;

      ++phrase_here;
    } while (doc_here <= doc_end && phrase_here != phrase_end);

    // see if we found a phrase
    if (phrase_here == phrase_end) return true;

    doc_here = doc_herefirstword; // set the counter back
  }

  return false;
}

// looks for the stemmed phrase in the metadata or text associated with
// an OID. This function has not been coded with all situations in mind
//
// docnum is mapped to its OID through the "section" field of its
// database record. longindex is read as "<granularity>:<type>":
//
//   - if type is "text", "all", or contains the word "text", the phrase
//     is looked for in the text fetched from mg: for granularity
//     "document" in the text of every section collected by
//     get_all_docnums, otherwise in the text of docnum alone;
//   - otherwise type is taken as a ',' separated list of metadata names
//     and the phrase is looked for in each value of each of them.
//
// Returns true if the phrase was found. Returns false if it was not, or
// if a database or mg lookup failed, or if the OID or the granularity
// is empty.
bool OID_phrase_search (mgsearchclass &mgsearch,
                        dbclass &db,
                        const text_t &index,
                        const text_t &subcollection,
                        const text_t &language,
                        const text_t &longindex,
                        const text_t &collection,
                        const termfreqclassarray &phrase,
                        int docnum) {
  // get OID
  infodbclass docnum_info;
  if (!db.getinfo (docnum, docnum_info)) return false;
  text_t &OID = docnum_info["section"];
  if (OID.empty()) return false;

  // dissect the long index to find out where the text should come from
  text_t gran, type;
  text_t::const_iterator longindex_here = longindex.begin();
  text_t::const_iterator longindex_end = longindex.end();
  longindex_here = getdelimitstr (longindex_here, longindex_end, ':', gran);
  longindex_here = getdelimitstr (longindex_here, longindex_end, ':', type);

  if (gran.empty()) return false;

  // note that we're treating indexes of type 'all' (i.e. text,Title,Creator)
  // or other composite indexes that contain "text" as if they were simply 'text' indexes
  if ((type == "text") || (type == "all") ||
      (findword(type.begin(),type.end(),"text") != type.end())) {
    char *doc = NULL;
    int doclen = 0;

    // get text from mg.
    if (gran == "document") {
      // if this is a document level index (which should only happen if
      // there are no matching indexes with a finer granularity -- see
      // mgqueryfilterclass::mg_parse_query_params) then we must do the
      // phrase search on the entire document (i.e. all the sections)
      // -- this is going to make a slow process even slower
      vector<int> docnum_list;
      text_t fulldoc;
      get_all_docnums (db, OID, docnum_list);
      vector<int>::const_iterator this_docnum = docnum_list.begin();
      vector<int>::const_iterator end_docnum = docnum_list.end();
      while (this_docnum != end_docnum) {
        // NOTE(review): sections are joined without any separator and
        // doclen is not used here, so this assumes mg hands back a
        // null-terminated buffer -- TODO confirm
        if (mgsearch.mgdocument (index, subcollection, language, collection,
                                 *this_docnum, doc, doclen)) {
          fulldoc.appendcstr (doc);
        }
        ++this_docnum;
      }

      // the buffer returned by getcstr is released below with delete []
      doc = fulldoc.getcstr();
      doclen = fulldoc.size();
      bool rv = doc_phrase_search ((unsigned char *)doc, doclen, phrase);
      delete []doc;
      return rv;

    } else {
      // the buffer filled in by mgdocument is not released here
      if (!mgsearch.mgdocument (index, subcollection, language, collection,
                                docnum, doc, doclen)) return false;
      return doc_phrase_search ((unsigned char *)doc, doclen, phrase);
    }
  }

  char *metadata = NULL;
  text_t::size_type metadata_len = 0;
  infodbclass OID_info;

  // get field
  if (!db.getinfo (OID, OID_info)) return false;

  bool result = false;

  // need to look through all the metadata values in the index
  text_tarray keys;
  splitchar(type.begin(), type.end(), ',', keys);
  text_tarray::const_iterator keyhere = keys.begin();
  text_tarray::const_iterator keyend = keys.end();
  while (keyhere != keyend) {
    // a metadata name with no values for this OID is simply skipped
    text_tarray *tarr_ptr = OID_info.getmultinfo (*keyhere);
    if (tarr_ptr != NULL ) {
      text_tarray::const_iterator subvalue_here = (*tarr_ptr).begin();
      text_tarray::const_iterator subvalue_end = (*tarr_ptr).end();
      while (subvalue_here != subvalue_end) {
        // search a UTF-8 copy of the value, then release the copy
        metadata = (to_utf8(*subvalue_here)).getcarr(metadata_len);
        result = doc_phrase_search ((unsigned char *)metadata, metadata_len, phrase);
        delete [] metadata;

        if (result) return true;
        ++subvalue_here;
      }
    }
    ++keyhere;
  }
  return result;
}

#endif