Context Navigation

phrasesearch.cpp@ 27065

Last change on this file since 27065 was 21324, checked in by ak19, 14 years ago
Changes to makefiles, configure files, and source code to work with the new configure flags that allow indexers to be individually compiled up by setting each indexer to be enabled or disabled (enable-mg, enable-mgpp, enable-lucene)
Property svn:executable set to `*`. Property svn:keywords set to `Author Date Id Revision`.
File size: 8.1 KB

Rev	Line
[328]	1	/**********************************************************************
	2	*
	3	* phrasesearch.cpp -- tools to search for a phrase in a larger text
	4	* Copyright (C) 1999 DigiLib Systems Limited
	5	*
[534]	6	* A component of the Greenstone digital library software
	7	* from the New Zealand Digital Library Project at the
	8	* University of Waikato, New Zealand.
[328]	9	*
[534]	10	* This program is free software; you can redistribute it and/or modify
	11	* it under the terms of the GNU General Public License as published by
	12	* the Free Software Foundation; either version 2 of the License, or
	13	* (at your option) any later version.
	14	*
	15	* This program is distributed in the hope that it will be useful,
	16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	* GNU General Public License for more details.
	19	*
	20	* You should have received a copy of the GNU General Public License
	21	* along with this program; if not, write to the Free Software
	22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	*
[328]	24	*********************************************************************/
	25
	26	#include "phrasesearch.h"
	27	#include "gsdlunicode.h"
	28
	29	inline unsigned char parse_nonword_word (unsigned char here, unsigned char *end,
	30	text_t &word) {
	31	int c_len = 0;
	32	unsigned short c = 0;
	33
	34	word.clear();
	35
	36	// parse non word
	37	while (here <= end) {
	38	c_len = parse_utf8_char (here, end, &c);
[500]	39	if (c == '(') {
	40	// found a note, look for '}'
	41	while (here <= end && c != ')') {
	42	c_len = parse_utf8_char (here, end, &c);
	43	here += c_len;
	44	}
	45	}
	46	if (c == '{') {
	47	// found a composite character, look for '}'
	48	while (here <= end && c != '}') {
	49	c_len = parse_utf8_char (here, end, &c);
	50	here += c_len;
	51	}
	52	}
[328]	53	if (is_unicode_letdig(c)) {
[333]	54	while (c_len > 0) {
[500]	55	// this is in a word
[333]	56	word.push_back(*here);
[9620]	57	++here; --c_len;
[333]	58	}
[328]	59	break;
	60	}
[333]	61	here += c_len;
[328]	62	}
	63
	64	// parse word
	65	while (here <= end) {
	66	c_len = parse_utf8_char (here, end, &c);
[333]	67	if (!is_unicode_letdig(c)) {
	68	here += c_len; // it is ok to skip a nonword character
	69	break;
	70	}
	71	while (c_len > 0) {
	72	word.push_back(*here);
[9620]	73	++here; --c_len;
[333]	74	}
[328]	75	}
	76
	77	return here;
	78	}
	79
[15558]	80	static void get_all_docnums (dbclass &db, text_t OID, vector<int> &docnum_list) {
[328]	81
[2146]	82	infodbclass OID_info;
	83
	84	// get OID
[15558]	85	if (!db.getinfo (OID, OID_info)) return;
[2146]	86	if (OID_info["hastxt"] == "1" && !OID_info["docnum"].empty()) {
	87	docnum_list.push_back (OID_info["docnum"].getint());
	88	}
	89
	90	// get contents set
	91	if (OID_info["contains"].empty()) return;
	92	text_tarray contains; text_t tmptext;
	93	text_t::iterator contains_here = OID_info["contains"].begin();
	94	text_t::iterator contains_end = OID_info["contains"].end();
	95	while (contains_here != contains_end) {
	96	if (*contains_here == '"') tmptext += OID;
	97	else if (*contains_here == ';') {
	98	if (!tmptext.empty()) contains.push_back (tmptext);
	99	tmptext.clear();
	100	} else tmptext.push_back(*contains_here);
[9620]	101	++contains_here;
[2146]	102	}
	103	if (!tmptext.empty()) contains.push_back (tmptext);
	104
	105	text_tarray::const_iterator here = contains.begin();
	106	text_tarray::const_iterator end = contains.end();
	107	while (here != end) {
[15558]	108	get_all_docnums (db, *here, docnum_list);
[9620]	109	++here;
[2146]	110	}
	111	}
	112
[21324]	113	#ifdef ENABLE_MG
[328]	114	bool doc_phrase_search (unsigned char *doc, int doclen,
[395]	115	const termfreqclassarray &phrase) {
[328]	116	// note: this uses the most braindead search routine :-)
	117	// however its not so bad as there shouldn't be many partial
	118	// matches
	119
	120	// a null phrase matches anything
	121	if (phrase.empty()) return true;
	122
	123	// if there is nothing then there can't be a match
	124	if (doc == NULL \|\| doclen == 0) return false;
	125
	126	text_t doc_word;
	127	doc_word.reserve (16);
	128
	129	bool first = true;
	130
	131	unsigned char *doc_here = doc;
	132	unsigned char *doc_herefirstword = doc;
	133	unsigned char *doc_end = doc+doclen-1; // unitool conventions :-/
	134
	135	while (doc_here <= doc_end) {
	136	first = true;
[2146]	137
[328]	138	// there will be at least one member of phrase (see above)
[395]	139	termfreqclassarray::const_iterator phrase_here = phrase.begin();
	140	termfreqclassarray::const_iterator phrase_end = phrase.end();
[328]	141	do {
	142	// get the next non-word ... and ignore it, then get the next word
	143	doc_here = parse_nonword_word (doc_here, doc_end, doc_word);
	144	if (first) {doc_herefirstword = doc_here; first = false;}
	145
	146	// break if this word is not the next in the phrase
	147	if ((*phrase_here).utf8equivterms.find (doc_word) ==
	148	(*phrase_here).utf8equivterms.end()) break;
	149
[9620]	150	++phrase_here;
[328]	151	} while (doc_here <= doc_end && phrase_here != phrase_end);
	152
	153	// see if we found a phrase
	154	if (phrase_here == phrase_end) return true;
	155
	156	doc_here = doc_herefirstword; // set the counter back
	157	}
	158
	159	return false;
	160	}
	161
	162	// looks for the stemmed phrase in the metadata or text associated with
	163	// an OID. This function has not been coded with all situations in mind
	164	bool OID_phrase_search (mgsearchclass &mgsearch,
[15558]	165	dbclass &db,
[328]	166	const text_t &index,
[351]	167	const text_t &subcollection,
	168	const text_t &language,
[328]	169	const text_t &longindex,
	170	const text_t &collection,
[395]	171	const termfreqclassarray &phrase,
[328]	172	int docnum) {
[2146]	173
	174	// get OID
	175	infodbclass docnum_info;
[15558]	176	if (!db.getinfo (docnum, docnum_info)) return false;
[2146]	177	text_t &OID = docnum_info["section"];
	178	if (OID.empty()) return false;
	179
[328]	180	// disect the long index to find out where the text should come from
[2146]	181	text_t gran, type;
[328]	182	text_t::const_iterator longindex_here = longindex.begin();
	183	text_t::const_iterator longindex_end = longindex.end();
	184	longindex_here = getdelimitstr (longindex_here, longindex_end, ':', gran);
[2146]	185	longindex_here = getdelimitstr (longindex_here, longindex_end, ':', type);
[328]	186
	187	if (gran.empty()) return false;
	188
[2146]	189	// note that we're treating indexes of type 'all' (i.e. text,Title,Creator)
	190	// or other composite indexes that contain "text" as if they were simply 'text' indexes
[2937]	191	if ((type == "text") \|\| (type == "all") \|\| (findword(type.begin(),type.end(),"text") != type.end())) {
[328]	192	char *doc = NULL;
	193	int doclen = 0;
	194
	195	// get text from mg.
[2146]	196	if (gran == "document") {
[328]	197
[2146]	198	// if this is a document level index (which should only happen if
	199	// there are no matching indexes with a finer granularity -- see
	200	// mgqueryfilterclass::mg_parse_query_params) then we must do the
	201	// phrase search on the entire document (i.e. all the sections)
	202	// -- this is going to make a slow process even slower
	203	vector<int> docnum_list; text_t fulldoc;
[15558]	204	get_all_docnums (db, OID, docnum_list);
[2146]	205	vector<int>::const_iterator this_docnum = docnum_list.begin();
	206	vector<int>::const_iterator end_docnum = docnum_list.end();
	207	while (this_docnum != end_docnum) {
	208	if (mgsearch.mgdocument (index, subcollection, language, collection,
	209	*this_docnum, doc, doclen)) {
	210	fulldoc.appendcstr (doc);
	211	}
[9620]	212	++this_docnum;
[2146]	213	}
	214	doc = fulldoc.getcstr();
	215	doclen = fulldoc.size();
	216	bool rv = doc_phrase_search ((unsigned char *)doc, doclen, phrase);
[9631]	217	delete []doc;
[2146]	218	return rv;
	219
	220	} else {
	221
	222	if (!mgsearch.mgdocument (index, subcollection, language, collection,
	223	docnum, doc, doclen)) return false;
	224	return doc_phrase_search ((unsigned char *)doc, doclen, phrase);
	225	}
	226	}
	227
[328]	228	char *metadata = NULL;
	229	text_t::size_type metadata_len = 0;
	230	infodbclass OID_info;
	231
	232	// get field
[15558]	233	if (!db.getinfo (OID, OID_info)) return false;
[328]	234
[500]	235	bool result = false;
	236
[5141]	237	// need to look through all the metadata values in the index
	238	text_tarray keys;
	239	splitchar(type.begin(), type.end(), ',', keys);
	240
	241	text_tarray::const_iterator keyhere = keys.begin();
	242	text_tarray::const_iterator keyend = keys.end();
	243	while (keyhere != keyend) {
	244	text_tarray tarr_ptr = OID_info.getmultinfo (keyhere);
	245	if (tarr_ptr != NULL ) {
	246	text_tarray::const_iterator subvalue_here = (*tarr_ptr).begin();
	247	text_tarray::const_iterator subvalue_end = (*tarr_ptr).end();
	248	while (subvalue_here != subvalue_end) {
	249	metadata = (to_utf8(*subvalue_here)).getcarr(metadata_len);
	250	result = doc_phrase_search ((unsigned char *)metadata, metadata_len, phrase);
	251	delete [] metadata;
	252
	253	if (result) return true;
[9620]	254	++subvalue_here;
[5141]	255	}
[500]	256	}
[9620]	257	++keyhere;
[500]	258	}
[328]	259	return result;
	260	}
[21324]	261	#endif

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/runtime-src/src/colservr/phrasesearch.cpp@ 27065

Download in other formats: