Context Navigation

source: trunk/gsdl/src/colservr/phrasesearch.cpp@ 1661

Last change on this file since 1661 was 1661, checked in by nzdl, 24 years ago
Fixed a couple of minor bugs in phrase searching code
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 5.8 KB

Line
1	/**********************************************************************
2	*
3	* phrasesearch.cpp -- tools to search for a phrase in a larger text
4	* Copyright (C) 1999 DigiLib Systems Limited
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26	#include "phrasesearch.h"
27	#include "gsdlunicode.h"
28
29	inline unsigned char parse_nonword_word (unsigned char here, unsigned char *end,
30	text_t &word) {
31	int c_len = 0;
32	unsigned short c = 0;
33
34	word.clear();
35
36	// parse non word
37	while (here <= end) {
38	c_len = parse_utf8_char (here, end, &c);
39	if (c == '(') {
40	// found a note, look for '}'
41	while (here <= end && c != ')') {
42	c_len = parse_utf8_char (here, end, &c);
43	here += c_len;
44	}
45	}
46	if (c == '{') {
47	// found a composite character, look for '}'
48	while (here <= end && c != '}') {
49	c_len = parse_utf8_char (here, end, &c);
50	here += c_len;
51	}
52	}
53	if (is_unicode_letdig(c)) {
54	while (c_len > 0) {
55	// this is in a word
56	word.push_back(*here);
57	here++; c_len--;
58	}
59	break;
60	}
61	here += c_len;
62	}
63
64	// parse word
65	while (here <= end) {
66	c_len = parse_utf8_char (here, end, &c);
67	if (!is_unicode_letdig(c)) {
68	here += c_len; // it is ok to skip a nonword character
69	break;
70	}
71	while (c_len > 0) {
72	word.push_back(*here);
73	here++; c_len--;
74	}
75	}
76
77	return here;
78	}
79
80
81	bool doc_phrase_search (unsigned char *doc, int doclen,
82	const termfreqclassarray &phrase) {
83	// note: this uses the most braindead search routine :-)
84	// however its not so bad as there shouldn't be many partial
85	// matches
86
87	// a null phrase matches anything
88	if (phrase.empty()) return true;
89
90	// if there is nothing then there can't be a match
91	if (doc == NULL \|\| doclen == 0) return false;
92
93	text_t doc_word;
94	doc_word.reserve (16);
95
96	bool first = true;
97
98	unsigned char *doc_here = doc;
99	unsigned char *doc_herefirstword = doc;
100	unsigned char *doc_end = doc+doclen-1; // unitool conventions :-/
101
102	while (doc_here <= doc_end) {
103	first = true;
104
105	// there will be at least one member of phrase (see above)
106	termfreqclassarray::const_iterator phrase_here = phrase.begin();
107	termfreqclassarray::const_iterator phrase_end = phrase.end();
108	do {
109	// get the next non-word ... and ignore it, then get the next word
110	doc_here = parse_nonword_word (doc_here, doc_end, doc_word);
111	if (first) {doc_herefirstword = doc_here; first = false;}
112
113	// break if this word is not the next in the phrase
114	if ((*phrase_here).utf8equivterms.find (doc_word) ==
115	(*phrase_here).utf8equivterms.end()) break;
116
117	phrase_here++;
118	} while (doc_here <= doc_end && phrase_here != phrase_end);
119
120	// see if we found a phrase
121	if (phrase_here == phrase_end) return true;
122
123	doc_here = doc_herefirstword; // set the counter back
124	}
125
126	return false;
127	}
128
129
130	// looks for the stemmed phrase in the metadata or text associated with
131	// an OID. This function has not been coded with all situations in mind
132	bool OID_phrase_search (mgsearchclass &mgsearch,
133	gdbmclass &gdbm,
134	const text_t &index,
135	const text_t &subcollection,
136	const text_t &language,
137	const text_t &longindex,
138	const text_t &collection,
139	const termfreqclassarray &phrase,
140	int docnum) {
141	// disect the long index to find out where the text should come from
142	text_t level, gran;
143	text_t::const_iterator longindex_here = longindex.begin();
144	text_t::const_iterator longindex_end = longindex.end();
145	longindex_here = getdelimitstr (longindex_here, longindex_end, ':', level);
146	longindex_here = getdelimitstr (longindex_here, longindex_end, ':', gran);
147
148	if (gran.empty()) return false;
149
150	// note that we're treating indexes with granularity of 'all' (i.e. text,Title,Creator)
151	// as if they were simply 'text' indexes
152	if (gran == "text" \|\| gran == "all") {
153	char *doc = NULL;
154	int doclen = 0;
155
156	// get text from mg.
157	if (!mgsearch.mgdocument (index, subcollection, language, collection,
158	docnum, doc, doclen)) return false;
159	return doc_phrase_search ((unsigned char *)doc, doclen, phrase);
160	}
161
162	// get OID
163	char *metadata = NULL;
164	text_t::size_type metadata_len = 0;
165	infodbclass docnum_info;
166	infodbclass OID_info;
167
168	if (!gdbm.getinfo (docnum, docnum_info)) return false;
169	text_t &OID = docnum_info["section"];
170	if (OID.empty()) return false;
171
172	// get field
173	if (!gdbm.getinfo (OID, OID_info)) return false;
174
175	bool result = false;
176	text_tarray *tarr_ptr = OID_info.getmultinfo (gran);
177	if (tarr_ptr != NULL ) {
178	text_tarray::const_iterator subvalue_here = (*tarr_ptr).begin();
179	text_tarray::const_iterator subvalue_end = (*tarr_ptr).end();
180	while (subvalue_here != subvalue_end) {
181	if (subvalue_here != NULL) {
182	metadata = (to_utf8(*subvalue_here)).getcarr(metadata_len);
183	result = doc_phrase_search ((unsigned char *)metadata, metadata_len, phrase);
184	delete [] metadata;
185
186	if (result) return true;
187	}
188
189	subvalue_here++;
190	}
191	}
192
193	return result;
194	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: