Context Navigation

source: main/tags/2.10/gsdl/src/colservr/phrasesearch.cpp@ 32704

Last change on this file since 32704 was 534, checked in by sjboddie, 25 years ago
added gpl notice
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 6.1 KB

Line
1	/**********************************************************************
2	*
3	* phrasesearch.cpp -- tools to search for a phrase in a larger text
4	* Copyright (C) 1999 DigiLib Systems Limited
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	* $Id: phrasesearch.cpp 534 1999-09-07 04:57:43Z sjboddie $
25	*
26	*********************************************************************/
27
28	/*
29	$Log$
30	Revision 1.6 1999/09/07 04:57:23 sjboddie
31	added gpl notice
32
33	Revision 1.5 1999/08/31 22:45:12 rjmcnab
34	fixed small problem
35
36	Revision 1.4 1999/07/16 00:15:48 sjboddie
37	changed to use termfreqclassarray type
38
39	Revision 1.3 1999/07/07 06:19:45 rjmcnab
40	Added ability to combine two or more independent queries.
41
42	Revision 1.2 1999/07/01 09:25:54 rjmcnab
43	fixed bug :-^
44
45	Revision 1.1 1999/07/01 04:01:46 rjmcnab
46	Initial revision.
47
48	*/
49
50
51	#include "phrasesearch.h"
52	#include "gsdlunicode.h"
53
54	inline unsigned char parse_nonword_word (unsigned char here, unsigned char *end,
55	text_t &word) {
56	int c_len = 0;
57	unsigned short c = 0;
58
59	word.clear();
60
61	// parse non word
62	while (here <= end) {
63	c_len = parse_utf8_char (here, end, &c);
64	if (c == '(') {
65	// found a note, look for '}'
66	while (here <= end && c != ')') {
67	c_len = parse_utf8_char (here, end, &c);
68	here += c_len;
69	}
70	}
71	if (c == '{') {
72	// found a composite character, look for '}'
73	while (here <= end && c != '}') {
74	c_len = parse_utf8_char (here, end, &c);
75	here += c_len;
76	}
77	}
78	if (is_unicode_letdig(c)) {
79	while (c_len > 0) {
80	// this is in a word
81	word.push_back(*here);
82	here++; c_len--;
83	}
84	break;
85	}
86	here += c_len;
87	}
88
89	// parse word
90	while (here <= end) {
91	c_len = parse_utf8_char (here, end, &c);
92	if (!is_unicode_letdig(c)) {
93	here += c_len; // it is ok to skip a nonword character
94	break;
95	}
96	while (c_len > 0) {
97	word.push_back(*here);
98	here++; c_len--;
99	}
100	}
101
102	return here;
103	}
104
105
106	bool doc_phrase_search (unsigned char *doc, int doclen,
107	const termfreqclassarray &phrase) {
108	// note: this uses the most braindead search routine :-)
109	// however its not so bad as there shouldn't be many partial
110	// matches
111
112	// a null phrase matches anything
113	if (phrase.empty()) return true;
114
115	// if there is nothing then there can't be a match
116	if (doc == NULL \|\| doclen == 0) return false;
117
118	text_t doc_word;
119	doc_word.reserve (16);
120
121	bool first = true;
122
123	unsigned char *doc_here = doc;
124	unsigned char *doc_herefirstword = doc;
125	unsigned char *doc_end = doc+doclen-1; // unitool conventions :-/
126
127	while (doc_here <= doc_end) {
128	first = true;
129
130	// there will be at least one member of phrase (see above)
131	termfreqclassarray::const_iterator phrase_here = phrase.begin();
132	termfreqclassarray::const_iterator phrase_end = phrase.end();
133	do {
134	// get the next non-word ... and ignore it, then get the next word
135	doc_here = parse_nonword_word (doc_here, doc_end, doc_word);
136	if (first) {doc_herefirstword = doc_here; first = false;}
137
138	// break if this word is not the next in the phrase
139	if ((*phrase_here).utf8equivterms.find (doc_word) ==
140	(*phrase_here).utf8equivterms.end()) break;
141
142	phrase_here++;
143	} while (doc_here <= doc_end && phrase_here != phrase_end);
144
145	// see if we found a phrase
146	if (phrase_here == phrase_end) return true;
147
148	doc_here = doc_herefirstword; // set the counter back
149	}
150
151	return false;
152	}
153
154
155	// looks for the stemmed phrase in the metadata or text associated with
156	// an OID. This function has not been coded with all situations in mind
157	bool OID_phrase_search (mgsearchclass &mgsearch,
158	gdbmclass &gdbm,
159	const text_t &index,
160	const text_t &subcollection,
161	const text_t &language,
162	const text_t &longindex,
163	const text_t &collection,
164	const termfreqclassarray &phrase,
165	int docnum) {
166	// disect the long index to find out where the text should come from
167	text_t level, gran;
168	text_t::const_iterator longindex_here = longindex.begin();
169	text_t::const_iterator longindex_end = longindex.end();
170	longindex_here = getdelimitstr (longindex_here, longindex_end, ':', level);
171	longindex_here = getdelimitstr (longindex_here, longindex_end, ':', gran);
172
173	if (gran.empty()) return false;
174
175	if (gran == "text") {
176	char *doc = NULL;
177	int doclen = 0;
178
179	// get text from mg.
180	if (!mgsearch.mgdocument (index, subcollection, language, collection,
181	docnum, doc, doclen)) return false;
182	return doc_phrase_search ((unsigned char *)doc, doclen, phrase);
183	}
184
185	// get OID
186	char *metadata = NULL;
187	text_t::size_type metadata_len = 0;
188	infodbclass docnum_info;
189	infodbclass OID_info;
190
191	if (!gdbm.getinfo (docnum, docnum_info)) return false;
192	text_t &OID = docnum_info["section"];
193	if (OID.empty()) return false;
194
195	// get field
196	if (!gdbm.getinfo (OID, OID_info)) return false;
197
198	bool result = false;
199	text_tarray *tarr_ptr = OID_info.getmultinfo (gran);
200	if (tarr_ptr != NULL ) {
201	text_tarray::const_iterator subvalue_here = (*tarr_ptr).begin();
202	text_tarray::const_iterator subvalue_end = (*tarr_ptr).end();
203	while (subvalue_here != subvalue_end) {
204	if (subvalue_here != NULL) {
205	metadata = (to_utf8(*subvalue_here)).getcarr(metadata_len);
206	result = doc_phrase_search ((unsigned char *)metadata, metadata_len, phrase);
207	delete [] metadata;
208
209	if (result) return true;
210	}
211
212	subvalue_here++;
213	}
214	}
215
216	return result;
217	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: