Context Navigation

source: trunk/gsdl/src/colservr/phrasesearch.cpp@ 351

Last change on this file since 351 was 351, checked in by rjmcnab, 25 years ago
Added ability to combine two or more independent queries.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 4.4 KB

Line
1	/**********************************************************************
2	*
3	* phrasesearch.cpp -- tools to search for a phrase in a larger text
4	* Copyright (C) 1999 DigiLib Systems Limited
5	*
6	* PUT COPYRIGHT NOTICE HERE
7	*
8	* $Id: phrasesearch.cpp 351 1999-07-07 06:19:47Z rjmcnab $
9	*
10	*********************************************************************/
11
12	/*
13	$Log$
14	Revision 1.3 1999/07/07 06:19:45 rjmcnab
15	Added ability to combine two or more independent queries.
16
17	Revision 1.2 1999/07/01 09:25:54 rjmcnab
18	fixed bug :-^
19
20	Revision 1.1 1999/07/01 04:01:46 rjmcnab
21	Initial revision.
22
23	*/
24
25
26	#include "phrasesearch.h"
27	#include "gsdlunicode.h"
28
29	inline unsigned char parse_nonword_word (unsigned char here, unsigned char *end,
30	text_t &word) {
31	int c_len = 0;
32	unsigned short c = 0;
33
34	word.clear();
35
36	// parse non word
37	while (here <= end) {
38	c_len = parse_utf8_char (here, end, &c);
39	if (is_unicode_letdig(c)) {
40	while (c_len > 0) {
41	word.push_back(*here);
42	here++; c_len--;
43	}
44	break;
45	}
46	here += c_len;
47	}
48
49	// parse word
50	while (here <= end) {
51	c_len = parse_utf8_char (here, end, &c);
52	if (!is_unicode_letdig(c)) {
53	here += c_len; // it is ok to skip a nonword character
54	break;
55	}
56	while (c_len > 0) {
57	word.push_back(*here);
58	here++; c_len--;
59	}
60	}
61
62	return here;
63	}
64
65
66	bool doc_phrase_search (unsigned char *doc, int doclen,
67	const vector<termfreqclass> &phrase) {
68	// note: this uses the most braindead search routine :-)
69	// however its not so bad as there shouldn't be many partial
70	// matches
71
72	// a null phrase matches anything
73	if (phrase.empty()) return true;
74
75	// if there is nothing then there can't be a match
76	if (doc == NULL \|\| doclen == 0) return false;
77
78	text_t doc_word;
79	doc_word.reserve (16);
80
81	bool first = true;
82
83	unsigned char *doc_here = doc;
84	unsigned char *doc_herefirstword = doc;
85	unsigned char *doc_end = doc+doclen-1; // unitool conventions :-/
86
87	while (doc_here <= doc_end) {
88	first = true;
89
90	// there will be at least one member of phrase (see above)
91	vector<termfreqclass>::const_iterator phrase_here = phrase.begin();
92	vector<termfreqclass>::const_iterator phrase_end = phrase.end();
93	do {
94	// get the next non-word ... and ignore it, then get the next word
95	doc_here = parse_nonword_word (doc_here, doc_end, doc_word);
96	if (first) {doc_herefirstword = doc_here; first = false;}
97
98	// break if this word is not the next in the phrase
99	if ((*phrase_here).utf8equivterms.find (doc_word) ==
100	(*phrase_here).utf8equivterms.end()) break;
101
102	phrase_here++;
103	} while (doc_here <= doc_end && phrase_here != phrase_end);
104
105	// see if we found a phrase
106	if (phrase_here == phrase_end) return true;
107
108	doc_here = doc_herefirstword; // set the counter back
109	}
110
111	return false;
112	}
113
114
115	// looks for the stemmed phrase in the metadata or text associated with
116	// an OID. This function has not been coded with all situations in mind
117	bool OID_phrase_search (mgsearchclass &mgsearch,
118	gdbmclass &gdbm,
119	const text_t &index,
120	const text_t &subcollection,
121	const text_t &language,
122	const text_t &longindex,
123	const text_t &collection,
124	const vector<termfreqclass> &phrase,
125	int docnum) {
126	// disect the long index to find out where the text should come from
127	text_t level, gran;
128	text_t::const_iterator longindex_here = longindex.begin();
129	text_t::const_iterator longindex_end = longindex.end();
130	longindex_here = getdelimitstr (longindex_here, longindex_end, ':', level);
131	longindex_here = getdelimitstr (longindex_here, longindex_end, ':', gran);
132
133	if (gran.empty()) return false;
134
135	if (gran == "text") {
136	char *doc = NULL;
137	int doclen = 0;
138
139	// get text from mg.
140	if (!mgsearch.mgdocument (index, subcollection, language, collection,
141	docnum, doc, doclen)) return false;
142	return doc_phrase_search ((unsigned char *)doc, doclen, phrase);
143	}
144
145	// get OID
146	char *metadata = NULL;
147	text_t::size_type metadata_len = 0;
148	infodbclass docnum_info;
149	infodbclass OID_info;
150
151	if (!gdbm.getinfo (docnum, docnum_info)) return false;
152	text_t &OID = docnum_info["section"];
153	if (OID.empty()) return false;
154
155	// get field
156	if (!gdbm.getinfo (OID, OID_info)) return false;
157
158	metadata = (to_utf8(OID_info[gran])).getcarr(metadata_len);
159	bool result = doc_phrase_search ((unsigned char *)metadata, metadata_len, phrase);
160	delete [] metadata;
161
162	return result;
163	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: