Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/src/colservr/phrasesearch.cpp@ 328

Last change on this file since 328 was 328, checked in by rjmcnab, 25 years ago
Initial revision.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 4.0 KB

Line
1	/**********************************************************************
2	*
3	* phrasesearch.cpp -- tools to search for a phrase in a larger text
4	* Copyright (C) 1999 DigiLib Systems Limited
5	*
6	* PUT COPYRIGHT NOTICE HERE
7	*
8	* $Id: phrasesearch.cpp 328 1999-07-01 04:01:47Z rjmcnab $
9	*
10	*********************************************************************/
11
12	/*
13	$Log$
14	Revision 1.1 1999/07/01 04:01:46 rjmcnab
15	Initial revision.
16
17	*/
18
19
20	#include "phrasesearch.h"
21	#include "gsdlunicode.h"
22
23	inline unsigned char parse_nonword_word (unsigned char here, unsigned char *end,
24	text_t &word) {
25	int c_len = 0;
26	unsigned short c = 0;
27
28	word.clear();
29
30	// parse non word
31	while (here <= end) {
32	c_len = parse_utf8_char (here, end, &c);
33	here += c_len;
34	if (is_unicode_letdig(c)) {
35	word.push_back(c);
36	break;
37	}
38	}
39
40	// parse word
41	while (here <= end) {
42	c_len = parse_utf8_char (here, end, &c);
43	here += c_len; // it is ok to skip a nonword character
44	if (!is_unicode_letdig(c)) break;
45	word.push_back(c);
46	}
47
48	return here;
49	}
50
51
52	bool doc_phrase_search (unsigned char *doc, int doclen,
53	const vector<termfreqclass> &phrase) {
54	// note: this uses the most braindead search routine :-)
55	// however its not so bad as there shouldn't be many partial
56	// matches
57
58	// a null phrase matches anything
59	if (phrase.empty()) return true;
60
61	// if there is nothing then there can't be a match
62	if (doc == NULL \|\| doclen == 0) return false;
63
64	text_t doc_word;
65	doc_word.reserve (16);
66
67	bool first = true;
68
69	unsigned char *doc_here = doc;
70	unsigned char *doc_herefirstword = doc;
71	unsigned char *doc_end = doc+doclen-1; // unitool conventions :-/
72
73	while (doc_here <= doc_end) {
74	first = true;
75
76	// there will be at least one member of phrase (see above)
77	vector<termfreqclass>::const_iterator phrase_here = phrase.begin();
78	vector<termfreqclass>::const_iterator phrase_end = phrase.end();
79	do {
80	// get the next non-word ... and ignore it, then get the next word
81	doc_here = parse_nonword_word (doc_here, doc_end, doc_word);
82	if (first) {doc_herefirstword = doc_here; first = false;}
83
84	// break if this word is not the next in the phrase
85	if ((*phrase_here).utf8equivterms.find (doc_word) ==
86	(*phrase_here).utf8equivterms.end()) break;
87
88	phrase_here++;
89	} while (doc_here <= doc_end && phrase_here != phrase_end);
90
91	// see if we found a phrase
92	if (phrase_here == phrase_end) return true;
93
94	doc_here = doc_herefirstword; // set the counter back
95	}
96
97	return false;
98	}
99
100
101	// looks for the stemmed phrase in the metadata or text associated with
102	// an OID. This function has not been coded with all situations in mind
103	bool OID_phrase_search (mgsearchclass &mgsearch,
104	gdbmclass &gdbm,
105	const text_t &index,
106	const text_t &longindex,
107	const text_t &collection,
108	const vector<termfreqclass> &phrase,
109	int docnum) {
110	// disect the long index to find out where the text should come from
111	text_t level, gran;
112	text_t::const_iterator longindex_here = longindex.begin();
113	text_t::const_iterator longindex_end = longindex.end();
114	longindex_here = getdelimitstr (longindex_here, longindex_end, ':', level);
115	longindex_here = getdelimitstr (longindex_here, longindex_end, ':', gran);
116
117	if (gran.empty()) return false;
118
119	if (gran == "text") {
120	char *doc = NULL;
121	int doclen = 0;
122
123	// get text from mg.
124	if (!mgsearch.mgdocument (index, collection, docnum, doc, doclen)) return false;
125	return doc_phrase_search ((unsigned char *)doc, doclen, phrase);
126	}
127
128	// get OID
129	char *metadata = NULL;
130	text_t::size_type metadata_len = 0;
131	infodbclass docnum_info;
132	infodbclass OID_info;
133
134	if (!gdbm.getinfo (docnum, docnum_info)) return false;
135	text_t &OID = docnum_info["section"];
136	if (OID.empty()) return false;
137
138	// get field
139	if (!gdbm.getinfo (OID, OID_info)) return false;
140
141	metadata = (to_utf8(OID_info[gran])).getcarr(metadata_len);
142	bool result = doc_phrase_search ((unsigned char *)metadata, metadata_len, phrase);
143	delete [] metadata;
144
145	return result;
146	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: