source: trunk/gsdl/src/recpt/highlighttext.cpp@ 7392

Last change on this file since 7392 was 5140, checked in by sjboddie, 21 years ago

Fix for search term highlighting code so it no longer becomes confused
if the query string contains leading or trailing space.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.0 KB
Line 
1/**********************************************************************
2 *
3 * highlighttext.cpp --
4 * Copyright (C) 2002 D L Consulting Ltd
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "highlighttext.h"
27#include "unitool.h"
28
29
// Writes text to textout, wrapping each individual word that matches one of
// the query term variants in shl/ehl.
30static void highlight_terms(const text_t &text, const TermInfo_tarray &terms, const text_t &shl,
31 const text_t &ehl, displayclass &disp, outconvertclass &outconvert,
32 ostream &textout);
33
// Writes text to textout, wrapping each complete occurrence of the phrase
// extracted from querystring in shl/ehl.
34static void highlight_phrases(const text_t &text, const text_t &querystring, const TermInfo_tarray &terms,
35 const text_t &shl, const text_t &ehl, displayclass &disp,
36 outconvertclass &outconvert, ostream &textout);
37
// Splits querystring into lowercased letter/digit words, stored in order in
// phrase_terms (any previous contents are discarded).
38static void get_phrase_terms (const text_t &querystring, text_tarray &phrase_terms);
39
// Strips leading and trailing whitespace from qstring in place.
40static void remove_space (text_t &qstring);
41
42// highlights text string by adding _starthighlight_ and _endhighlight_
43// around terms and/or phrases that match querystring
44
45// - at present this only handles phrase searches where the first and last
46// characters are double quotes (i.e. it won't correctly handle a mixture
47// of phrase and non-phrase terms or queries containing multiple phrases) -
48// it also doesn't highlight stemmed variations of terms within a phrase
49// because the terminfo returned by mgqueryfilter doesn't currently tell
50// you which term variants belong to which term
51
52// - this function can be forced to treat the querystring like a phrase
53// even if it isn't one by setting the "hl" cgi argument to "2"
54void highlighttext(const text_t &text, cgiargsclass &args, const TermInfo_tarray &terms,
55 displayclass &disp, outconvertclass &outconvert, ostream &textout) {
56
57
58 text_t &querystring = args["q"];
59
60 // get the text to start and end a hightlight
61 text_t shl = "<b><u>";
62 text_t ehl = "</u></b>";
63 if (disp.isdefaultmacro("Global", "starthighlight")) {
64 disp.expandstring("Global", "_starthighlight_", shl);
65 }
66 if (disp.isdefaultmacro("Global", "endhighlight")) {
67 disp.expandstring("Global", "_endhighlight_", ehl);
68 }
69
70 // remove leading and trailing whitespace
71 remove_space(querystring);
72
73 if ((args["hl"] == 2) || ((*(querystring.begin()) == '"') && (*(querystring.end()-1) == '"'))) {
74 highlight_phrases(text, querystring, terms, shl, ehl, disp, outconvert, textout);
75 } else {
76 highlight_terms(text, terms, shl, ehl, disp, outconvert, textout);
77 }
78}
79
80void highlight_terms(const text_t &text, const TermInfo_tarray &terms, const text_t &shl,
81 const text_t &ehl, displayclass &disp, outconvertclass &outconvert,
82 ostream &textout) {
83
84 text_tmap allterms;
85 text_tmap::const_iterator it;
86
87 // first load all the term variations into a map
88 TermInfo_tarray::const_iterator this_term = terms.begin();
89 TermInfo_tarray::const_iterator last_term = terms.end();
90 while (this_term != last_term) {
91 text_tarray::const_iterator this_var = (*this_term).matchTerms.begin();
92 text_tarray::const_iterator last_var = (*this_term).matchTerms.end();
93 while (this_var != last_var) {
94 allterms[*this_var] = 1;
95 this_var ++;
96 }
97 this_term ++;
98 }
99
100 text_t::const_iterator here = text.begin();
101 text_t::const_iterator end = text.end();
102
103 text_t word, buffer;
104 while (here != end) {
105 if (is_unicode_letdig(*here)) {
106 // not word boundary
107 word.push_back(*here);
108 here++;
109
110 } else {
111 // found word boundary
112 // add last word if there was one
113 if (!word.empty()) {
114 it = allterms.find(word);
115 if (it != allterms.end()) {
116 word = shl + word + ehl;
117 }
118 buffer += word;
119 word.clear();
120 }
121
122 if (*here == '<') {
123 // skip over rest of html tag
124 while ((here != end) && (*here != '>')) {
125 buffer.push_back(*here);
126 here++;
127 }
128 }
129
130 buffer.push_back(*here);
131 here++;
132
133 if (buffer.size() > 1024) {
134 textout << outconvert << disp << buffer;
135 buffer.clear();
136 }
137 }
138 }
139 textout << outconvert << disp << buffer;
140}
141
142void highlight_phrases(const text_t &text, const text_t &querystring, const TermInfo_tarray &terms,
143 const text_t &shl, const text_t &ehl, displayclass &disp,
144 outconvertclass &outconvert, ostream &textout) {
145
146 text_tmap allterms;
147 text_tarray phrase_terms;
148 text_tmap::const_iterator it;
149
150 get_phrase_terms(querystring, phrase_terms);
151 int phraselen = phrase_terms.size();
152
153 TermInfo_tarray::const_iterator this_term = terms.begin();
154 TermInfo_tarray::const_iterator last_term = terms.end();
155 bool first = true;
156 while (this_term != last_term) {
157 text_tarray::const_iterator this_var = (*this_term).matchTerms.begin();
158 text_tarray::const_iterator last_var = (*this_term).matchTerms.end();
159 while (this_var != last_var) {
160 allterms[*this_var] = 1;
161 this_var ++;
162 }
163 first = false;
164 this_term ++;
165 }
166
167 text_t::const_iterator here = text.begin();
168 text_t::const_iterator end = text.end();
169
170 text_t word, buffer;
171 int phrasecount = 0;
172 while (here != end) {
173 if (is_unicode_letdig(*here)) {
174 // not word boundary
175 word.push_back(*here);
176 here++;
177
178 } else {
179 // found word boundary
180 // add last word if there was one
181 if (!word.empty()) {
182 it = allterms.find(word);
183 if (it != allterms.end()) {
184 // found a word that matches somewhere in the phrase
185
186 text_t lcword = word; lc(lcword);
187 if (lcword == phrase_terms[phrasecount]) {
188
189 if (phrasecount == 0) {
190 // clear the buffer (from here on buffer will contain the phrase
191 // as it's built up)
192 textout << outconvert << disp << buffer;
193 buffer.clear();
194 }
195 phrasecount ++;
196 } else {
197 phrasecount = 0;
198 }
199 } else {
200 phrasecount = 0;
201 }
202 buffer += word;
203 word.clear();
204
205 if (phrasecount == phraselen) {
206 // have found entire phrase
207 textout << outconvert << disp << shl << buffer << ehl;
208 buffer.clear();
209 phrasecount = 0;
210 }
211 }
212
213 if (*here == '<') {
214 // skip over rest of html tag
215 while ((here != end) && (*here != '>')) {
216 buffer.push_back(*here);
217 here++;
218 }
219 }
220
221 buffer.push_back(*here);
222 here++;
223
224 if (buffer.size() > 1024 && phrasecount == 0) {
225 textout << outconvert << disp << buffer;
226 buffer.clear();
227 }
228 }
229 }
230 textout << outconvert << disp << buffer;
231}
232
233void get_phrase_terms (const text_t &querystring, text_tarray &phrase_terms) {
234
235 phrase_terms.erase(phrase_terms.begin(), phrase_terms.end());
236
237 text_t::const_iterator here = querystring.begin();
238 text_t::const_iterator end = querystring.end();
239
240 text_t word;
241 while (here != end) {
242 if (is_unicode_letdig(*here)) {
243 // not word boundary
244 word.push_back(*here);
245
246 } else {
247 // found word boundary
248 if (!word.empty()) {
249 lc(word);
250 phrase_terms.push_back(word);
251 word.clear();
252 }
253 }
254 here++;
255 }
256
257 if (!word.empty()) {
258 lc(word);
259 phrase_terms.push_back(word);
260 }
261}
262
263void remove_space (text_t &qstring) {
264
265 text_t altered_string;
266 text_t space;
267
268 text_t::const_iterator here = qstring.begin();
269 text_t::const_iterator end = qstring.end();
270 while (here != end) {
271 if (is_unicode_space(*here)) {
272 space.push_back(*here);
273 } else {
274 if (!altered_string.empty()) {
275 altered_string += space;
276 }
277 space.clear();
278 altered_string.push_back(*here);
279 }
280 here++;
281 }
282
283 qstring = altered_string;
284}
Note: See TracBrowser for help on using the repository browser.