source: trunk/gsdl/src/recpt/highlighttext.cpp@ 10148

Last change on this file since 10148 was 9620, checked in by kjdon, 19 years ago

added some x++ -> ++x changes submitted by Emanuel Dejanu

  • Property svn:keywords set to Author Date Id Revision
File size: 8.1 KB
Line 
1/**********************************************************************
2 *
3 * highlighttext.cpp --
4 * Copyright (C) 2002 D L Consulting Ltd
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "highlighttext.h"
27#include "unitool.h"
28
29
30static void highlight_terms(const text_t &text, const TermInfo_tarray &terms, const text_t &shl,
31 const text_t &ehl, displayclass &disp, outconvertclass &outconvert,
32 ostream &textout);
33
34static void highlight_phrases(const text_t &text, const text_t &querystring, const TermInfo_tarray &terms,
35 const text_t &shl, const text_t &ehl, displayclass &disp,
36 outconvertclass &outconvert, ostream &textout);
37
38static void get_phrase_terms (const text_t &querystring, text_tarray &phrase_terms);
39
40static void remove_space (text_t &qstring);
41
42// highlights text string by adding _starthighlight_ and _endhightlight_
43// around terms and/or phrases that match querystring
44
45// - at present this only handles phrase searches where the first and last
46// characters are double quotes (i.e. it won't correctly handle a mixture
47// of phrase and non-phrase terms or queries containing multiple phrases) -
48// it also doesn't highlight stemmed variations of terms within a phrase
49// because the terminfo returned by mgqueryfilter doesn't currently tell
50// you which term variants belong to which term
51
52// - this function can be forced to treat the querystring like a phrase
53// even if it isn't one by setting the "hl" cgi argument to "2"
54void highlighttext(const text_t &text, cgiargsclass &args, const TermInfo_tarray &terms,
55 displayclass &disp, outconvertclass &outconvert, ostream &textout) {
56
57
58 text_t &querystring = args["q"];
59
60 // get the text to start and end a hightlight
61 text_t shl = "<b><u>";
62 text_t ehl = "</u></b>";
63 if (disp.isdefaultmacro(displayclass::defaultpackage, "starthighlight")) {
64 disp.expandstring(displayclass::defaultpackage, "_starthighlight_", shl);
65 }
66 if (disp.isdefaultmacro(displayclass::defaultpackage, "endhighlight")) {
67 disp.expandstring(displayclass::defaultpackage, "_endhighlight_", ehl);
68 }
69
70 // remove leading and trailing whitespace
71 remove_space(querystring);
72
73 if ((args["hl"] == 2) || ((*(querystring.begin()) == '"') && (*(querystring.end()-1) == '"'))) {
74 highlight_phrases(text, querystring, terms, shl, ehl, disp, outconvert, textout);
75 } else {
76 highlight_terms(text, terms, shl, ehl, disp, outconvert, textout);
77 }
78}
79
80void highlight_terms(const text_t &text, const TermInfo_tarray &terms, const text_t &shl,
81 const text_t &ehl, displayclass &disp, outconvertclass &outconvert,
82 ostream &textout) {
83
84 text_tmap allterms;
85 text_tmap::const_iterator it;
86
87 // first load all the term variations into a map
88 TermInfo_tarray::const_iterator this_term = terms.begin();
89 TermInfo_tarray::const_iterator last_term = terms.end();
90 while (this_term != last_term) {
91 text_tarray::const_iterator this_var = (*this_term).matchTerms.begin();
92 text_tarray::const_iterator last_var = (*this_term).matchTerms.end();
93 while (this_var != last_var) {
94 allterms[*this_var] = 1;
95 ++this_var;
96 }
97 ++this_term;
98 }
99
100 text_t::const_iterator here = text.begin();
101 text_t::const_iterator end = text.end();
102
103 text_t word, buffer;
104 while (here != end) {
105 if (is_unicode_letdig(*here)) {
106 // not word boundary
107 word.push_back(*here);
108 ++here;
109
110 } else {
111 // found word boundary
112 // add last word if there was one
113 if (!word.empty()) {
114 it = allterms.find(word);
115 if (it != allterms.end()) {
116 word = shl + word + ehl;
117 }
118 buffer += word;
119 word.clear();
120 }
121
122 if (*here == '<') {
123 // skip over rest of html tag
124 while ((here != end) && (*here != '>')) {
125 buffer.push_back(*here);
126 ++here;
127 }
128 }
129
130 buffer.push_back(*here);
131 ++here;
132
133 if (buffer.size() > 1024) {
134 textout << outconvert << disp << buffer;
135 buffer.clear();
136 }
137 }
138 }
139 textout << outconvert << disp << buffer;
140}
141
142void highlight_phrases(const text_t &text, const text_t &querystring, const TermInfo_tarray &terms,
143 const text_t &shl, const text_t &ehl, displayclass &disp,
144 outconvertclass &outconvert, ostream &textout) {
145
146 text_tmap allterms;
147 text_tarray phrase_terms;
148 text_tmap::const_iterator it;
149
150 get_phrase_terms(querystring, phrase_terms);
151 int phraselen = phrase_terms.size();
152
153 TermInfo_tarray::const_iterator this_term = terms.begin();
154 TermInfo_tarray::const_iterator last_term = terms.end();
155 bool first = true;
156 while (this_term != last_term) {
157 text_tarray::const_iterator this_var = (*this_term).matchTerms.begin();
158 text_tarray::const_iterator last_var = (*this_term).matchTerms.end();
159 while (this_var != last_var) {
160 allterms[*this_var] = 1;
161 ++this_var;
162 }
163 first = false;
164 ++this_term;
165 }
166
167 text_t::const_iterator here = text.begin();
168 text_t::const_iterator end = text.end();
169
170 text_t word, buffer;
171 int phrasecount = 0;
172 while (here != end) {
173 if (is_unicode_letdig(*here)) {
174 // not word boundary
175 word.push_back(*here);
176 ++here;
177
178 } else {
179 // found word boundary
180 // add last word if there was one
181 if (!word.empty()) {
182 it = allterms.find(word);
183 if (it != allterms.end()) {
184 // found a word that matches somewhere in the phrase
185
186 text_t lcword = word; lc(lcword);
187 if (lcword == phrase_terms[phrasecount]) {
188
189 if (phrasecount == 0) {
190 // clear the buffer (from here on buffer will contain the phrase
191 // as it's built up)
192 textout << outconvert << disp << buffer;
193 buffer.clear();
194 }
195 ++phrasecount;
196 } else {
197 phrasecount = 0;
198 }
199 } else {
200 phrasecount = 0;
201 }
202 buffer += word;
203 word.clear();
204
205 if (phrasecount == phraselen) {
206 // have found entire phrase
207 textout << outconvert << disp << shl << buffer << ehl;
208 buffer.clear();
209 phrasecount = 0;
210 }
211 }
212
213 if (*here == '<') {
214 // skip over rest of html tag
215 while ((here != end) && (*here != '>')) {
216 buffer.push_back(*here);
217 ++here;
218 }
219 }
220
221 buffer.push_back(*here);
222 ++here;
223
224 if (buffer.size() > 1024 && phrasecount == 0) {
225 textout << outconvert << disp << buffer;
226 buffer.clear();
227 }
228 }
229 }
230 textout << outconvert << disp << buffer;
231}
232
233void get_phrase_terms (const text_t &querystring, text_tarray &phrase_terms) {
234
235 phrase_terms.erase(phrase_terms.begin(), phrase_terms.end());
236
237 text_t::const_iterator here = querystring.begin();
238 text_t::const_iterator end = querystring.end();
239
240 text_t word;
241 while (here != end) {
242 if (is_unicode_letdig(*here)) {
243 // not word boundary
244 word.push_back(*here);
245
246 } else {
247 // found word boundary
248 if (!word.empty()) {
249 lc(word);
250 phrase_terms.push_back(word);
251 word.clear();
252 }
253 }
254 ++here;
255 }
256
257 if (!word.empty()) {
258 lc(word);
259 phrase_terms.push_back(word);
260 }
261}
262
263void remove_space (text_t &qstring) {
264
265 text_t altered_string;
266 text_t space;
267
268 text_t::const_iterator here = qstring.begin();
269 text_t::const_iterator end = qstring.end();
270 while (here != end) {
271 if (is_unicode_space(*here)) {
272 space.push_back(*here);
273 } else {
274 if (!altered_string.empty()) {
275 altered_string += space;
276 }
277 space.clear();
278 altered_string.push_back(*here);
279 }
280 ++here;
281 }
282
283 qstring = altered_string;
284}
Note: See TracBrowser for help on using the repository browser.