source: main/trunk/greenstone2/runtime-src/src/recpt/highlighttext.cpp@ 22744

Last change on this file since 22744 was 22744, checked in by mdewsnip, 11 years ago

Minor copyright statement fixes.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.2 KB
Line 
1/**********************************************************************
2 *
3 * highlighttext.cpp --
4 * Copyright (C) 2002 DL Consulting Ltd
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "highlighttext.h"
27#include "unitool.h"
28
29
30static void highlight_terms(const text_t &text, const TermInfo_tarray &terms, const text_t &shl,
31 const text_t &ehl, displayclass &disp, outconvertclass &outconvert,
32 ostream &textout);
33
34static void highlight_phrases(const text_t &text, const text_t &querystring, const TermInfo_tarray &terms,
35 const text_t &shl, const text_t &ehl, displayclass &disp,
36 outconvertclass &outconvert, ostream &textout);
37
38static void get_phrase_terms (const text_t &querystring, text_tarray &phrase_terms);
39
40static void remove_space (text_t &qstring);
41
42// highlights text string by adding _starthighlight_ and _endhightlight_
43// around terms and/or phrases that match querystring
44
45// - at present this only handles phrase searches where the first and last
46// characters are double quotes (i.e. it won't correctly handle a mixture
47// of phrase and non-phrase terms or queries containing multiple phrases) -
48// it also doesn't highlight stemmed variations of terms within a phrase
49// because the terminfo returned by mgqueryfilter doesn't currently tell
50// you which term variants belong to which term
51
52// - this function can be forced to treat the querystring like a phrase
53// even if it isn't one by setting the "hl" cgi argument to "2"
54void highlighttext(const text_t &text, cgiargsclass &args, const TermInfo_tarray &terms,
55 displayclass &disp, outconvertclass &outconvert, ostream &textout) {
56
57
58 text_t &querystring = args["q"];
59
60 // get the text to start and end a hightlight
61 text_t shl = "<b><u>";
62 text_t ehl = "</u></b>";
63 if (disp.isdefaultmacro(displayclass::defaultpackage, "starthighlight")) {
64 disp.expandstring(displayclass::defaultpackage, "_starthighlight_", shl);
65 }
66 if (disp.isdefaultmacro(displayclass::defaultpackage, "endhighlight")) {
67 disp.expandstring(displayclass::defaultpackage, "_endhighlight_", ehl);
68 }
69
70 // remove leading and trailing whitespace
71 remove_space(querystring);
72
73 // Expand macros before highlighting -- by Jens Wille
74 text_t text_expanded = "";
75 disp.expandstring(text, text_expanded);
76
77 if ((args["hl"] == 2) || ((*(querystring.begin()) == '"') && (*(querystring.end()-1) == '"'))) {
78 highlight_phrases(text_expanded, querystring, terms, shl, ehl, disp, outconvert, textout);
79 } else {
80 highlight_terms(text_expanded, terms, shl, ehl, disp, outconvert, textout);
81 }
82}
83
84void highlight_terms(const text_t &text, const TermInfo_tarray &terms, const text_t &shl,
85 const text_t &ehl, displayclass &disp, outconvertclass &outconvert,
86 ostream &textout) {
87
88 text_tmap allterms;
89 text_tmap::const_iterator it;
90
91 // first load all the term variations into a map
92 TermInfo_tarray::const_iterator this_term = terms.begin();
93 TermInfo_tarray::const_iterator last_term = terms.end();
94 while (this_term != last_term) {
95 text_tarray::const_iterator this_var = (*this_term).matchTerms.begin();
96 text_tarray::const_iterator last_var = (*this_term).matchTerms.end();
97 while (this_var != last_var) {
98 allterms[*this_var] = 1;
99 ++this_var;
100 }
101 ++this_term;
102 }
103
104 text_t::const_iterator here = text.begin();
105 text_t::const_iterator end = text.end();
106
107 text_t word, buffer;
108 while (here != end) {
109 if (is_unicode_letdig(*here)) {
110 // not word boundary
111 word.push_back(*here);
112 ++here;
113
114 } else {
115 // found word boundary
116 // add last word if there was one
117 if (!word.empty()) {
118 it = allterms.find(word);
119 if (it != allterms.end()) {
120 word = shl + word + ehl;
121 }
122 buffer += word;
123 word.clear();
124 }
125
126 if (*here == '<') {
127 // skip over rest of html tag
128 while ((here != end) && (*here != '>')) {
129 buffer.push_back(*here);
130 ++here;
131 }
132 }
133
134 buffer.push_back(*here);
135 ++here;
136
137 if (buffer.size() > 1024) {
138 textout << outconvert << disp << buffer;
139 buffer.clear();
140 }
141 }
142 }
143 textout << outconvert << disp << buffer;
144}
145
146void highlight_phrases(const text_t &text, const text_t &querystring, const TermInfo_tarray &terms,
147 const text_t &shl, const text_t &ehl, displayclass &disp,
148 outconvertclass &outconvert, ostream &textout) {
149
150 text_tmap allterms;
151 text_tarray phrase_terms;
152 text_tmap::const_iterator it;
153
154 get_phrase_terms(querystring, phrase_terms);
155 int phraselen = phrase_terms.size();
156
157 TermInfo_tarray::const_iterator this_term = terms.begin();
158 TermInfo_tarray::const_iterator last_term = terms.end();
159 bool first = true;
160 while (this_term != last_term) {
161 text_tarray::const_iterator this_var = (*this_term).matchTerms.begin();
162 text_tarray::const_iterator last_var = (*this_term).matchTerms.end();
163 while (this_var != last_var) {
164 allterms[*this_var] = 1;
165 ++this_var;
166 }
167 first = false;
168 ++this_term;
169 }
170
171 text_t::const_iterator here = text.begin();
172 text_t::const_iterator end = text.end();
173
174 text_t word, buffer;
175 int phrasecount = 0;
176 while (here != end) {
177 if (is_unicode_letdig(*here)) {
178 // not word boundary
179 word.push_back(*here);
180 ++here;
181
182 } else {
183 // found word boundary
184 // add last word if there was one
185 if (!word.empty()) {
186 it = allterms.find(word);
187 if (it != allterms.end()) {
188 // found a word that matches somewhere in the phrase
189
190 text_t lcword = word; lc(lcword);
191 if (lcword == phrase_terms[phrasecount]) {
192
193 if (phrasecount == 0) {
194 // clear the buffer (from here on buffer will contain the phrase
195 // as it's built up)
196 textout << outconvert << disp << buffer;
197 buffer.clear();
198 }
199 ++phrasecount;
200 } else {
201 phrasecount = 0;
202 }
203 } else {
204 phrasecount = 0;
205 }
206 buffer += word;
207 word.clear();
208
209 if (phrasecount == phraselen) {
210 // have found entire phrase
211 textout << outconvert << disp << shl << buffer << ehl;
212 buffer.clear();
213 phrasecount = 0;
214 }
215 }
216
217 if (*here == '<') {
218 // skip over rest of html tag
219 while ((here != end) && (*here != '>')) {
220 buffer.push_back(*here);
221 ++here;
222 }
223 }
224
225 buffer.push_back(*here);
226 ++here;
227
228 if (buffer.size() > 1024 && phrasecount == 0) {
229 textout << outconvert << disp << buffer;
230 buffer.clear();
231 }
232 }
233 }
234 textout << outconvert << disp << buffer;
235}
236
237void get_phrase_terms (const text_t &querystring, text_tarray &phrase_terms) {
238
239 phrase_terms.erase(phrase_terms.begin(), phrase_terms.end());
240
241 text_t::const_iterator here = querystring.begin();
242 text_t::const_iterator end = querystring.end();
243
244 text_t word;
245 while (here != end) {
246 if (is_unicode_letdig(*here)) {
247 // not word boundary
248 word.push_back(*here);
249
250 } else {
251 // found word boundary
252 if (!word.empty()) {
253 lc(word);
254 phrase_terms.push_back(word);
255 word.clear();
256 }
257 }
258 ++here;
259 }
260
261 if (!word.empty()) {
262 lc(word);
263 phrase_terms.push_back(word);
264 }
265}
266
267void remove_space (text_t &qstring) {
268
269 text_t altered_string;
270 text_t space;
271
272 text_t::const_iterator here = qstring.begin();
273 text_t::const_iterator end = qstring.end();
274 while (here != end) {
275 if (is_unicode_space(*here)) {
276 space.push_back(*here);
277 } else {
278 if (!altered_string.empty()) {
279 altered_string += space;
280 }
281 space.clear();
282 altered_string.push_back(*here);
283 }
284 ++here;
285 }
286
287 qstring = altered_string;
288}
Note: See TracBrowser for help on using the repository browser.