1 | /**********************************************************************
|
---|
2 | *
|
---|
3 | * highlighttext.cpp --
|
---|
4 | * Copyright (C) 2002 D L Consulting Ltd
|
---|
5 | *
|
---|
6 | * A component of the Greenstone digital library software
|
---|
7 | * from the New Zealand Digital Library Project at the
|
---|
8 | * University of Waikato, New Zealand.
|
---|
9 | *
|
---|
10 | * This program is free software; you can redistribute it and/or modify
|
---|
11 | * it under the terms of the GNU General Public License as published by
|
---|
12 | * the Free Software Foundation; either version 2 of the License, or
|
---|
13 | * (at your option) any later version.
|
---|
14 | *
|
---|
15 | * This program is distributed in the hope that it will be useful,
|
---|
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
18 | * GNU General Public License for more details.
|
---|
19 | *
|
---|
20 | * You should have received a copy of the GNU General Public License
|
---|
21 | * along with this program; if not, write to the Free Software
|
---|
22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
23 | *
|
---|
24 | *********************************************************************/
|
---|
25 |
|
---|
26 | #include "highlighttext.h"
|
---|
27 | #include "unitool.h"
|
---|
28 |
|
---|
29 |
|
---|
30 | static void highlight_terms(const text_t &text, const TermInfo_tarray &terms, const text_t &shl,
|
---|
31 | const text_t &ehl, displayclass &disp, outconvertclass &outconvert,
|
---|
32 | ostream &textout);
|
---|
33 |
|
---|
34 | static void highlight_phrases(const text_t &text, const text_t &querystring, const TermInfo_tarray &terms,
|
---|
35 | const text_t &shl, const text_t &ehl, displayclass &disp,
|
---|
36 | outconvertclass &outconvert, ostream &textout);
|
---|
37 |
|
---|
38 | static void get_phrase_terms (const text_t &querystring, text_tarray &phrase_terms);
|
---|
39 |
|
---|
40 | static void remove_space (text_t &qstring);
|
---|
41 |
|
---|
42 | // highlights text string by adding _starthighlight_ and _endhightlight_
|
---|
43 | // around terms and/or phrases that match querystring
|
---|
44 |
|
---|
45 | // - at present this only handles phrase searches where the first and last
|
---|
46 | // characters are double quotes (i.e. it won't correctly handle a mixture
|
---|
47 | // of phrase and non-phrase terms or queries containing multiple phrases) -
|
---|
48 | // it also doesn't highlight stemmed variations of terms within a phrase
|
---|
49 | // because the terminfo returned by mgqueryfilter doesn't currently tell
|
---|
50 | // you which term variants belong to which term
|
---|
51 |
|
---|
52 | // - this function can be forced to treat the querystring like a phrase
|
---|
53 | // even if it isn't one by setting the "hl" cgi argument to "2"
|
---|
54 | void highlighttext(const text_t &text, cgiargsclass &args, const TermInfo_tarray &terms,
|
---|
55 | displayclass &disp, outconvertclass &outconvert, ostream &textout) {
|
---|
56 |
|
---|
57 |
|
---|
58 | text_t &querystring = args["q"];
|
---|
59 |
|
---|
60 | // get the text to start and end a hightlight
|
---|
61 | text_t shl = "<b><u>";
|
---|
62 | text_t ehl = "</u></b>";
|
---|
63 | if (disp.isdefaultmacro(displayclass::defaultpackage, "starthighlight")) {
|
---|
64 | disp.expandstring(displayclass::defaultpackage, "_starthighlight_", shl);
|
---|
65 | }
|
---|
66 | if (disp.isdefaultmacro(displayclass::defaultpackage, "endhighlight")) {
|
---|
67 | disp.expandstring(displayclass::defaultpackage, "_endhighlight_", ehl);
|
---|
68 | }
|
---|
69 |
|
---|
70 | // remove leading and trailing whitespace
|
---|
71 | remove_space(querystring);
|
---|
72 |
|
---|
73 | // Expand macros before highlighting -- by Jens Wille
|
---|
74 | text_t text_expanded = "";
|
---|
75 | disp.expandstring(text, text_expanded);
|
---|
76 |
|
---|
77 | if ((args["hl"] == 2) || ((*(querystring.begin()) == '"') && (*(querystring.end()-1) == '"'))) {
|
---|
78 | highlight_phrases(text_expanded, querystring, terms, shl, ehl, disp, outconvert, textout);
|
---|
79 | } else {
|
---|
80 | highlight_terms(text_expanded, terms, shl, ehl, disp, outconvert, textout);
|
---|
81 | }
|
---|
82 | }
|
---|
83 |
|
---|
84 | void highlight_terms(const text_t &text, const TermInfo_tarray &terms, const text_t &shl,
|
---|
85 | const text_t &ehl, displayclass &disp, outconvertclass &outconvert,
|
---|
86 | ostream &textout) {
|
---|
87 |
|
---|
88 | text_tmap allterms;
|
---|
89 | text_tmap::const_iterator it;
|
---|
90 |
|
---|
91 | // first load all the term variations into a map
|
---|
92 | TermInfo_tarray::const_iterator this_term = terms.begin();
|
---|
93 | TermInfo_tarray::const_iterator last_term = terms.end();
|
---|
94 | while (this_term != last_term) {
|
---|
95 | text_tarray::const_iterator this_var = (*this_term).matchTerms.begin();
|
---|
96 | text_tarray::const_iterator last_var = (*this_term).matchTerms.end();
|
---|
97 | while (this_var != last_var) {
|
---|
98 | allterms[*this_var] = 1;
|
---|
99 | ++this_var;
|
---|
100 | }
|
---|
101 | ++this_term;
|
---|
102 | }
|
---|
103 |
|
---|
104 | text_t::const_iterator here = text.begin();
|
---|
105 | text_t::const_iterator end = text.end();
|
---|
106 |
|
---|
107 | text_t word, buffer;
|
---|
108 | while (here != end) {
|
---|
109 | if (is_unicode_letdig(*here)) {
|
---|
110 | // not word boundary
|
---|
111 | word.push_back(*here);
|
---|
112 | ++here;
|
---|
113 |
|
---|
114 | } else {
|
---|
115 | // found word boundary
|
---|
116 | // add last word if there was one
|
---|
117 | if (!word.empty()) {
|
---|
118 | it = allterms.find(word);
|
---|
119 | if (it != allterms.end()) {
|
---|
120 | word = shl + word + ehl;
|
---|
121 | }
|
---|
122 | buffer += word;
|
---|
123 | word.clear();
|
---|
124 | }
|
---|
125 |
|
---|
126 | if (*here == '<') {
|
---|
127 | // skip over rest of html tag
|
---|
128 | while ((here != end) && (*here != '>')) {
|
---|
129 | buffer.push_back(*here);
|
---|
130 | ++here;
|
---|
131 | }
|
---|
132 | }
|
---|
133 |
|
---|
134 | buffer.push_back(*here);
|
---|
135 | ++here;
|
---|
136 |
|
---|
137 | if (buffer.size() > 1024) {
|
---|
138 | textout << outconvert << disp << buffer;
|
---|
139 | buffer.clear();
|
---|
140 | }
|
---|
141 | }
|
---|
142 | }
|
---|
143 | textout << outconvert << disp << buffer;
|
---|
144 | }
|
---|
145 |
|
---|
146 | void highlight_phrases(const text_t &text, const text_t &querystring, const TermInfo_tarray &terms,
|
---|
147 | const text_t &shl, const text_t &ehl, displayclass &disp,
|
---|
148 | outconvertclass &outconvert, ostream &textout) {
|
---|
149 |
|
---|
150 | text_tmap allterms;
|
---|
151 | text_tarray phrase_terms;
|
---|
152 | text_tmap::const_iterator it;
|
---|
153 |
|
---|
154 | get_phrase_terms(querystring, phrase_terms);
|
---|
155 | int phraselen = phrase_terms.size();
|
---|
156 |
|
---|
157 | TermInfo_tarray::const_iterator this_term = terms.begin();
|
---|
158 | TermInfo_tarray::const_iterator last_term = terms.end();
|
---|
159 | bool first = true;
|
---|
160 | while (this_term != last_term) {
|
---|
161 | text_tarray::const_iterator this_var = (*this_term).matchTerms.begin();
|
---|
162 | text_tarray::const_iterator last_var = (*this_term).matchTerms.end();
|
---|
163 | while (this_var != last_var) {
|
---|
164 | allterms[*this_var] = 1;
|
---|
165 | ++this_var;
|
---|
166 | }
|
---|
167 | first = false;
|
---|
168 | ++this_term;
|
---|
169 | }
|
---|
170 |
|
---|
171 | text_t::const_iterator here = text.begin();
|
---|
172 | text_t::const_iterator end = text.end();
|
---|
173 |
|
---|
174 | text_t word, buffer;
|
---|
175 | int phrasecount = 0;
|
---|
176 | while (here != end) {
|
---|
177 | if (is_unicode_letdig(*here)) {
|
---|
178 | // not word boundary
|
---|
179 | word.push_back(*here);
|
---|
180 | ++here;
|
---|
181 |
|
---|
182 | } else {
|
---|
183 | // found word boundary
|
---|
184 | // add last word if there was one
|
---|
185 | if (!word.empty()) {
|
---|
186 | it = allterms.find(word);
|
---|
187 | if (it != allterms.end()) {
|
---|
188 | // found a word that matches somewhere in the phrase
|
---|
189 |
|
---|
190 | text_t lcword = word; lc(lcword);
|
---|
191 | if (lcword == phrase_terms[phrasecount]) {
|
---|
192 |
|
---|
193 | if (phrasecount == 0) {
|
---|
194 | // clear the buffer (from here on buffer will contain the phrase
|
---|
195 | // as it's built up)
|
---|
196 | textout << outconvert << disp << buffer;
|
---|
197 | buffer.clear();
|
---|
198 | }
|
---|
199 | ++phrasecount;
|
---|
200 | } else {
|
---|
201 | phrasecount = 0;
|
---|
202 | }
|
---|
203 | } else {
|
---|
204 | phrasecount = 0;
|
---|
205 | }
|
---|
206 | buffer += word;
|
---|
207 | word.clear();
|
---|
208 |
|
---|
209 | if (phrasecount == phraselen) {
|
---|
210 | // have found entire phrase
|
---|
211 | textout << outconvert << disp << shl << buffer << ehl;
|
---|
212 | buffer.clear();
|
---|
213 | phrasecount = 0;
|
---|
214 | }
|
---|
215 | }
|
---|
216 |
|
---|
217 | if (*here == '<') {
|
---|
218 | // skip over rest of html tag
|
---|
219 | while ((here != end) && (*here != '>')) {
|
---|
220 | buffer.push_back(*here);
|
---|
221 | ++here;
|
---|
222 | }
|
---|
223 | }
|
---|
224 |
|
---|
225 | buffer.push_back(*here);
|
---|
226 | ++here;
|
---|
227 |
|
---|
228 | if (buffer.size() > 1024 && phrasecount == 0) {
|
---|
229 | textout << outconvert << disp << buffer;
|
---|
230 | buffer.clear();
|
---|
231 | }
|
---|
232 | }
|
---|
233 | }
|
---|
234 | textout << outconvert << disp << buffer;
|
---|
235 | }
|
---|
236 |
|
---|
237 | void get_phrase_terms (const text_t &querystring, text_tarray &phrase_terms) {
|
---|
238 |
|
---|
239 | phrase_terms.erase(phrase_terms.begin(), phrase_terms.end());
|
---|
240 |
|
---|
241 | text_t::const_iterator here = querystring.begin();
|
---|
242 | text_t::const_iterator end = querystring.end();
|
---|
243 |
|
---|
244 | text_t word;
|
---|
245 | while (here != end) {
|
---|
246 | if (is_unicode_letdig(*here)) {
|
---|
247 | // not word boundary
|
---|
248 | word.push_back(*here);
|
---|
249 |
|
---|
250 | } else {
|
---|
251 | // found word boundary
|
---|
252 | if (!word.empty()) {
|
---|
253 | lc(word);
|
---|
254 | phrase_terms.push_back(word);
|
---|
255 | word.clear();
|
---|
256 | }
|
---|
257 | }
|
---|
258 | ++here;
|
---|
259 | }
|
---|
260 |
|
---|
261 | if (!word.empty()) {
|
---|
262 | lc(word);
|
---|
263 | phrase_terms.push_back(word);
|
---|
264 | }
|
---|
265 | }
|
---|
266 |
|
---|
267 | void remove_space (text_t &qstring) {
|
---|
268 |
|
---|
269 | text_t altered_string;
|
---|
270 | text_t space;
|
---|
271 |
|
---|
272 | text_t::const_iterator here = qstring.begin();
|
---|
273 | text_t::const_iterator end = qstring.end();
|
---|
274 | while (here != end) {
|
---|
275 | if (is_unicode_space(*here)) {
|
---|
276 | space.push_back(*here);
|
---|
277 | } else {
|
---|
278 | if (!altered_string.empty()) {
|
---|
279 | altered_string += space;
|
---|
280 | }
|
---|
281 | space.clear();
|
---|
282 | altered_string.push_back(*here);
|
---|
283 | }
|
---|
284 | ++here;
|
---|
285 | }
|
---|
286 |
|
---|
287 | qstring = altered_string;
|
---|
288 | }
|
---|