/**********************************************************************
 *
 * highlighttext.cpp --
 * Copyright (C) 2002 DL Consulting Ltd
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

#include "highlighttext.h"
#include "unitool.h"

// Pending output is written to the stream once the buffer grows past
// this many characters (never in the middle of a partially matched phrase).
static const unsigned int HIGHLIGHT_FLUSH_SIZE = 1024;

static void highlight_terms(const text_t &text, const TermInfo_tarray &terms,
                            const text_t &shl, const text_t &ehl,
                            displayclass &disp, outconvertclass &outconvert,
                            ostream &textout);

static void highlight_phrases(const text_t &text, const text_t &querystring,
                              const TermInfo_tarray &terms,
                              const text_t &shl, const text_t &ehl,
                              displayclass &disp, outconvertclass &outconvert,
                              ostream &textout);

static void get_phrase_terms (const text_t &querystring,
                              text_tarray &phrase_terms);

static void remove_space (text_t &qstring);

static void load_term_variants (const TermInfo_tarray &terms,
                                text_tmap &allterms);

static void copy_boundary (text_t::const_iterator &here,
                           const text_t::const_iterator &end,
                           text_t &buffer);

// highlights text string by adding _starthighlight_ and _endhighlight_
// around terms and/or phrases that match querystring
// - at present this only handles phrase searches where the first and last
//   characters are double quotes (i.e. it won't correctly handle a mixture
//   of phrase and non-phrase terms or queries containing multiple phrases)
// - it also doesn't highlight stemmed variations of terms within a phrase
//   because the terminfo returned by mgqueryfilter doesn't currently tell
//   you which term variants belong to which term
// - this function can be forced to treat the querystring like a phrase
//   even if it isn't one by setting the "hl" cgi argument to "2"
//
// text       - the text to highlight (macros in it are expanded first)
// args       - cgi arguments; "q" is read (and whitespace-trimmed in place),
//              "hl" is read
// terms      - term information whose matchTerms are the words to highlight
// disp       - used to expand macros and passed to the output stream
// outconvert - output converter passed to the output stream
// textout    - stream the highlighted text is written to
void highlighttext(const text_t &text, cgiargsclass &args,
                   const TermInfo_tarray &terms, displayclass &disp,
                   outconvertclass &outconvert, ostream &textout) {

  text_t &querystring = args["q"];

  // get the text to start and end a highlight
  text_t shl = "";
  text_t ehl = "";
  if (disp.isdefaultmacro(displayclass::defaultpackage, "starthighlight")) {
    disp.expandstring(displayclass::defaultpackage, "_starthighlight_", shl);
  }
  if (disp.isdefaultmacro(displayclass::defaultpackage, "endhighlight")) {
    disp.expandstring(displayclass::defaultpackage, "_endhighlight_", ehl);
  }

  // remove leading and trailing whitespace
  remove_space(querystring);

  // Expand macros before highlighting -- by Jens Wille
  text_t text_expanded = "";
  disp.expandstring(text, text_expanded);

  // a phrase query is one wrapped in double quotes; the size check keeps us
  // from dereferencing begin()/end()-1 of an empty query string
  const bool quoted = (querystring.size() >= 2) &&
    (*(querystring.begin()) == '"') && (*(querystring.end()-1) == '"');

  if ((args["hl"] == 2) || quoted) {
    highlight_phrases(text_expanded, querystring, terms,
                      shl, ehl, disp, outconvert, textout);
  } else {
    highlight_terms(text_expanded, terms,
                    shl, ehl, disp, outconvert, textout);
  }
}

// Writes text to textout, wrapping every word that exactly equals one of
// the match terms in shl ... ehl. Words are maximal runs of characters for
// which is_unicode_letdig() is true; the contents of html tags are copied
// through untouched.
void highlight_terms(const text_t &text, const TermInfo_tarray &terms,
                     const text_t &shl, const text_t &ehl,
                     displayclass &disp, outconvertclass &outconvert,
                     ostream &textout) {

  // first load all the term variations into a map
  text_tmap allterms;
  load_term_variants(terms, allterms);

  text_t::const_iterator here = text.begin();
  text_t::const_iterator end = text.end();
  text_t word, buffer;

  while (true) {
    const bool at_end = (here == end);

    if (!at_end && is_unicode_letdig(*here)) {
      // not word boundary
      word.push_back(*here);
      ++here;
      continue;
    }

    // found word boundary (the end of the text counts as one, so a
    // trailing word is not lost) - add last word if there was one
    if (!word.empty()) {
      if (allterms.find(word) != allterms.end()) {
        buffer += shl;
        buffer += word;
        buffer += ehl;
      } else {
        buffer += word;
      }
      word.clear();
    }

    if (at_end) break;

    copy_boundary(here, end, buffer);

    if (buffer.size() > HIGHLIGHT_FLUSH_SIZE) {
      textout << outconvert << disp << buffer;
      buffer.clear();
    }
  }
  textout << outconvert << disp << buffer;
}

// Writes text to textout, wrapping each occurrence of the phrase given by
// querystring in shl ... ehl. A phrase occurrence is a run of consecutive
// words, each of which is a known match term and whose lowercased form
// equals the corresponding phrase term. Any characters between those words
// (including html tags) end up inside the highlighted span.
// If querystring contains no words at all this falls back to
// highlight_terms().
void highlight_phrases(const text_t &text, const text_t &querystring,
                       const TermInfo_tarray &terms,
                       const text_t &shl, const text_t &ehl,
                       displayclass &disp, outconvertclass &outconvert,
                       ostream &textout) {

  text_tarray phrase_terms;
  get_phrase_terms(querystring, phrase_terms);

  // with no phrase terms there is nothing to index into below, so
  // highlight individual terms instead
  if (phrase_terms.empty()) {
    highlight_terms(text, terms, shl, ehl, disp, outconvert, textout);
    return;
  }
  const text_tarray::size_type phraselen = phrase_terms.size();

  text_tmap allterms;
  load_term_variants(terms, allterms);

  text_t::const_iterator here = text.begin();
  text_t::const_iterator end = text.end();
  text_t word, buffer;
  // number of leading phrase terms matched so far
  text_tarray::size_type phrasecount = 0;

  while (true) {
    const bool at_end = (here == end);

    if (!at_end && is_unicode_letdig(*here)) {
      // not word boundary
      word.push_back(*here);
      ++here;
      continue;
    }

    // found word boundary (the end of the text counts as one, so a
    // trailing word is not lost) - add last word if there was one
    if (!word.empty()) {
      bool matched = false;

      if (allterms.find(word) != allterms.end()) {
        // found a word that matches somewhere in the phrase
        text_t lcword = word;
        lc(lcword);
        if (lcword == phrase_terms[phrasecount]) {
          matched = true;
        } else if (phrasecount > 0 && lcword == phrase_terms[0]) {
          // the partial match is broken, but this word may begin a new
          // occurrence of the phrase (e.g. "a a b" against phrase "a b")
          phrasecount = 0;
          matched = true;
        }
      }

      if (matched) {
        if (phrasecount == 0) {
          // clear the buffer (from here on buffer will contain the phrase
          // as it's built up)
          textout << outconvert << disp << buffer;
          buffer.clear();
        }
        ++phrasecount;
      } else {
        phrasecount = 0;
      }

      buffer += word;
      word.clear();

      if (phrasecount == phraselen) {
        // have found entire phrase
        textout << outconvert << disp << shl << buffer << ehl;
        buffer.clear();
        phrasecount = 0;
      }
    }

    if (at_end) break;

    copy_boundary(here, end, buffer);

    // never flush part-way through a candidate phrase, as the buffer must
    // hold the whole phrase when the highlight is written
    if (buffer.size() > HIGHLIGHT_FLUSH_SIZE && phrasecount == 0) {
      textout << outconvert << disp << buffer;
      buffer.clear();
    }
  }
  textout << outconvert << disp << buffer;
}

// Splits querystring into lowercased words (maximal runs of characters for
// which is_unicode_letdig() is true) and stores them, in order, in
// phrase_terms. Any previous contents of phrase_terms are discarded.
void get_phrase_terms (const text_t &querystring,
                       text_tarray &phrase_terms) {

  phrase_terms.clear();

  text_t word;
  text_t::const_iterator end = querystring.end();
  for (text_t::const_iterator here = querystring.begin();
       here != end; ++here) {
    if (is_unicode_letdig(*here)) {
      // not word boundary
      word.push_back(*here);
    } else if (!word.empty()) {
      // found word boundary
      lc(word);
      phrase_terms.push_back(word);
      word.clear();
    }
  }

  // the query may end in the middle of a word
  if (!word.empty()) {
    lc(word);
    phrase_terms.push_back(word);
  }
}

// Strips leading and trailing whitespace (as judged by is_unicode_space())
// from qstring in place. Whitespace between non-space characters is kept.
void remove_space (text_t &qstring) {

  text_t altered_string;
  // whitespace seen since the last non-space character; only emitted if
  // another non-space character follows, which drops trailing whitespace
  text_t space;

  text_t::const_iterator here = qstring.begin();
  text_t::const_iterator end = qstring.end();
  while (here != end) {
    if (is_unicode_space(*here)) {
      space.push_back(*here);
    } else {
      // whitespace before the first non-space character is discarded
      if (!altered_string.empty()) {
        altered_string += space;
      }
      space.clear();
      altered_string.push_back(*here);
    }
    ++here;
  }

  qstring = altered_string;
}

// Adds every entry of every term's matchTerms to allterms as a key.
void load_term_variants (const TermInfo_tarray &terms,
                         text_tmap &allterms) {

  TermInfo_tarray::const_iterator this_term = terms.begin();
  TermInfo_tarray::const_iterator last_term = terms.end();
  while (this_term != last_term) {
    text_tarray::const_iterator this_var = (*this_term).matchTerms.begin();
    text_tarray::const_iterator last_var = (*this_term).matchTerms.end();
    while (this_var != last_var) {
      allterms[*this_var] = 1;
      ++this_var;
    }
    ++this_term;
  }
}

// Copies the non-word character at here into buffer and advances past it.
// If that character is '<' the rest of the html tag, up to and including
// the closing '>', is copied as well so that tag contents are never
// treated as words. An unterminated tag is copied up to end and no further.
// Must only be called with here != end.
void copy_boundary (text_t::const_iterator &here,
                    const text_t::const_iterator &end,
                    text_t &buffer) {

  if (*here == '<') {
    // skip over rest of html tag
    while ((here != end) && (*here != '>')) {
      buffer.push_back(*here);
      ++here;
    }
  }

  // here == end only when a tag was left unterminated
  if (here != end) {
    buffer.push_back(*here);
    ++here;
  }
}