Changeset 3322


Ignore:
Timestamp:
2002-08-05T15:15:18+12:00 (22 years ago)
Author:
sjboddie
Message:

Attempted to improve search term highlighting. Simple phrase searches will
now be highlighted correctly rather than simply highlighting all terms that
appear in the phrase. There are still many cases where highlighting won't
work quite as expected though, e.g. when stemming is turned on and there
are stemmed terms within the phrase, or when there is more than one phrase
in the query string.

Location:
trunk/gsdl/src/recpt
Files:
2 added
4 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/recpt/Makefile.in

    r3321 r3322  
    138138        datelistbrowserclass.h invbrowserclass.h pagedbrowserclass.h \
    139139        htmlbrowserclass.h phindbrowserclass.h historydb.h collectoraction.h \
    140         phindaction.h summarise.h
     140        phindaction.h summarise.h highlighttext.h
    141141
    142142RECEPTHEADERS =
     
    160160        invbrowserclass.cpp pagedbrowserclass.cpp htmlbrowserclass.cpp \
    161161        phindbrowserclass.cpp  historydb.cpp collectoraction.cpp argdb.cpp \
    162         browseactiontools.h formattools.cpp phindaction.cpp summarise.cpp
     162        browseactiontools.h formattools.cpp phindaction.cpp summarise.cpp \
     163        highlighttext.cpp
    163164
    164165
     
    192193        pagedbrowserclass.o htmlbrowserclass.o phindbrowserclass.o \
    193194        historydb.o $(Z_COMOBJS) collectoraction.o phindaction.o \
    194         summarise.o
     195        summarise.o highlighttext.o
    195196
    196197RECPTOBJECTS = recptmain.o
  • trunk/gsdl/src/recpt/documentaction.cpp

    r3087 r3322  
    3131#include "unitool.h"
    3232#include "gsdltools.h"
     33#include "highlighttext.h"
    3334
    3435documentaction::documentaction () {
     
    106107
    107108  // in this action "hl" is the "highlighting on/
    108   // highlighting off control
     109  // highlighting off control ("hl" == "2" is a
     110  // special case that forces phrase highlighting
     111  // to be used even if the query string doesn't
     112  // appear to be a phrase)
    109113  arg_ainfo.shortname = "hl";
    110114  arg_ainfo.longname = "highlighting on/off";
     
    181185  // check hl argument
    182186  int arg_hl = args.getintarg("hl");
    183   if (arg_hl != 0 && arg_hl != 1) {
     187  if (arg_hl < 0 || arg_hl > 2) {
    184188    logout << "Warning: \"hl\" argument out of range (" << arg_hl << ")\n";
    185189    cgiarginfo *hlinfo = argsinfo.getarginfo ("hl");
     
    782786                  text_t &collection, recptproto *collectproto,
    783787                  displayclass &disp, outconvertclass &outconvert,
    784                   ostream &textout, ostream &logout) {
     788                  ostream &textout, ostream &logout,
     789                  cgiargsclass &args) {
    785790 
    786791  DocumentRequest_t docrequest;
     
    798803      // (wanttext will equal 2 if we want text and other stuff too)
    799804      if (wanttext == 1)
    800     if (highlight)
    801       highlighttext(docresponse.doc, terminfo, disp, outconvert, textout, logout);
    802     else
     805    if (highlight) {
     806      highlighttext(docresponse.doc, args, terminfo, disp, outconvert, textout);
     807    } else {
    803808      textout << outconvert << disp << docresponse.doc;
     809    }
    804810    }
    805811 
    806812    if (wanttext != 1) {
    807813      text_t doctext
    808       = get_formatted_string (collection, collectproto,
    809                   docinfo, disp, formatlistptr, docresponse.doc,
    810                   logout);
     814    = get_formatted_string (collection, collectproto,
     815                docinfo, disp, formatlistptr, docresponse.doc,
     816                logout);
    811817     
    812       if (highlight)
    813     highlighttext(doctext, terminfo, disp, outconvert, textout, logout);
    814       else
     818      if (highlight) {
     819    highlighttext(doctext, args, terminfo, disp, outconvert, textout);
     820      } else {
    815821    textout << outconvert << disp << doctext;
     822      }
    816823    }
    817824  }
     
    876883      output_text (inforesponse.docInfo[0], formatlistptr, queryresponse.termInfo,
    877884           OID, highlight, hastxt, wanttext, collection, collectproto,
    878            disp, outconvert, textout, logout);
     885           disp, outconvert, textout, logout, args);
    879886
    880887
     
    919926      output_text (thisdocinfo, formatlistptr, queryresponse.termInfo,
    920927               OID, highlight, hastxt, wanttext, collection,
    921                collectproto, disp, outconvert, textout, logout);
     928               collectproto, disp, outconvert, textout, logout, args);
    922929     
    923930    }
     
    936943      output_text (*sechere, formatlistptr, queryresponse.termInfo,
    937944               (*sechere).OID, highlight, shastxt, wanttext, collection,
    938                collectproto, disp, outconvert, textout, logout);
     945               collectproto, disp, outconvert, textout, logout, args);
    939946      count ++;
    940947      sechere ++;
     
    945952  delete formatlistptr;
    946953}
    947 
    948 // highlighttext highlights query terms in text string and outputs the resulting text string
    949 void documentaction::highlighttext(text_t &text, const TermInfo_tarray &terms,
    950                    displayclass &disp, outconvertclass &outconvert,
    951                    ostream &textout, ostream &/*logout*/) {
    952 
    953   text_tmap allterms;
    954   text_tmap::const_iterator it;
    955 
    956   // first load all the term variations into a map
    957   TermInfo_tarray::const_iterator this_term = terms.begin();
    958   TermInfo_tarray::const_iterator last_term = terms.end();
    959   while (this_term != last_term) {
    960     text_tarray::const_iterator this_var = (*this_term).matchTerms.begin();
    961     text_tarray::const_iterator last_var = (*this_term).matchTerms.end();
    962     while (this_var != last_var) {
    963       allterms[*this_var] = 1;
    964       this_var ++;
    965     }
    966     this_term ++;
    967   }
    968 
    969   // get the text to start and end a hightlight
    970   text_t starthighlight = "<b><u>";
    971   text_t endhighlight = "</u></b>";
    972   if (disp.isdefaultmacro("Global", "starthighlight"))
    973     disp.expandstring("Global", "_starthighlight_", starthighlight);
    974   if (disp.isdefaultmacro("Global", "endhighlight"))
    975     disp.expandstring("Global", "_endhighlight_", endhighlight);
    976 
    977 
    978   text_t::iterator here = text.begin();
    979   text_t::iterator end = text.end();
    980   text_t word, buffer;
    981   while (here != end) {
    982     if (is_unicode_letdig(*here)) {
    983       // not word boundary
    984       word.push_back(*here);
    985       here++;
    986 
    987     } else {
    988       // found word boundary
    989       // add last word if there was one
    990       if (!word.empty()) {
    991     it = allterms.find(word);
    992     if (it != allterms.end()) {
    993       word = starthighlight + word + endhighlight;
    994     }
    995     buffer += word;
    996         word.clear();
    997       }
    998 
    999       if (*here == '<') {
    1000         // skip over rest of html tag
    1001     while ((here != end) && (*here != '>')) {
    1002       buffer.push_back(*here);
    1003       here++;
    1004     }
    1005       }
    1006 
    1007       buffer.push_back(*here);
    1008       here++;
    1009 
    1010       if (buffer.size() > 1024) {
    1011     textout << outconvert << disp << buffer;
    1012     buffer.clear();
    1013       }
    1014     }
    1015   }
    1016   textout << outconvert << disp << buffer;
    1017 }
  • trunk/gsdl/src/recpt/documentaction.h

    r1805 r3322  
    5454                text_t &collection, recptproto *collectproto,
    5555                displayclass &disp, outconvertclass &outconvert,
    56                 ostream &textout, ostream &logout);
     56                ostream &textout, ostream &logout,
     57                cgiargsclass &args);
    5758
    5859  virtual void output_document (const text_t &OID, cgiargsclass &args,
     
    6061                outconvertclass &outconvert, ostream &textout,
    6162                ostream &logout);
    62 
    63   void highlighttext(text_t &text, const TermInfo_tarray &terms,
    64              displayclass &disp, outconvertclass &outconvert,
    65              ostream &textout, ostream &logout);
    6663
    6764  formatinfo_t formatinfo;
  • trunk/gsdl/src/recpt/win32.mak

    r3321 r3322  
    8080        htmlbrowserclass.h historydb.h phindbrowserclass.h collectoraction.h \
    8181        nullproto.h argdb.h browseaction.h browseactiontools.h phindaction.h \
    82         summarise.h
     82        summarise.h highlighttext.h
    8383
    8484CGIHEADERS = cgiwrapper.h
     
    9797        historydb.cpp phindbrowserclass.cpp collectoraction.cpp nullproto.cpp \
    9898        argdb.cpp browseaction.cpp browseactiontools.cpp phindaction.cpp \
    99         summarise.cpp
     99        summarise.cpp highlighttext.cpp
    100100
    101101CGISOURCES = librarymain.cpp cgiwrapper.cpp
     
    114114        historydb.obj phindbrowserclass.obj collectoraction.obj nullproto.obj \
    115115        argdb.obj browseaction.obj browseactiontools.obj phindaction.obj \
    116         summarise.obj
     116        summarise.obj highlighttext.obj
    117117
    118118CGIOBJECTS = librarymain.obj cgiwrapper.obj
Note: See TracChangeset for help on using the changeset viewer.