Changeset 1633 for trunk/gsdl


Ignore:
Timestamp:
2000-10-31T10:07:22+13:00 (24 years ago)
Author:
paynter
Message:

Split expansion phrases into a prefix, body, and suffix (the old phind
used to do this). This information is used to format the HTML output.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/phind/host/phindcgi.cpp

    r1629 r1633  
    6969            bool &XMLmode);
    7070
    71 void print_expansions(char *cgi_script, char *collection, bool XMLmode,
    72               TextData &textdata, vector <unsigned long> dlist,
     71void print_expansions(char *cgi_script, char *collection, bool XMLmode, UCArray body,
     72              TextData &textdata, vector <unsigned long> elist,
    7373              unsigned long first, unsigned long last);
    7474
     
    9191             vector <unsigned long> &docnum,
    9292             vector <unsigned long> &docfrq);
     93
     94void split_phrase(UCArray word, UCArray body, UCArray &prefix, UCArray &suffix);
     95bool phrase_match(UCArray text, UCArray::iterator &here, UCArray::iterator end);
    9396
    9497void get_document_all_data(TextData &docdata, unsigned long docNum,
     
    203206       << "\" end=\"" << last_e << "\">" << endl;
    204207
    205       print_expansions(argv[0], collection, XMLmode, textdata, el, first_e, last_e);
    206 
     208      print_expansions(argv[0], collection, XMLmode, word, textdata, el, first_e, last_e);
    207209
    208210      cout << "</expansionlist>" << endl;
     
    217219      }
    218220
    219       cout << "<p><table><tr><th align=left>Phrase</th><th>freq</th><th>docs</th></tr>" << endl;
    220       print_expansions(argv[0], collection, XMLmode, textdata, el, first_e, last_e);
     221      cout << "<p><table border=0><tr><th align=left>Phrase</th><th>freq</th><th>docs</th></tr>" << endl;
     222      print_expansions(argv[0], collection, XMLmode, word, textdata, el, first_e, last_e);
    221223      cout << "</table>" << endl;
    222224
     
    304306// Print a list of expansions
    305307//
    306 // Given the textData and a list of phrase numbers,
    307 // print out each of the words.
    308 
    309 void print_expansions(char *cgi_script, char *collection, bool XMLmode,
    310               TextData &textdata, vector <unsigned long> dlist,
     308// Given the textData and a list of phrase numbers, print out each of the
     309// expansions.
     310
     311void print_expansions(char *cgi_script, char *collection, bool XMLmode, UCArray body,
     312              TextData &textdata, vector <unsigned long> elist,
    311313              unsigned long first, unsigned long last) {
    312314
    313315  UCArray word;
    314316  unsigned long phrase, tf, df, ef;
     317
     318  UCArray suffix, prefix;
    315319 
    316320  for (unsigned long e = first; e < last; e++) {
    317321
    318     phrase = dlist[e];
     322    phrase = elist[e];
    319323    get_phrase_freq_data(textdata, phrase, word, tf, ef, df);
     324
     325    split_phrase(word, body, prefix, suffix);
    320326   
    321327    if (XMLmode) {
    322328      cout << "<expansion num=\"" << e
    323329       << "\" id=\"" << phrase
     330       << "\" prefix=\"" << prefix
     331       << "\" suffix=\"" << suffix
    324332       << "\" text=\"" << word
    325333       << "\" tf=\"" << tf
    326334       << "\" df=\"" << df << "\"/>" << endl;
    327335    } else {
    328       cout << "<tr valign=top><td><a href='" << cgi_script << "?c=" << collection
    329        << "&n=" << phrase << "'>" << word << "</a>"
    330        << "</td><td>" << tf << "</td><td>" << df << "</td></tr>"
    331        << endl;
     336      cout << "<tr valign=top><td align=right><a href='" << cgi_script
     337       << "?c=" << collection << "&n=" << phrase << "'>" << prefix << "</a></td>"
     338       << "<td align=center><a href='" << cgi_script
     339       << "?c=" << collection << "&n=" << phrase << "'>" << body << "</a></td>"
     340       << "<td align=left><a href='" << cgi_script
     341       << "?c=" << collection << "&n=" << phrase << "'>" << suffix << "</a></td>"
     342       << "<td>" << tf << "</td><td>" << df << "</td></tr>" << endl;
    332343    }
    333344  }
     
    374385// The phrase is stored in textData as record phrase.
    375386// We retrieve:
    376 //   word - the text od the phrase
     387//   word - the text of the phrase
    377388//   tf - the total frequency of the phrase
    378389//   ef - the expansion frequency of the phrase
     
    648659      }
    649660     
     661      // d: the last document number
     662      else if (key[0] == 'd') {
     663    last_d = toLongInt(value);
     664      }
     665
     666      // e: the last expansion number
     667      else if (key[0] == 'e') {
     668    last_e = toLongInt(value);
     669      }
     670
     671      // f: the first document number
     672      else if (key[0] == 'f') {
     673    first_d = toLongInt(value);
     674      }
     675
     676      // g: the first expansion number
     677      else if (key[0] == 'g') {
     678    first_e = toLongInt(value);
     679      }
     680
     681      // x: XML mode
     682      else if (key[0] == 'x') {
     683    XMLmode = true;
     684      }
     685
    650686      // n: the phrase number
    651687      else if (key[0] == 'n') {
     
    656692      else if (key[0] == 'p') {
    657693    toUCArray(value, phrasetext);
    658       }
    659 
    660       // d: the last document number
    661       else if (key[0] == 'd') {
    662     last_d = toLongInt(value);
    663       }
    664 
    665       // e: the last expansion number
    666       else if (key[0] == 'e') {
    667     last_e = toLongInt(value);
    668       }
    669 
    670       // f: the first document number
    671       else if (key[0] == 'f') {
    672     first_d = toLongInt(value);
    673       }
    674 
    675       // g: the first expansion number
    676       else if (key[0] == 'g') {
    677     first_e = toLongInt(value);
    678       }
    679 
    680       // x: XML mode
    681       else if (key[0] == 'x') {
    682     XMLmode = true;
    683694      }
    684695
     
    747758
    748759
     760// split an expansion into prefix and suffix
     761
     762void split_phrase(UCArray word, UCArray body, UCArray &prefix, UCArray &suffix) {
     763
     764  prefix.clear();
     765  suffix.clear();
     766
     767  bool readingPrefix = true;
     768  UCArray::iterator here = word.begin();
     769  UCArray::iterator end = word.end();
     770 
     771  while (here != end) {
     772
     773    // if we've not read all the prefix, add the next char to the prefix
     774    if (readingPrefix) {
     775      if (phrase_match(body, here, end)) {
     776    readingPrefix = false;
     777    // trim whitespace from end of prefix & start of suffix
     778    if (!prefix.empty()) {
     779      prefix.pop_back();
     780    }
     781    while (*here == ' ') {
     782      here++;
     783    }
     784      } else {
     785    prefix.push_back(*here);
     786    here++;
     787      }
     788    }
     789    // if we've finished with the prefix, update the suffix
     790    else {
     791      suffix.push_back(*here);
     792      here++;
     793    }
     794  }
     795}
     796
     797// phrase_match
     798//
     799// compare two strings, one represented as an UCArray, the other as two
     800// UCArray iterators.
     801//
     802// Return true if the UCArray is the same as the phrase the iterator points
     803// too for the length of the UCArray.
     804
     805bool phrase_match(UCArray text, UCArray::iterator &here, UCArray::iterator end) {
     806
     807  UCArray::iterator one_here = text.begin();
     808  UCArray::iterator one_end  = text.end();
     809  UCArray::iterator two_here = here;
     810
     811  // iterate over the length of the first string, comparing each element to
     812  // the corresponding element in the second string.
     813  while (one_here != one_end) {
     814    if (*one_here != *two_here) {
     815      return false;
     816    }
     817    one_here++;
     818    two_here++;
     819  }
     820
     821  here = two_here;
     822  return true;
     823}
     824
     825
    749826// Convert from text_t format
    750827//
Note: See TracChangeset for help on using the changeset viewer.