Changeset 1619


Ignore:
Timestamp:
2000-10-27T09:23:55+13:00 (24 years ago)
Author:
paynter
Message:

Added an XML output mode. Fixed a bug reading the last item of the
document list. A few other fixes & some tidying.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/phind/host/phindcgi.cpp

    r1603 r1619  
    1 // phindcgi.cpp
    2 
    3 // The program itself reads request for phind data from STDIN,
    4 // looks up the phrase's characteristics in the mgpp files, and
    5 // reports output to STDOUT.
    6 
     1/**********************************************************************
     2 *
     3 * phindcgi.cpp -- cgi program to serve phind phrase hierarchies
     4 *
     5 * Copyright 2000 Gordon Paynter
     6 * Copyright 2000 The New Zealand Digital Library Project
     7 *
     8 *
     9 * A component of the Greenstone digital library software
     10 * from the New Zealand Digital Library Project at the
     11 * University of Waikato, New Zealand.
     12 *
     13 * This program is free software; you can redistribute it and/or modify
     14 * it under the terms of the GNU General Public License as published by
     15 * the Free Software Foundation; either version 2 of the License, or
     16 * (at your option) any later version.
     17 *
     18 * This program is distributed in the hope that it will be useful,
     19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     21 * GNU General Public License for more details.
     22 *
     23 * You should have received a copy of the GNU General Public License
     24 * along with this program; if not, write to the Free Software
     25 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
     26 *
     27 *********************************************************************/
     28
     29/*
     30 * phindcgi.cpp
     31 *
     32 * The program itself reads a request for a phrase's data from the
     33 * QUERY_STRING variable, looks up the phrase (if necessary) in the MGPP
     34 * pword database, then looks up the phrase's characteristics in the MGPP
     35 * pdata database, and reports output to STDOUT as crude HTML or XML.
     36 *
     37 */
    738
    839
     
    1546#include <vector.h>
    1647#include <algo.h>
    17 
    1848
    1949// Include MGPP functionality.
     
    3666            unsigned long &phrasenumber, UCArray &phrasetext,
    3767            unsigned long &first_e, unsigned long &last_e,
    38             unsigned long &first_d, unsigned long &last_d);
    39 
    40 void find_phrase_number_from_word(char *basepath, UCArray &query, DocNumArray &result);
    41 
    42 void print_word_tf_df(char *cgi_script, char *collection,
     68            unsigned long &first_d, unsigned long &last_d,
     69            bool &XMLmode);
     70
     71void print_expansions(char *cgi_script, char *collection, bool XMLmode,
    4372              TextData &textdata, vector <unsigned long> dlist,
    4473              unsigned long first, unsigned long last);
    4574
    46 void print_document_df(char *basepath, char *cgi_script, char *collection,
    47                vector <unsigned long> docNums,
    48                vector <unsigned long> docFreq,
    49                unsigned long first, unsigned long last);
    50  
     75void print_documents(bool XMLmode, char *basepath, char *cgi_script,
     76             char *collection,
     77             vector <unsigned long> docNums,
     78             vector <unsigned long> docFreq,
     79             unsigned long first, unsigned long last);
     80
     81void find_phrase_number_from_word(char *basepath, UCArray &query, DocNumArray &result);
     82
    5183void get_phrase_freq_data(TextData &textdata, unsigned long phrase,
    5284              UCArray &word, unsigned long &tf,
     
    5789             unsigned long &ef, unsigned long &df,
    5890             vector <unsigned long> &el,
    59              vector <unsigned long> &docnum, vector <unsigned long> &docfrq);
     91             vector <unsigned long> &docnum,
     92             vector <unsigned long> &docfrq);
    6093
    6194void get_document_all_data(TextData &docdata, unsigned long docNum,
     
    81114
    82115  // the number of occurrences to display
    83   unsigned long first_e, last_e, first_d, last_d;
    84  
     116  unsigned long first_e, last_e, count_e, first_d, last_d, count_d;
     117 
     118  // are we in XML mode (as opposed to HTML mode)
     119  bool XMLmode = false;
     120
    85121  // Read the gsdlsite.cfg file
    86122  char *gsdlhome = NULL;
     
    94130  char *collection;
    95131  text_tmap param;
    96   get_cgi_parameters(collection, phrase, word, first_e, last_e, first_d, last_d);
     132  get_cgi_parameters(collection, phrase, word,
     133             first_e, last_e, first_d, last_d, XMLmode);
    97134
    98135  if (collection == NULL) {
     
    133170
    134171
    135   // Output the HTML page
    136   cout << "Content-type: text/html" << endl << endl
    137        << "<html><head><title>" << word << "</title></head>" << endl
    138        << "<body><center>" << endl
    139        << "<p><h1>" << word << "</h1>" << endl
    140        << "<p><b>"<< word << "</b> occurs " << tf << " times in " << df << " documents" << endl;
     172  // Output the header
     173  if (XMLmode) {
     174    cout << "Content-type: text/plain" << endl << endl
     175     << "<phinddata id=\"" << phrase
     176     << "\" text=\"" << word
     177     << "\" df=\"" << df
     178     << "\" ef=\"" << ef
     179     << "\">" << endl;
     180  } else {
     181    cout << "Content-type: text/html" << endl << endl
     182     << "<html><head><title>" << word << "</title></head>" << endl
     183     << "<body><center>" << endl
     184     << "<p><h1>" << word << "</h1>" << endl
     185     << "<p><b>"<< word << "</b> occurs "
     186     << tf << " times in " << df << " documents" << endl;
     187  }
    141188 
    142189  // Output the expansions
     
    147194      last_e = el.size();
    148195    }
    149 
    150     if (last_e == el.size()) {
    151       cout << "<p><b> " << last_e << " expansions</b>" << endl;
    152     } else {
    153       cout << "<p><b>" << last_e << " of " << ef << " expansions</b>" << endl;
    154     }
    155 
    156     cout << "<p><table><tr><th align=left>Phrase</th><th>freq</th><th>docs</th></tr>" << endl;
    157     print_word_tf_df(argv[0], collection, textdata, el, first_e, last_e);
    158     cout << "</table>" << endl;
    159 
    160     if (last_e < el.size()) {
    161       cout << "<br><a href='" << argv[0]
    162        << "?c=" << collection << "&n=" << phrase
    163        << "&e=" << (last_e + 10) << "&d=" << last_d
    164        << "'>Get more phrases</a>"
    165        << endl
    166        << "<br><a href='" << argv[0]
    167        << "?c=" << collection << "&n=" << phrase
    168        << "&e=0&d=" << last_d
    169        << "'>Get every phrase</a>"
    170        << endl;
     196    count_e = last_e - first_e;
     197
     198    // output expansions as XML
     199    if (XMLmode) {
     200      cout << "<expansionlist length=\"" << ef
     201       << "\" start=\"" << first_e
     202       << "\" end=\"" << last_e << "\">" << endl;
     203
     204      print_expansions(argv[0], collection, XMLmode, textdata, el, first_e, last_e);
     205
     206
     207      cout << "</expansionlist>" << endl;
     208    }
     209
     210    // output expansions as HTML
     211    else {
     212      if (count_e == el.size()) {
     213    cout << "<p><b> " << count_e << " expansions</b>" << endl;
     214      } else {
     215    cout << "<p><b>" << count_e << " of " << ef << " expansions</b>" << endl;
     216      }
     217
     218      cout << "<p><table><tr><th align=left>Phrase</th><th>freq</th><th>docs</th></tr>" << endl;
     219      print_expansions(argv[0], collection, XMLmode, textdata, el, first_e, last_e);
     220      cout << "</table>" << endl;
     221
     222      if (last_e < el.size()) {
     223    cout << "<br><a href='" << argv[0]
     224         << "?c=" << collection << "&n=" << phrase
     225         << "&e=" << (last_e + 10) << "&d=" << last_d
     226         << "&g=" << first_e << "&f=" << first_d
     227         << "'>Get more phrases</a>"
     228         << endl
     229         << "<br><a href='" << argv[0]
     230         << "?c=" << collection << "&n=" << phrase
     231         << "&e=0&d=" << last_d
     232         << "&g=" << first_e << "&f=" << first_d
     233         << "'>Get every phrase</a>"
     234         << endl;
     235      }
    171236    }
    172237  }
     
    179244      last_d = docNums.size();
    180245    }
    181 
    182     if (last_d == docNums.size()) {
    183       cout << "<p><b> " << last_d << " documents</b>" << endl;
    184     } else {
    185       cout << "<p><b>" << last_d << " of " << df << " documents</b>" << endl;
    186     }
    187 
    188     cout << "<p><table><tr><th align=left>Document</th><th>freq</th></tr>" << endl;
    189     print_document_df(basepath, "library", collection, docNums, docfreq, first_d, last_d);
    190     cout << "</table>" << endl;
    191 
    192     if (last_d < docNums.size()) {
    193       cout << "<br><a href='" << argv[0]
    194        << "?c=" << collection << "&n=" << phrase
    195        << "&e=" << last_e << "&d=" << (last_d + 10)
    196        << "'>Get more documents</a>" << endl
    197        << "<br><a href='" << argv[0]
    198        << "?c=" << collection << "&n=" << phrase
    199        << "&e=" << last_e
    200        << "&d=0'>Get every document</a>" << endl;
    201     }
    202 
    203 
     246    count_d = last_d - first_d;
     247
     248    // output document list as XML
     249    if (XMLmode) {
     250      cout << "<documentlist length=\"" << df
     251       << "\" start=\"" << first_d
     252       << "\" end=\"" << last_d << "\">" << endl;
     253
     254      print_documents(XMLmode, basepath, "library", collection,
     255              docNums, docfreq, first_d, last_d);
     256
     257      cout << "</documentlist>" << endl;
     258    }
     259
     260    // output document list as HTML
     261    else {
     262
     263      if (count_d == docNums.size()) {
     264    cout << "<p><b> " << count_d << " documents</b>" << endl;
     265      } else {
     266    cout << "<p><b>" << count_d << " of " << df << " documents</b>" << endl;
     267      }
     268
     269      cout << "<p><table><tr><th align=left>Document</th><th>freq</th></tr>" << endl;
     270      print_documents(XMLmode, basepath, "library", collection,
     271              docNums, docfreq, first_d, last_d);
     272      cout << "</table>" << endl;
     273     
     274      if (last_d < docNums.size()) {
     275    cout << "<br><a href='" << argv[0]
     276         << "?c=" << collection << "&n=" << phrase
     277         << "&e=" << last_e << "&d=" << (last_d + 10)
     278         << "&g=" << first_e << "&f=" << first_d
     279         << "'>Get more documents</a>" << endl
     280         << "<br><a href='" << argv[0]
     281         << "?c=" << collection << "&n=" << phrase
     282         << "&e=" << last_e
     283         << "&g=" << first_e << "&f=" << first_d
     284         << "&d=0'>Get every document</a>" << endl;
     285      }
     286    }
     287  }
     288
     289  // Close the document
     290  if (XMLmode) {
     291    cout << "</phinddata>" << endl;
     292  } else {
    204293    cout << "</center></body></html>" << endl;
    205294  }
     
    215304// print out each of the words.
    216305
    217 void print_word_tf_df(char *cgi_script, char *collection,
     306void print_expansions(char *cgi_script, char *collection, bool XMLmode,
    218307              TextData &textdata, vector <unsigned long> dlist,
    219308              unsigned long first, unsigned long last) {
     
    227316    get_phrase_freq_data(textdata, phrase, word, tf, ef, df);
    228317   
    229 
    230     cout << "<tr valign=top><td><a href='" << cgi_script << "?c=" << collection
    231      << "&n=" << phrase << "'>" << word << "</a>"
    232      << "</td><td>" << tf << "</td><td>" << df << "</td></tr>"
    233      << endl;
    234   }
    235 }
    236 
    237 void print_document_df(char *basepath, char *cgi_script, char *collection, 
    238                vector <unsigned long> docNums, vector <unsigned long> docFreq,
    239                unsigned long first, unsigned long last) {
     318    if (XMLmode) {
     319      cout << "<expansion num=\"" << e
     320       << "\" id=\"" << phrase
     321       << "\" text=\"" << word
     322       << "\" tf=\"" << tf
     323       << "\" df=\"" << df << "\"/>" << endl;
     324    } else {
     325      cout << "<tr valign=top><td><a href='" << cgi_script << "?c=" << collection
     326       << "&n=" << phrase << "'>" << word << "</a>"
     327       << "</td><td>" << tf << "</td><td>" << df << "</td></tr>"
     328       << endl;
     329    }
     330  }
     331}
     332
     333void print_documents(bool XMLmode, char *basepath, char *cgi_script, char *collection, 
     334             vector <unsigned long> docNums, vector <unsigned long> docFreq,
     335             unsigned long first, unsigned long last) {
    240336 
    241337  // Create a TextData object to read the document data
     
    255351    get_document_all_data(docdata, doc, title, hash);
    256352
    257     cout << "<tr valign=top><td><a href='" << cgi_script << "?c=" << collection
    258      << "&a=d&d=" << hash << "'>" << title << "</a>"
    259      << "</td><td>" << freq << "</td></tr>"
    260      << endl;
     353    if (XMLmode) {
     354      cout << "<document num=\"" << d
     355       << "\" hash=\"" << hash
     356       << "\" freq=\"" << freq
     357       << "\" title=\"" << title << "\"/>" << endl;
     358    } else {
     359      cout << "<tr valign=top><td><a href='" << cgi_script << "?c=" << collection
     360       << "&a=d&d=" << hash << "'>" << title << "</a>"
     361       << "</td><td>" << freq << "</td></tr>"
     362       << endl;
     363    }
    261364  }
    262365}
     
    390493
    391494  // Get document list & the document frequency list
     495  while (text.back() == '\n') {
     496    text.pop_back();
     497  }
     498  text.push_back(';');
    392499  text.push_back(':');
    393500  docnum.clear();
     
    432539  // Look the word up in the textData
    433540  if (!GetDocText (docdata, docLevel, docNum, text)) {
    434     FatalError (1, "Error while trying to get phrase %u", docNum);
     541    FatalError (1, "Error while trying to get document %u", docNum);
    435542  }
    436543
     
    439546  while (*next++ != '\t');
    440547 
    441   // Get the title
     548  // Get the document OID (hash)
    442549  hash.clear();
    443550  for (; *next != '\t'; next++) {
     
    446553
    447554  // Get the title
    448   text.push_back('\t');
     555  text.push_back('\n');
    449556  title.clear();
    450   for (next++; *next != '\t'; next++) {
     557  for (next++; *next != '\n'; next++) {
    451558    title.push_back(*next);
    452559  }
     
    487594            unsigned long &phrasenumber, UCArray &phrasetext,
    488595            unsigned long &first_e, unsigned long &last_e,
    489             unsigned long &first_d, unsigned long &last_d) {
     596            unsigned long &first_d, unsigned long &last_d,
     597            bool &XMLmode) {
    490598
    491599
     
    538646     
    539647      // n: the phrase number
    540       if (key[0] == 'n') {
     648      else if (key[0] == 'n') {
    541649    phrasenumber = toLongInt(value);
    542650      }
     
    547655      }
    548656
     657      // d: the last document number
     658      else if (key[0] == 'd') {
     659    last_d = toLongInt(value);
     660      }
     661
    549662      // e: the last expansion number
    550       if (key[0] == 'e') {
     663      else if (key[0] == 'e') {
    551664    last_e = toLongInt(value);
    552665      }
    553666
    554       // d: the last document number
    555       if (key[0] == 'd') {
    556     last_d = toLongInt(value);
     667      // f: the first document number
     668      else if (key[0] == 'f') {
     669    first_d = toLongInt(value);
     670      }
     671
     672      // g: the first expansion number
     673      else if (key[0] == 'g') {
     674    first_e = toLongInt(value);
     675      }
     676
     677      // x: XML mode
     678      else if (key[0] == 'x') {
     679    XMLmode = true;
    557680      }
    558681
Note: See TracChangeset for help on using the changeset viewer.