Changeset 1828


Ignore:
Timestamp:
2001-01-11T10:09:17+13:00 (23 years ago)
Author:
paynter
Message:

Handle the new phind.dm input format, which includes thesaurus data,
and output this thesaurus data when it is available.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/phind/host/phindcgi.cpp

    r1809 r1828  
    6666            unsigned long &phrasenumber, UCArray &phrasetext,
    6767            unsigned long &first_e, unsigned long &last_e,
     68            unsigned long &first_l, unsigned long &last_l,
    6869            unsigned long &first_d, unsigned long &last_d,
    6970            bool &XMLmode);
     
    7273              TextData &textdata, vector <unsigned long> elist,
    7374              unsigned long first, unsigned long last);
     75
     76void print_thesaurus_links(char *cgi_script, char *collection,
     77               bool XMLmode, UCArray body, TextData &textdata,
     78               vector <unsigned long> &linkdest,
     79               vector <UCArray> &linktype,
     80               unsigned long first, unsigned long last);
    7481
    7582void print_documents(bool XMLmode, char *basepath, char *cgi_script,
     
    8693
    8794void get_phrase_all_data(TextData &textdata, unsigned long phrase,
    88              UCArray &word, unsigned long &tf,
    89              unsigned long &ef, unsigned long &df,
    90              vector <unsigned long> &el,
     95             UCArray &word,
     96             unsigned long &tf, unsigned long &ef,
     97             unsigned long &lf, unsigned long &df,
     98             vector <unsigned long> &el,
     99             vector <unsigned long> &linkdest,
     100             vector <UCArray> &linktype,
    91101             vector <unsigned long> &docnum,
    92102             vector <unsigned long> &docfrq);
     
    113123
    114124  // the frequency and occurrences of the phrase
    115   unsigned long tf, ef, df;
    116   vector <unsigned long> el, docNums, docfreq;
     125  unsigned long tf;
     126  vector <unsigned long> el, linkdest, docNums, docfreq;
     127  vector <UCArray> linktype;
    117128
    118129  // the number of occurrences to display
    119   unsigned long first_e, last_e, count_e, first_d, last_d, count_d;
     130  unsigned long ef, first_e, last_e, count_e,
     131                lf, first_l, last_l, count_l,
     132                df, first_d, last_d, count_d;
    120133 
    121134  // are we in XML mode (as opposed to HTML mode)
     
    134147  text_tmap param;
    135148  get_cgi_parameters(collection, phrase, word,
    136              first_e, last_e, first_d, last_d, XMLmode);
     149             first_e, last_e, first_l, last_l, first_d, last_d, XMLmode);
    137150
    138151  if (collection == NULL) {
     
    170183    FatalError (1, "Couldn't load text information for \"%s\"", filename);
    171184  }
    172   get_phrase_all_data(textdata, phrase, word, tf, ef, df, el, docNums, docfreq);
     185  get_phrase_all_data(textdata, phrase, word, tf, ef, lf, df, el,
     186              linkdest, linktype, docNums, docfreq);
    173187
    174188
     
    179193     << "\" text=\"" << word
    180194     << "\" tf=\"" << tf
     195     << "\" ef=\"" << ef
    181196     << "\" df=\"" << df
    182      << "\" ef=\"" << ef
     197     << "\" lf=\"" << lf
    183198     << "\">" << endl;
    184199  } else {
     
    190205     << tf << " times in " << df << " documents" << endl;
    191206  }
     207
     208
     209  // Output the thesaurus links
     210  if ((lf > 0) && (first_l < last_l)) {
     211
     212    // figure out the number of phrases to output
     213    if (last_l > lf) {
     214      last_l = lf;
     215    }
     216    count_l = last_l - first_l;
     217   
     218    if (XMLmode) {
     219      cout << "<thesauruslist length=\"" << lf
     220       << "\" start=\"" << first_l
     221       << "\" end=\"" << last_l << "\">" << endl;
     222      print_thesaurus_links(argv[0], collection, XMLmode, word, textdata,
     223                linkdest, linktype, first_l, last_l);
     224      cout << "</thesauruslist>" << endl;
     225    }
     226
     227    // output links as HTML
     228    else {
     229      if (count_l == lf) {
     230    cout << "<p><b> " << count_l << " thesaurus links</b>" << endl;
     231      } else {
     232    cout << "<p><b>" << count_l << " of " << lf << " thesaurus links</b>" << endl;
     233      }
     234
     235      cout << "<p><table border=1><tr><th>type</th><th>topic</th><th>freq</th><th>docs</th></tr>" << endl;
     236      print_thesaurus_links(argv[0], collection, XMLmode, word, textdata,
     237                linkdest, linktype, first_l, last_l);
     238     
     239      cout << "</table>" << endl;
     240
     241      if (last_l < lf) {
     242    if ((last_l + 10) < lf) {
     243      cout << "<br><a href='" << argv[0]
     244           << "?c=" << collection
     245           << "&n=" << phrase
     246           << "&e=" << first_e
     247           << "&f=" << last_e
     248           << "&h=" << first_d
     249           << "&i=" << last_d
     250           << "&k=" << first_l
     251           << "&l=" << (last_l + 10)
     252           << "'>Get more thesaurus links</a>"
     253           << endl;
     254    }
     255    cout << "<br><a href='" << argv[0]
     256         << "?c=" << collection
     257         << "&n=" << phrase
     258         << "&e=" << first_e
     259         << "&f=" << last_e
     260         << "&h=" << first_d
     261         << "&i=" << last_d
     262         << "&k=" << first_l
     263         << "&l=" << lf
     264         << "'>Get every thesaurus link</a>"
     265         << endl;
     266      }
     267    }
     268   
     269  }
    192270 
    193271  // Output the expansions
     
    219297      }
    220298
    221       cout << "<p><table border=0><tr><th align=left>Phrase</th><th>freq</th><th>docs</th></tr>" << endl;
     299      cout << "<p><table border=1><tr><th colspan=3>phrase</th><th>freq</th><th>docs</th></tr>" << endl;
    222300      print_expansions(argv[0], collection, XMLmode, word, textdata, el, first_e, last_e);
    223301      cout << "</table>" << endl;
    224302
    225       if (last_e < el.size()) {
     303      if (last_e < ef) {
     304    if ((last_e + 10) < ef) {
     305      cout << "<br><a href='" << argv[0]
     306           << "?c=" << collection
     307           << "&n=" << phrase
     308           << "&e=" << first_e
     309           << "&f=" << (last_e + 10)
     310           << "&h=" << first_d
     311           << "&i=" << last_d
     312           << "&k=" << first_l
     313           << "&l=" << last_l
     314           << "'>Get more expansions</a>"
     315           << endl;
     316    }
    226317    cout << "<br><a href='" << argv[0]
    227          << "?c=" << collection << "&n=" << phrase
    228          << "&e=" << (last_e + 10) << "&d=" << last_d
    229          << "&g=" << first_e << "&f=" << first_d
    230          << "'>Get more phrases</a>"
    231          << endl
    232          << "<br><a href='" << argv[0]
    233          << "?c=" << collection << "&n=" << phrase
    234          << "&e=" << el.size() << "&d=" << last_d
    235          << "&g=" << first_e << "&f=" << first_d
    236          << "'>Get every phrase</a>"
     318         << "?c=" << collection
     319         << "&n=" << phrase
     320         << "&e=" << first_e
     321         << "&f=" << ef
     322         << "&h=" << first_d
     323         << "&i=" << last_d
     324         << "&k=" << first_l
     325         << "&l=" << last_l
     326         << "'>Get every expansion</a>"
    237327         << endl;
    238328      }
     
    270360      }
    271361
    272       cout << "<p><table><tr><th align=left>Document</th><th>freq</th></tr>" << endl;
     362      cout << "<p><table border=1><tr><th align=left>document</th><th>freq</th></tr>" << endl;
    273363      print_documents(XMLmode, basepath, "library", collection,
    274364              docNums, docfreq, first_d, last_d);
    275365      cout << "</table>" << endl;
    276366     
    277       if (last_d < docNums.size()) {
     367      if (last_d < df) {
     368    if ((last_d + 10) < df) {
     369      cout << "<br><a href='" << argv[0]
     370           << "?c=" << collection
     371           << "&n=" << phrase
     372           << "&e=" << first_e
     373           << "&f=" << last_e
     374           << "&h=" << first_d
     375           << "&i=" << (last_d + 10) 
     376           << "&k=" << first_l
     377           << "&l=" << last_l
     378           << "'>Get more documents</a>" << endl;
     379    }
    278380    cout << "<br><a href='" << argv[0]
    279          << "?c=" << collection << "&n=" << phrase
    280          << "&e=" << last_e << "&d=" << (last_d + 10)
    281          << "&g=" << first_e << "&f=" << first_d
    282          << "'>Get more documents</a>" << endl
    283          << "<br><a href='" << argv[0]
    284          << "?c=" << collection << "&n=" << phrase
    285          << "&g=" << first_e
    286          << "&e=" << last_e
    287          << "&f=" << first_d
    288          << "&d=" << docNums.size()
     381         << "?c=" << collection
     382         << "&n=" << phrase
     383         << "&e=" << first_e
     384         << "&f=" << last_e
     385         << "&h=" << first_d
     386         << "&i=" << df
     387         << "&k=" << first_l
     388         << "&l=" << last_l
    289389         << "'>Get every document</a>" << endl;
    290390      }
     
    328428      // body is always the same as the text of the phrase, so no need to send it
    329429      cout << "<expansion num=\"" << e
    330        << "\" id=\"" << phrase
    331        << "\" prefix=\"" << prefix
    332        << "\" suffix=\"" << suffix
     430       << "\" id=\"" << phrase
    333431       << "\" tf=\"" << tf
    334        << "\" df=\"" << df << "\"/>" << endl;
     432       << "\" df=\"" << df;
     433      if (!prefix.empty()) {
     434    cout << "\" prefix=\"" << prefix;
     435      }
     436      if (!suffix.empty()) {
     437    cout << "\" suffix=\"" << suffix;
     438      }
     439      cout << "\"/>" << endl;
    335440    } else {
    336441      cout << "<tr valign=top><td align=right><a href='" << cgi_script
     
    344449  }
    345450}
     451
     452void print_thesaurus_links(char *cgi_script, char *collection,
     453               bool XMLmode, UCArray body, TextData &textdata,
     454               vector <unsigned long> &linkdest,
     455               vector <UCArray> &linktype,
     456               unsigned long first, unsigned long last) {
     457
     458  // information describing each link in the list
     459  unsigned long phrase, tf, ef, df;
     460  UCArray type, text, newbody, suffix, prefix;
     461 
     462  for (unsigned long l = first; l < last; l++) {
     463
     464    // get the phrase data
     465    phrase = linkdest[l];
     466    type = linktype[l];
     467    get_phrase_freq_data(textdata, phrase, text, tf, ef, df);
     468    // split_phrase(text, newbody, prefix, suffix);
     469   
     470    if (XMLmode) {
     471      cout << "<thesaurus num=\"" << l
     472       << "\" id=\"" << phrase
     473       << "\" tf=\"" << tf
     474       << "\" df=\"" << df
     475       << "\" type=\"" << type
     476       << "\" text=\"" << text
     477       << "\"/>" << endl;
     478    } else {
     479      cout << "<tr valign=top><td>" << type << "</td><td>"
     480       << "<a href='" << cgi_script << "?c=" << collection
     481       << "&n=" << phrase << "'>" << text << "</a>"
     482       << "</td><td>" << tf << "</td><td>" << df << "</td></tr>" << endl;
     483    }
     484  }
     485}
     486
    346487
    347488void print_documents(bool XMLmode, char *basepath, char *cgi_script, char *collection, 
     
    400541  // Look the word up in the textData
    401542  if (!GetDocText (textdata, docLevel, phrase, text)) {
    402     FatalError (1, "Error while trying to get document %u", phrase);
     543    FatalError (1, "Error while trying to get phrase %u", phrase);
    403544  }
    404545
     
    439580// The phrase is stored in textData as record phrase.
    440581// We retrieve:
    441 //   word - the text od the phrase
     582//   word - the text of the phrase
    442583//   tf - the total frequency of the phrase
    443584//   ef - the expansion frequency of the phrase
     585//   lf - the thesaurus link frequency of the phrase
    444586//   df - the document frequency of the phrase
    445587//   el - the list of phrases that are expansions of phrase
     588//   ll - the list of phrases that are thesaurus links
    446589//   dl - the list of documents that contain phrase
    447590
    448591void get_phrase_all_data(TextData &textdata, unsigned long phrase,
    449              UCArray &word, unsigned long &tf,
    450              unsigned long &ef, unsigned long &df,
    451              vector <unsigned long> &el,
     592             UCArray &word,
     593             unsigned long &tf, unsigned long &ef,
     594             unsigned long &lf, unsigned long &df,
     595             vector <unsigned long> &el,
     596             vector <unsigned long> &linkdest,
     597             vector <UCArray> &linktype,
    452598             vector <unsigned long> &docnum,
    453599             vector <unsigned long> &docfrq) {
     
    464610  UCArray::iterator next = text.begin();
    465611  while (*next++ != ':');
     612
     613  // ignore trailing newlines
     614  while (text.back() == '\n') {
     615    text.pop_back();
     616  }
    466617 
    467618  // Get the word
     
    504655    }
    505656  }
    506   el.push_back(e);
    507657
    508658  // Get document list & the document frequency list
    509   while (text.back() == '\n') {
    510     text.pop_back();
    511   }
    512   text.push_back(';');
    513   text.push_back(':');
    514659  docnum.clear();
    515660  docfrq.clear();
     
    535680    }
    536681  }
     682
     683  // Get thesaurus link frequency & link list
     684  text.push_back(':');
     685  text.push_back(':');
     686
     687  // link frequency
     688  lf = 0;
     689  for (next++; *next != ':'; next++) {
     690    lf *= 10;
     691    lf += (*next - '0');
     692  }
     693
     694  // two lists of link data
     695  linkdest.clear();
     696  linktype.clear();
     697 
     698  UCArray thistype;
     699  thistype.clear();
     700  bool typedone = false;
     701  unsigned long l = 0;
     702  for (next++; *next != ':'; next++) {
     703   
     704    if (!typedone) {
     705      // first read the link type, a character string
     706      if (*next == ',') {
     707    typedone = true;
     708      } else {
     709    thistype.push_back(*next);
     710      }
     711    } else {
     712      // having read the link type, read the list of link destinations
     713      if (*next == ',') {
     714    linkdest.push_back(l);
     715    linktype.push_back(thistype);
     716    l = 0;
     717      } else if (*next == ';') {
     718    linkdest.push_back(l);
     719    linktype.push_back(thistype);
     720    l = 0;
     721    thistype.clear();
     722    typedone = false;
     723      } else {
     724    l *= 10;
     725    l += (*next - '0');
     726      }
     727    }
     728  }
    537729}
    538730
     
    608800            unsigned long &phrasenumber, UCArray &phrasetext,
    609801            unsigned long &first_e, unsigned long &last_e,
     802            unsigned long &first_l, unsigned long &last_l,
    610803            unsigned long &first_d, unsigned long &last_d,
    611804            bool &XMLmode) {
     
    617810  first_e = 0;
    618811  last_e = 10;
     812  first_l = 0;
     813  last_l = 10;
    619814  first_d = 0;
    620815  last_d = 10;
     
    659854      }
    660855     
    661       // d: the last document number
    662       else if (key[0] == 'd') {
     856      // e: the first expansion number
     857      else if (key[0] == 'e') {
     858    first_e = toLongInt(value);
     859      }
     860
     861      // f: the last expansion number
     862      else if (key[0] == 'f') {
     863    last_e = toLongInt(value);
     864      }
     865
     866      // h: the first document number
     867      else if (key[0] == 'h') {
     868    first_d = toLongInt(value);
     869      }
     870
     871      // i: the last document number
     872      else if (key[0] == 'i') {
    663873    last_d = toLongInt(value);
    664874      }
    665875
    666       // e: the last expansion number
    667       else if (key[0] == 'e') {
    668     last_e = toLongInt(value);
    669       }
    670 
    671       // f: the first document number
    672       else if (key[0] == 'f') {
    673     first_d = toLongInt(value);
    674       }
    675 
    676       // g: the first expansion number
    677       else if (key[0] == 'g') {
    678     first_e = toLongInt(value);
    679       }
    680 
    681       // x: XML mode
    682       else if (key[0] == 'x') {
    683     XMLmode = true;
     876      // k: the first thesaurus list number
     877      else if (key[0] == 'k') {
     878    first_l = toLongInt(value);
     879      }
     880
     881      // l: the last thesaurus list number
     882      else if (key[0] == 'l') {
     883    last_l = toLongInt(value);
    684884      }
    685885
     
    692892      else if (key[0] == 'p') {
    693893    toUCArray(value, phrasetext);
     894      }
     895
     896      // x: XML mode
     897      else if (key[0] == 'x') {
     898    XMLmode = true;
    694899      }
    695900
Note: See TracChangeset for help on using the changeset viewer.