Context Navigation

← Previous Changeset
Next Changeset →

Changeset 1828

Timestamp:

2001-01-11T10:09:17+13:00 (23 years ago)

Author:

paynter

Message:

Handle the new phind.dm input format, which includes thesaurus data,
and output this thesaurus data when it is available.

File:

: 1 edited

trunk/gsdl/src/phind/host/phindcgi.cpp (modified) (21 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/src/phind/host/phindcgi.cpp

-              r1809
+              r1828
             unsigned long &phrasenumber, UCArray &phrasetext,
             unsigned long &first_e, unsigned long &last_e,
+            unsigned long &first_l, unsigned long &last_l,
             unsigned long &first_d, unsigned long &last_d,
             bool &XMLmode);
 …
               TextData &textdata, vector <unsigned long> elist,
               unsigned long first, unsigned long last);
+void print_thesaurus_links(char *cgi_script, char *collection,
+               bool XMLmode, UCArray body, TextData &textdata,
+               vector <unsigned long> &linkdest,
+               vector <UCArray> &linktype,
+               unsigned long first, unsigned long last);
 void print_documents(bool XMLmode, char *basepath, char *cgi_script,
 …
 void get_phrase_all_data(TextData &textdata, unsigned long phrase,
+             UCArray &word, unsigned long &tf,
+             unsigned long &ef, unsigned long &df,
+             vector <unsigned long> &el,
+             UCArray &word,
+             unsigned long &tf, unsigned long &ef,
+             unsigned long &lf, unsigned long &df,
+             vector <unsigned long> &el,
+             vector <unsigned long> &linkdest,
+             vector <UCArray> &linktype,
              vector <unsigned long> &docnum,
              vector <unsigned long> &docfrq);
 …
   // the frequency and occurrences of the phrase
+  unsigned long tf, ef, df;
+  vector <unsigned long> el, docNums, docfreq;
+  unsigned long tf;
+  vector <unsigned long> el, linkdest, docNums, docfreq;
+  vector <UCArray> linktype;
   // the number of occurrences to display
+  unsigned long first_e, last_e, count_e, first_d, last_d, count_d;
+  unsigned long ef, first_e, last_e, count_e,
+                lf, first_l, last_l, count_l,
+                df, first_d, last_d, count_d;
   // are we in XML mode (as opposed to HTML mode)
 …
   text_tmap param;
   get_cgi_parameters(collection, phrase, word,
              first_e, last_e, first_d, last_d, XMLmode);
+             first_e, last_e, first_l, last_l, first_d, last_d, XMLmode);
   if (collection == NULL) {
 …
     FatalError (1, "Couldn't load text information for \"%s\"", filename);
+  }
+  get_phrase_all_data(textdata, phrase, word, tf, ef, df, el, docNums, docfreq);
+  get_phrase_all_data(textdata, phrase, word, tf, ef, lf, df, el,
+              linkdest, linktype, docNums, docfreq);
 …
      << "\" text=\"" << word
      << "\" tf=\"" << tf
+     << "\" ef=\"" << ef
      << "\" df=\"" << df
      << "\" ef=\"" << ef
+     << "\" lf=\"" << lf
      << "\">" << endl;
   } else {
 …
      << tf << " times in " << df << " documents" << endl;
+  }
+  // Output the thesaurus links
+  if ((lf > 0) && (first_l < last_l)) {
+    // figure out the number of phrases to output
+    if (last_l > lf) {
+      last_l = lf;
+    }
+    count_l = last_l - first_l;
+    if (XMLmode) {
+      cout << "<thesauruslist length=\"" << lf
+       << "\" start=\"" << first_l
+       << "\" end=\"" << last_l << "\">" << endl;
+      print_thesaurus_links(argv[0], collection, XMLmode, word, textdata,
+                linkdest, linktype, first_l, last_l);
+      cout << "</thesauruslist>" << endl;
+    }
+    // output links as HTML
+    else {
+      if (count_l == lf) {
+    cout << "<p><b> " << count_l << " thesaurus links</b>" << endl;
+      } else {
+    cout << "<p><b>" << count_l << " of " << lf << " thesaurus links</b>" << endl;
+      }
+      cout << "<p><table border=1><tr><th>type</th><th>topic</th><th>freq</th><th>docs</th></tr>" << endl;
+      print_thesaurus_links(argv[0], collection, XMLmode, word, textdata,
+                linkdest, linktype, first_l, last_l);
+      cout << "</table>" << endl;
+      if (last_l < lf) {
+    if ((last_l + 10) < lf) {
+      cout << "<br><a href='" << argv[0]
+           << "?c=" << collection
+           << "&n=" << phrase
+           << "&e=" << first_e
+           << "&f=" << last_e
+           << "&h=" << first_d
+           << "&i=" << last_d
+           << "&k=" << first_l
+           << "&l=" << (last_l + 10)
+           << "'>Get more thesaurus links</a>"
+           << endl;
+    }
+    cout << "<br><a href='" << argv[0]
+         << "?c=" << collection
+         << "&n=" << phrase
+         << "&e=" << first_e
+         << "&f=" << last_e
+         << "&h=" << first_d
+         << "&i=" << last_d
+         << "&k=" << first_l
+         << "&l=" << lf
+         << "'>Get every thesaurus link</a>"
+         << endl;
+      }
+    }
+  }
   // Output the expansions
 …
+      }
       cout << "<p><table border=0><tr><th align=left>Phrase</th><th>freq</th><th>docs</th></tr>" << endl;
+      cout << "<p><table border=1><tr><th colspan=3>phrase</th><th>freq</th><th>docs</th></tr>" << endl;
       print_expansions(argv[0], collection, XMLmode, word, textdata, el, first_e, last_e);
       cout << "</table>" << endl;
+      if (last_e < el.size()) {
+      if (last_e < ef) {
+    if ((last_e + 10) < ef) {
+      cout << "<br><a href='" << argv[0]
+           << "?c=" << collection
+           << "&n=" << phrase
+           << "&e=" << first_e
+           << "&f=" << (last_e + 10)
+           << "&h=" << first_d
+           << "&i=" << last_d
+           << "&k=" << first_l
+           << "&l=" << last_l
+           << "'>Get more expansions</a>"
+           << endl;
+    }
     cout << "<br><a href='" << argv[0]
+         << "?c=" << collection << "&n=" << phrase
+         << "&e=" << (last_e + 10) << "&d=" << last_d
+         << "&g=" << first_e << "&f=" << first_d
+         << "'>Get more phrases</a>"
+         << endl
+         << "<br><a href='" << argv[0]
+         << "?c=" << collection << "&n=" << phrase
+         << "&e=" << el.size() << "&d=" << last_d
+         << "&g=" << first_e << "&f=" << first_d
+         << "'>Get every phrase</a>"
+         << "?c=" << collection
+         << "&n=" << phrase
+         << "&e=" << first_e
+         << "&f=" << ef
+         << "&h=" << first_d
+         << "&i=" << last_d
+         << "&k=" << first_l
+         << "&l=" << last_l
+         << "'>Get every expansion</a>"
          << endl;
+      }
 …
+      }
       cout << "<p><table><tr><th align=left>Document</th><th>freq</th></tr>" << endl;
+      cout << "<p><table border=1><tr><th align=left>document</th><th>freq</th></tr>" << endl;
       print_documents(XMLmode, basepath, "library", collection,
               docNums, docfreq, first_d, last_d);
       cout << "</table>" << endl;
+      if (last_d < docNums.size()) {
+      if (last_d < df) {
+    if ((last_d + 10) < df) {
+      cout << "<br><a href='" << argv[0]
+           << "?c=" << collection
+           << "&n=" << phrase
+           << "&e=" << first_e
+           << "&f=" << last_e
+           << "&h=" << first_d
+           << "&i=" << (last_d + 10)
+           << "&k=" << first_l
+           << "&l=" << last_l
+           << "'>Get more documents</a>" << endl;
+    }
     cout << "<br><a href='" << argv[0]
+         << "?c=" << collection << "&n=" << phrase
+         << "&e=" << last_e << "&d=" << (last_d + 10)
+         << "&g=" << first_e << "&f=" << first_d
+         << "'>Get more documents</a>" << endl
+         << "<br><a href='" << argv[0]
+         << "?c=" << collection << "&n=" << phrase
+         << "&g=" << first_e
+         << "&e=" << last_e
+         << "&f=" << first_d
+         << "&d=" << docNums.size()
+         << "?c=" << collection
+         << "&n=" << phrase
+         << "&e=" << first_e
+         << "&f=" << last_e
+         << "&h=" << first_d
+         << "&i=" << df
+         << "&k=" << first_l
+         << "&l=" << last_l
          << "'>Get every document</a>" << endl;
+      }
 …
       // body is always the same as the text of the phrase, so no need to send it
       cout << "<expansion num=\"" << e
+       << "\" id=\"" << phrase
+       << "\" prefix=\"" << prefix
+       << "\" suffix=\"" << suffix
+       << "\" id=\"" << phrase
        << "\" tf=\"" << tf
+       << "\" df=\"" << df << "\"/>" << endl;
+       << "\" df=\"" << df;
+      if (!prefix.empty()) {
+    cout << "\" prefix=\"" << prefix;
+      }
+      if (!suffix.empty()) {
+    cout << "\" suffix=\"" << suffix;
+      }
+      cout << "\"/>" << endl;
     } else {
       cout << "<tr valign=top><td align=right><a href='" << cgi_script
 …
+  }
+}
+void print_thesaurus_links(char *cgi_script, char *collection,  // writes one entry to cout per thesaurus link with index in [first, last)
+               bool XMLmode, UCArray body, TextData &textdata,  // XMLmode selects XML vs HTML output; body is not referenced in this function
+               vector <unsigned long> &linkdest,  // linkdest[l]: phrase number that link l points to
+               vector <UCArray> &linktype,  // linktype[l]: type label of link l (indexed in parallel with linkdest)
+               unsigned long first, unsigned long last) {  // half-open index range; both vectors are indexed below without a bounds check
+  // information describing each link in the list (ef is filled in but never printed; newbody, suffix and prefix are only named in the disabled split_phrase call)
+  unsigned long phrase, tf, ef, df;
+  UCArray type, text, newbody, suffix, prefix;
+  for (unsigned long l = first; l < last; l++) {
+    // get the phrase data: destination phrase number and link type for link l
+    phrase = linkdest[l];
+    type = linktype[l];
+    get_phrase_freq_data(textdata, phrase, text, tf, ef, df);  // helper defined elsewhere; presumably fills text, tf, ef and df for this phrase - confirm
+    // split_phrase(text, newbody, prefix, suffix);
+    if (XMLmode) {  // XML mode: one self-closing <thesaurus> element per link
+      cout << "<thesaurus num=\"" << l
+       << "\" id=\"" << phrase
+       << "\" tf=\"" << tf
+       << "\" df=\"" << df
+       << "\" type=\"" << type
+       << "\" text=\"" << text
+       << "\"/>" << endl;
+    } else {  // HTML mode: one table row (type, linked phrase text, tf, df); the link targets cgi_script with c=collection and n=phrase
+      cout << "<tr valign=top><td>" << type << "</td><td>"
+       << "<a href='" << cgi_script << "?c=" << collection
+       << "&n=" << phrase << "'>" << text << "</a>"
+       << "</td><td>" << tf << "</td><td>" << df << "</td></tr>" << endl;
+    }
+  }
+}
 void print_documents(bool XMLmode, char *basepath, char *cgi_script, char *collection,
 …
   // Look the word up in the textData
   if (!GetDocText (textdata, docLevel, phrase, text)) {
     FatalError (1, "Error while trying to get document %u", phrase);
+    FatalError (1, "Error while trying to get phrase %u", phrase);
+  }
 …
 // The phrase is stored in textData as record phrase.
 // We retrieve:
 //   word - the text od the phrase
+//   word - the text of the phrase
 //   tf - the total frequency of the phrase
 //   ef - the expansion frequency of the phrase
+//   lf - the thesaurus link frequency of the phrase
 //   df - the document frequency of the phrase
 //   el - the list of phrases that are expansions of phrase
+//   ll - the list of phrases that are thesaurus links
 //   dl - the list of documents that contain phrase
 void get_phrase_all_data(TextData &textdata, unsigned long phrase,
+             UCArray &word, unsigned long &tf,
+             unsigned long &ef, unsigned long &df,
+             vector <unsigned long> &el,
+             UCArray &word,
+             unsigned long &tf, unsigned long &ef,
+             unsigned long &lf, unsigned long &df,
+             vector <unsigned long> &el,
+             vector <unsigned long> &linkdest,
+             vector <UCArray> &linktype,
              vector <unsigned long> &docnum,
              vector <unsigned long> &docfrq) {
 …
   UCArray::iterator next = text.begin();
   while (*next++ != ':');
+  // ignore trailing newlines
+  while (text.back() == '\n') {
+    text.pop_back();
+  }
   // Get the word
 …
+    }
+  }
-  el.push_back(e);
   // Get document list & the document frequency list
-  while (text.back() == '\n') {
-    text.pop_back();
+  }
-  text.push_back(';');
-  text.push_back(':');
   docnum.clear();
   docfrq.clear();
 …
+    }
+  }
+  // Get thesaurus link frequency & link list
+  text.push_back(':');
+  text.push_back(':');
+  // link frequency
+  lf = 0;
+  for (next++; *next != ':'; next++) {
+    lf *= 10;
+    lf += (*next - '0');
+  }
+  // two lists of link data
+  linkdest.clear();
+  linktype.clear();
+  UCArray thistype;
+  thistype.clear();
+  bool typedone = false;
+  unsigned long l = 0;
+  for (next++; *next != ':'; next++) {
+    if (!typedone) {
+      // first read the link type, a character string
+      if (*next == ',') {
+    typedone = true;
+      } else {
+    thistype.push_back(*next);
+      }
+    } else {
+      // having read the link type, read the list of link destinations
+      if (*next == ',') {
+    linkdest.push_back(l);
+    linktype.push_back(thistype);
+    l = 0;
+      } else if (*next == ';') {
+    linkdest.push_back(l);
+    linktype.push_back(thistype);
+    l = 0;
+    thistype.clear();
+    typedone = false;
+      } else {
+    l *= 10;
+    l += (*next - '0');
+      }
+    }
+  }
+}
 …
             unsigned long &phrasenumber, UCArray &phrasetext,
             unsigned long &first_e, unsigned long &last_e,
+            unsigned long &first_l, unsigned long &last_l,
             unsigned long &first_d, unsigned long &last_d,
             bool &XMLmode) {
 …
   first_e = 0;
   last_e = 10;
+  first_l = 0;
+  last_l = 10;
   first_d = 0;
   last_d = 10;
 …
+      }
+      // d: the last document number
+      else if (key[0] == 'd') {
+      // e: the first expansion number
+      else if (key[0] == 'e') {
+    first_e = toLongInt(value);
+      }
+      // f: the last expansion number
+      else if (key[0] == 'f') {
+    last_e = toLongInt(value);
+      }
+      // h: the first document number
+      else if (key[0] == 'h') {
+    first_d = toLongInt(value);
+      }
+      // i: the last document number
+      else if (key[0] == 'i') {
     last_d = toLongInt(value);
+      }
+      // e: the last expansion number
+      else if (key[0] == 'e') {
+    last_e = toLongInt(value);
+      }
+      // f: the first document number
+      else if (key[0] == 'f') {
+    first_d = toLongInt(value);
+      }
+      // g: the first expansion number
+      else if (key[0] == 'g') {
+    first_e = toLongInt(value);
+      }
+      // x: XML mode
+      else if (key[0] == 'x') {
+    XMLmode = true;
+      // k: the first thesaurus list number
+      else if (key[0] == 'k') {
+    first_l = toLongInt(value);
+      }
+      // l: the last thesaurus list number
+      else if (key[0] == 'l') {
+    last_l = toLongInt(value);
+      }
 …
       else if (key[0] == 'p') {
     toUCArray(value, phrasetext);
+      }
+      // x: XML mode
+      else if (key[0] == 'x') {
+    XMLmode = true;
+      }

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 1828

Legend:

trunk/gsdl/src/phind/host/phindcgi.cpp

Download in other formats: