root/main/trunk/greenstone2/runtime-src/src/recpt/phindaction.cpp @ 21973

Revision 21973, 29.5 KB (checked in by kjdon, 10 years ago)

Need to convert query between unicode and utf8 for mgpp

  • Property svn:keywords set to Author Date Id Revision
Line 
1/**********************************************************************
2 *
3 * phindaction.cpp --
4 *
5 * Copyright 2001 Gordon W. Paynter
6 * Copyright 2001 The New Zealand Digital Library Project
7 *
8 * A component of the Greenstone digital library software
9 * from the New Zealand Digital Library Project at the
10 * University of Waikato, New Zealand.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *
26 *********************************************************************/
27
28#include "gsdl_modules_cfg.h"
29#ifdef GSDL_USE_PHIND_ACTION
30
31// Note that this action uses mgpp to retrieve phind info, calling MGQuery
32// etc. directly, not through the protocol. This breaks our receptionist -
33// collection server separation and should be fixed some day I guess.
34
35#include "phindaction.h"
36#include "fileutil.h"
37#include "gsdlunicode.h"
38
39phindaction::phindaction () {
40
41  cgiarginfo arg_ainfo;
42
43  arg_ainfo.shortname = "pc";
44  arg_ainfo.longname = "phind classifier";
45  arg_ainfo.multiplechar = true;
46  arg_ainfo.defaultstatus = cgiarginfo::weak;
47  arg_ainfo.argdefault = g_EmptyText;
48  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
49  argsinfo.addarginfo (NULL, arg_ainfo);
50
51  arg_ainfo.shortname = "pxml";
52  arg_ainfo.longname = "phind XML mode";
53  arg_ainfo.multiplechar = false;
54  arg_ainfo.defaultstatus = cgiarginfo::weak;
55  arg_ainfo.argdefault = "0";
56  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
57  argsinfo.addarginfo (NULL, arg_ainfo);
58
59  arg_ainfo.shortname = "ppnum";
60  arg_ainfo.longname = "phind phrase number";
61  arg_ainfo.multiplechar = true;
62  arg_ainfo.defaultstatus = cgiarginfo::weak;
63  arg_ainfo.argdefault = "0";
64  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
65  argsinfo.addarginfo (NULL, arg_ainfo);
66
67  arg_ainfo.shortname = "pptext";
68  arg_ainfo.longname = "phind phrase text";
69  arg_ainfo.multiplechar = true;
70  arg_ainfo.defaultstatus = cgiarginfo::weak;
71  arg_ainfo.argdefault = g_EmptyText;
72  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
73  argsinfo.addarginfo (NULL, arg_ainfo);
74
75  arg_ainfo.shortname = "pfe";
76  arg_ainfo.longname = "phind first_e";
77  arg_ainfo.multiplechar = true;
78  arg_ainfo.defaultstatus = cgiarginfo::weak;
79  arg_ainfo.argdefault = "0";
80  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
81  argsinfo.addarginfo (NULL, arg_ainfo);
82
83  arg_ainfo.shortname = "ple";
84  arg_ainfo.longname = "phind last_e";
85  arg_ainfo.multiplechar = true;
86  arg_ainfo.defaultstatus = cgiarginfo::weak;
87  arg_ainfo.argdefault = "10";
88  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
89  argsinfo.addarginfo (NULL, arg_ainfo);
90
91  arg_ainfo.shortname = "pfl";
92  arg_ainfo.longname = "phind first_l";
93  arg_ainfo.multiplechar = true;
94  arg_ainfo.defaultstatus = cgiarginfo::weak;
95  arg_ainfo.argdefault = "0";
96  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
97  argsinfo.addarginfo (NULL, arg_ainfo);
98
99  arg_ainfo.shortname = "pll";
100  arg_ainfo.longname = "phind last_l";
101  arg_ainfo.multiplechar = true;
102  arg_ainfo.defaultstatus = cgiarginfo::weak;
103  arg_ainfo.argdefault = "10";
104  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
105  argsinfo.addarginfo (NULL, arg_ainfo);
106
107  arg_ainfo.shortname = "pfd";
108  arg_ainfo.longname = "phind first_d";
109  arg_ainfo.multiplechar = true;
110  arg_ainfo.defaultstatus = cgiarginfo::weak;
111  arg_ainfo.argdefault = "0";
112  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
113  argsinfo.addarginfo (NULL, arg_ainfo);
114
115  arg_ainfo.shortname = "pld";
116  arg_ainfo.longname = "phind last_d";
117  arg_ainfo.multiplechar = true;
118  arg_ainfo.defaultstatus = cgiarginfo::weak;
119  arg_ainfo.argdefault = "10";
120  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
121  argsinfo.addarginfo (NULL, arg_ainfo);
122}
123
124phindaction::~phindaction () {
125}
126
127void phindaction::get_cgihead_info (cgiargsclass &args, recptprotolistclass * /*protos*/,
128                    response_t &response,text_t &response_data,
129                    ostream &/*logout*/) {
130  response = content;
131  if (args["pxml"] == "1") {
132    response_data = "text/xml";
133  } else {
134    response_data = "text/html";
135  }
136}
137
138bool phindaction::do_action (cgiargsclass &args, recptprotolistclass *protos,
139                 browsermapclass * /*browsers*/, displayclass &disp,
140                 outconvertclass &outconvert, ostream &textout,
141                 ostream &logout) {
142
143  unsigned long count_l, count_e, count_d;
144  unsigned long phrase = args["ppnum"].getulong();
145  text_t &word = args["pptext"];
146  unsigned long first_e = args["pfe"].getulong();
147  unsigned long last_e = args["ple"].getulong();
148  unsigned long first_l = args["pfl"].getulong();
149  unsigned long last_l = args["pll"].getulong();
150  unsigned long first_d = args["pfd"].getulong();
151  unsigned long last_d = args["pld"].getulong();
152  bool XMLmode = false;
153  if (args["pxml"] == "1") XMLmode = true;
154
155  // must have a valid collection server
156  recptproto *collectproto = protos->getrecptproto (args["c"], logout);
157  if (collectproto == NULL) {
158    output_error("phindaction: ERROR: collection not set", textout,
159         outconvert, disp, logout, XMLmode);
160    return true;
161  }
162
163  // the frequency and occurances of the phrase
164  unsigned long tf;
165  vector <unsigned long> el, linkdest, docNums, docfreq;
166  vector <UCArray> linktype;
167
168  // the number of occurances to display
169  unsigned long ef, lf, df;
170 
171  text_t basepath = filename_cat(collecthome, args["c"],
172                 "index", "phind" + args["pc"]);
173
174  // If we don't know the phrase number, look it up
175  if (phrase == 0) {
176   
177    if (word.empty()) {
178      output_error("phindaction: ERROR: no phrase number or word", textout,
179           outconvert, disp, logout, XMLmode);
180      return true;
181    }
182
183    DocNumArray result;
184    /** In order to prevent browser crashing problems, any method which
185     *  previously suffered a silent fatal error, now instead returns false
186     *  to indicate a fatal error has occured. We can then dispatch an
187     *  appropriate error tag to the Phind applet (rather than leave it
188     *  whiling away the milliseconds until the end of existence - or at
189     *  least your browser - in an infinite loop!)
190     *  DLConsulting 12-07-2004
191     */
192   
193    if(!find_phrase_number_from_word(basepath, word, result)) {
194    output_error("phindaction: Fatal Error! Couldn't load index information in find_phrase_number_from_word()",
195             textout, outconvert, disp, logout, XMLmode);
196    return true;
197    }
198   
199    if (result.empty()) {
200      output_error("phindaction: The search term ("+word+") does not occur in the collection",
201           textout, outconvert, disp, logout, XMLmode);
202      return true;
203    } else {
204      phrase = result[0];
205    }
206  }
207
208  // Create a TextData object to read the phrase data (pdata)
209  TextData textdata;
210
211  text_t fullpath = filename_cat(basepath, "pdata");
212  char *fullpathc = fullpath.getcstr();
213#if defined __WIN32__
214  char *base = "";
215#else
216  char *base = "/";
217#endif
218
219  if (!textdata.LoadData (base, fullpathc)) {
220    //    FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
221    //exit(0);
222    /** We must return something to the client, whether this error is fatal or
223     *  no, otherwise we risk sending their browser into an infinite loop!
224     *  DLConsulting 12-07-2004
225     */
226    output_error("phindaction: Fatal Error! Couldn't load text information for collection",
227         textout, outconvert, disp, logout, XMLmode);
228    return true;
229  }
230
231  delete []fullpathc;
232
233  /** Another previously silent method can now cry out.
234   *  DLConsulting 12-07-2004
235   */
236  if(!get_phrase_all_data(textdata, phrase, word, tf, ef, lf, df, el,
237              linkdest, linktype, docNums, docfreq)) {
238    output_error(
239      "phindaction: Fatal Error! Couldn't parse phrase in get_phrase_all_data()",
240      textout, outconvert, disp, logout, XMLmode);
241      return true;   
242  }
243
244  // Output the header
245  if (XMLmode) {
246    textout << "<phinddata id=\"" << phrase
247        << "\" text=\"" << word
248        << "\" tf=\"" << tf
249        << "\" ef=\"" << ef
250        << "\" df=\"" << df
251        << "\" lf=\"" << lf
252        << "\">\n";
253  } else {
254    textout << "<html><head><title>" << word << "</title></head>\n"
255        << "<body><center>\n"
256        << "<p><h1>" << word << "</h1>\n"
257        << "<p><b>"<< word << "</b> occurs "
258        << tf << " times in " << df << " documents\n";
259  }
260
261  // Output the thesaurus links
262  if ((lf > 0) && (first_l < last_l)) {
263
264    // figure out the number of phrases to output
265    if (last_l > lf) {
266      last_l = lf;
267    }
268    count_l = last_l - first_l;
269   
270    if (XMLmode) {
271      textout << "<thesauruslist length=\"" << lf
272          << "\" start=\"" << first_l
273          << "\" end=\"" << last_l << "\">\n";
274      /** DLConsulting 12-07-2004 */
275      if(!print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
276                   first_l, last_l, disp, outconvert, textout)) {
277    output_error(
278             "phindaction: Fatal Error! Couldn't get phrase in get_phrase_freq_data()",
279             textout, outconvert, disp, logout, XMLmode);
280    return true;   
281      }
282      textout << "</thesauruslist>\n";
283    }
284
285    // output links as HTML
286    else {
287      if (count_l == lf) {
288    textout << "<p><b> " << count_l << " thesaurus links</b>\n";
289      } else {
290    textout << "<p><b>" << count_l << " of " << lf << " thesaurus links</b>\n";
291      }
292
293      textout << "<p><table border=1><tr><th>type</th><th>topic</th><th>freq</th><th>docs</th></tr>\n";
294      /** DLConsulting 12-07-2004 */
295      if(!print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
296                   first_l, last_l, disp, outconvert, textout)) {
297    output_error(
298             "phindaction: Fatal Error! Couldn't get phrase in get_phrase_freq_data()",
299             textout, outconvert, disp, logout, XMLmode);
300    return true;
301      }
302      textout << "</table>\n";
303
304      if (last_l < lf) {
305    if ((last_l + 10) < lf) {
306      textout << outconvert << disp
307          << "<br><a href=\"_gwcgi_?"
308          << "c=" << args["c"]
309          << "&ppnum=" << phrase
310          << "&pfe=" << first_e
311          << "&ple=" << last_e
312          << "&pfd=" << first_d
313          << "&pld=" << last_d
314          << "&pfl=" << first_l
315          << "&pll=" << (last_l + 10)
316          << "\">Get more thesaurus links</a>\n";
317    }
318    textout << outconvert << disp
319        << "<br><a href=\"_gwcgi_?"
320        << "c=" << args["c"]
321        << "&ppnum=" << phrase
322        << "&pfe=" << first_e
323        << "&ple=" << last_e
324        << "&pfd=" << first_d
325        << "&pld=" << last_d
326        << "&pfl=" << first_l
327        << "&pll=" << lf
328        << "\">Get every thesaurus link</a>\n" ;
329      }
330    }
331  }
332 
333  // Output the expansions
334  if ((ef > 0) && (first_e < last_e)) {
335
336    // figure out the number of phrases to output
337    if (last_e > el.size()) {
338      last_e = el.size();
339    }
340    count_e = last_e - first_e;
341
342    // output expansions as XML
343    if (XMLmode) {
344      textout << "<expansionlist length=\"" << ef
345          << "\" start=\"" << first_e
346          << "\" end=\"" << last_e << "\">" << endl;
347
348      print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
349               last_e, disp, outconvert, textout);
350
351      textout << "</expansionlist>\n";
352    }
353
354    // output expansions as HTML
355    else {
356      if (count_e == el.size()) {
357    textout << "<p><b> " << count_e << " expansions</b>\n";
358      } else {
359    textout << "<p><b>" << count_e << " of " << ef << " expansions</b>\n";
360      }
361
362      textout << "<p><table border=1><tr><th colspan=3>phrase</th><th>freq</th><th>docs</th></tr>\n";
363      print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
364               last_e, disp, outconvert, textout);
365      textout << "</table>\n";
366
367      if (last_e < ef) {
368    if ((last_e + 10) < ef) {
369      textout << outconvert << disp
370          << "<br><a href=\"_gwcgi_?"
371          << "c=" << args["c"]
372          << "&ppnum=" << phrase
373          << "&pfe=" << first_e
374          << "&ple=" << (last_e + 10)
375          << "&pfd=" << first_d
376          << "&pld=" << last_d
377          << "&pfl=" << first_l
378          << "&pll=" << last_l
379          << "\">Get more expansions</a>\n";
380    }
381    textout << outconvert << disp
382        << "<br><a href=\"_gwcgi_?"
383        << "c=" << args["c"]
384        << "&ppnum=" << phrase
385        << "&pfe=" << first_e
386        << "&ple=" << ef
387        << "&pfd=" << first_d
388        << "&pld=" << last_d
389        << "&pfl=" << first_l
390        << "&pll=" << last_l
391        << "\">Get every expansion</a>\n";
392      }
393    }
394  }
395
396  // Output the document occurances
397  if ((df > 0) && (first_d < last_d)) {
398
399    // figure out the phrases to output
400    if (last_d > docNums.size()) {
401      last_d = docNums.size();
402    }
403    count_d = last_d - first_d;
404
405    // output document list as XML
406    if (XMLmode) {
407      textout << "<documentlist length=\"" << df
408          << "\" start=\"" << first_d
409          << "\" end=\"" << last_d << "\">\n";
410     
411      if(!print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
412              first_d, last_d, disp, outconvert, textout)) {
413    output_error(
414             "phindaction: Fatal Error! Couldn't load text information in print_documents() or get_document_all_data()",
415             textout, outconvert, disp, logout, XMLmode);
416    return true;   
417      }
418
419      textout << "</documentlist>\n";
420    }
421
422    // output document list as HTML
423    else {
424     
425      if (count_d == docNums.size()) {
426    textout << "<p><b> " << count_d << " documents</b>\n";
427      } else {
428    textout << "<p><b>" << count_d << " of " << df << " documents</b>\n";
429      }
430
431      textout << "<p><table border=1><tr><th align=left>document</th><th>freq</th></tr>\n";
432      if(!print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
433              first_d, last_d, disp, outconvert, textout)) {
434    output_error(
435             "phindaction: Fatal Error! Couldn't load text information in print_documents()",
436             textout, outconvert, disp, logout, XMLmode);
437    return true;
438      }
439      textout << "</table>\n";
440     
441      if (last_d < df) {
442    if ((last_d + 10) < df) {
443      textout << outconvert << disp
444          << "<br><a href=\"_gwcgi_?"
445          << "c=" << args["c"]
446          << "&ppnum=" << phrase
447          << "&pfe=" << first_e
448          << "&ple=" << last_e
449          << "&pfd=" << first_d
450          << "&pld=" << (last_d + 10) 
451          << "&pfl=" << first_l
452          << "&pll=" << last_l
453          << "\">Get more documents</a>\n";
454    }
455    textout << outconvert << disp
456        << "<br><a href=\"_gwcgi_?"
457        << "c=" << args["c"]
458        << "&ppnum=" << phrase
459        << "&pfe=" << first_e
460        << "&ple=" << last_e
461        << "&pfd=" << first_d
462        << "&pld=" << df
463        << "&pfl=" << first_l
464        << "&pll=" << last_l
465        << "\">Get every document</a>\n";
466      }
467    }
468  }
469
470  // Close the document
471  if (XMLmode) {
472    textout << "</phinddata>\n";
473  } else {
474    textout << "</center></body></html>\n";
475  }
476
477  textdata.UnloadData ();
478
479  return true;
480}
481
482// Find the phrase number of a word in the index file
483bool phindaction::find_phrase_number_from_word(const text_t &basepath,
484                           const text_t &query,
485                           DocNumArray &result) {
486
487  // Open the index file for searching
488  IndexData indexData;
489
490  text_t fullpath = filename_cat(basepath, "pword");
491  char *fullpathc = fullpath.getcstr();
492#if defined __WIN32__
493  char *base = "";
494#else
495  char *base = "/";
496#endif
497
498  if (!indexData.LoadData (base, fullpathc)) {
499    //    FatalError (1, "Couldn't load index information for \"%s\"", fullpathc);
500    //exit(0);
501    /** Don't handle fatal errors here anymore.
502     *  DLConsulting 12-07-2004
503     */
504    return false; // Indicates something very bad has happened
505  }
506
507  delete []fullpathc;
508
509  // set up the query object
510  QueryInfo queryInfo;
511  SetCStr (queryInfo.docLevel, "Document", 8);
512  queryInfo.maxDocs = 5;
513  queryInfo.sortByRank = true;
514  queryInfo.exactWeights = false;
515  queryInfo.needRankInfo = true;
516  queryInfo.needTermFreqs = true;
517 
518  // mode 1 = casefolded, unstemmed search
519  UCArray ucquery;
520  // greenstone gives us the query encoded in unicode. We want utf8.
521  char* utf8querystring=to_utf8(query).getcstr();
522  SetCStr(ucquery, utf8querystring);
523  delete []utf8querystring;
524
525  //toUCArray(query, ucquery);
526  QueryNode *queryTree = ParseQuery(ucquery, 1, 1, 4);
527     
528  // perform the query
529  ExtQueryResult queryResult;
530  MGQuery (indexData, queryInfo, queryTree, queryResult);
531  // cout << "-- word lookup result -- " << endl << queryResult << endl ;
532
533  result.clear();
534  result = queryResult.docs;
535
536  // delete the query
537  if (queryTree != NULL) delete queryTree;
538
539  indexData.UnloadData();
540
541  /** This method now returns a boolean, so...
542   *  DLConsulting 12-07-2004
543   */
544  return true; // Indicates that what happened is all good, baby.
545}
546
547// Get all the data about a phrase
548//
549// The phrase is stored in textData as record phrase.
550// We retrieve:
551//   word - the text of the phrase
552//   tf - the total frequency of the phrase
553//   ef - the expansion frequency of the phrase
554//   lf - the thesaurus link frequency of the phrase
555//   df - the document frequency of the phrase
556//   el - the list of phrases that are expansions of phrase
557//   ll - the list of phrases that are thesaurus links
558//   dl - the list of documents that contain phrase
559bool phindaction::get_phrase_all_data(TextData &textdata, unsigned long phrase,
560                      text_t &word, unsigned long &tf, unsigned long &ef,
561                      unsigned long &lf, unsigned long &df,
562                      vector <unsigned long> &el,
563                      vector <unsigned long> &linkdest,
564                      vector <UCArray> &linktype,
565                      vector <unsigned long> &docnum,
566                      vector <unsigned long> &docfrq) {
567  UCArray text;
568  UCArray docLevel;
569  SetCStr(docLevel, "Document", 8);
570
571  // Look the word up in the textData
572  if (!GetDocText (textdata, docLevel, phrase, text)) {
573    //    FatalError (1, "Error while trying to get phrase %u", phrase);
574    //exit(0);
575    return false; // Something very bad has happened.
576  }
577
578  // Ignore everything up to the first colon
579  UCArray::iterator next = text.begin();
580  while (*next++ != ':');
581
582  // ignore training carriage returns
583  while (text.back() == '\n') {
584    text.pop_back();
585  }
586 
587  // Get the word
588  word.clear();
589  for (; *next != ':'; ++next) {
590    word.push_back(*next);
591  }
592 
593  // Get total frequency
594  tf = 0;
595  for (++next; *next != ':'; ++next) {
596    tf *= 10;
597    tf += (*next - '0');
598  }
599 
600  // Get expansion frequency
601  ef = 0;
602  for (++next; *next != ':'; ++next) {
603    ef *= 10;
604    ef += (*next - '0');
605  }
606 
607  // Get document frequency
608  df = 0;
609  for (++next; *next != ':'; ++next) {
610    df *= 10;
611    df += (*next - '0');
612  }
613 
614  // Get expansion list
615  el.clear();
616  unsigned long e = 0;
617  for (++next; *next != ':'; ++next) {
618    if (*next == ',') {
619      el.push_back(e);
620      e = 0;
621    } else {
622      e *= 10;
623      e += (*next - '0');
624    }
625  }
626
627  // Get document list & the document frequency list
628  docnum.clear();
629  docfrq.clear();
630  bool readnum = false;
631  unsigned long d = 0;
632  for (++next; *next != ':'; ++next) {
633    if (*next == ',') {
634      docnum.push_back(d);
635      readnum = true;
636      d = 0;
637    } else if (*next == ';') {
638      if (readnum) {
639    docfrq.push_back(d);
640      } else {
641    docnum.push_back(d);
642    docfrq.push_back(1);
643      }
644      readnum = false;
645      d = 0;
646    } else {
647      d *= 10;
648      d += (*next - '0');
649    }
650  }
651
652  // Get thesaurus link frequency & link list
653  text.push_back(':');
654  text.push_back(':');
655
656  // link frequency
657  lf = 0;
658  for (++next; *next != ':'; ++next) {
659    lf *= 10;
660    lf += (*next - '0');
661  }
662
663  // two lists of link data
664  linkdest.clear();
665  linktype.clear();
666 
667  UCArray thistype;
668  thistype.clear();
669  bool typedone = false;
670  unsigned long l = 0;
671  for (++next; *next != ':'; ++next) {
672   
673    if (!typedone) {
674      // first read the link type, a charactor string
675      if (*next == ',') {
676    typedone = true;
677      } else {
678    thistype.push_back(*next);
679      }
680    } else {
681      // having read the link type, read the list of link destinations
682      if (*next == ',') {
683    linkdest.push_back(l);
684    linktype.push_back(thistype);
685    l = 0;
686      } else if (*next == ';') {
687    linkdest.push_back(l);
688    linktype.push_back(thistype);
689    l = 0;
690    thistype.clear();
691    typedone = false;
692      } else {
693    l *= 10;
694    l += (*next - '0');
695      }
696    }
697  }
698
699  return true; // Indicates that what happened is all good, baby.
700}
701
702bool phindaction::print_thesaurus_links(const text_t &collection, bool XMLmode,
703                    TextData &textdata, vector <unsigned long> &linkdest,
704                    vector <UCArray> &linktype, unsigned long first,
705                    unsigned long last, displayclass &disp,
706                    outconvertclass &outconvert, ostream &textout) {
707
708  // information describing each link in the list
709  unsigned long phrase, tf, ef, df;
710  UCArray type, text;
711 
712  for (unsigned long l = first; l < last; ++l) {
713
714    // get the phrase data
715    phrase = linkdest[l];
716    type = linktype[l];
717
718    /** DLConsulting 12-07-2004 */
719    if(!get_phrase_freq_data(textdata, phrase, text, tf, ef, df)) {
720      return false;
721    }
722   
723    if (XMLmode) {
724      textout << "<thesaurus num=\"" << l
725          << "\" id=\"" << phrase
726          << "\" tf=\"" << tf
727          << "\" df=\"" << df
728          << "\" type=\"" << type
729          << "\" text=\"" << text
730          << "\"/>\n";
731    } else {
732      textout << "<tr valign=top><td>" << type << "</td><td>";
733      textout << outconvert << disp
734          << "<a href=\"_gwcgi_?c=" << collection;
735      textout << "&ppnum=" << phrase << "\">" << text << "</a>"
736          << "</td><td>" << tf << "</td><td>" << df << "</td></tr>\n";
737    }
738  }
739
740  /** DLConsulting 12-07-2004 */
741  return true;
742}
743
744// Get the frequency data about a phrase
745//
746// The phrase is stored in textData as record phrase.
747// We retrieve:
748//   word - the text of the phrase
749//   tf - the total frequency of the phrase
750//   ef - the expansion frequency of the phrase
751//   df - the document frequency of the phrase
752/**
753 *   Returns:
754 *     false if the method suffered a fatal error, true otherwise
755 */
756bool phindaction::get_phrase_freq_data(TextData &textdata, unsigned long phrase,
757                       UCArray &word, unsigned long &tf,
758                       unsigned long &ef, unsigned long &df) {
759 
760  UCArray text;
761  UCArray docLevel;
762  SetCStr(docLevel, "Document", 8);
763
764  // Look the word up in the textData
765  if (!GetDocText (textdata, docLevel, phrase, text)) {
766    //    FatalError (1, "Error while trying to get phrase %u", phrase);
767    //exit(0);
768    /** DLConsulting 12-07-2004 */
769    return false;
770  }
771
772  // Ignore everything up to the first colon
773  UCArray::iterator next = text.begin();
774  while (*next++ != ':');
775 
776  // Get the word
777  word.clear();
778  for (; *next != ':'; ++next) {
779    word.push_back(*next);
780  }
781 
782  // Get total frequency
783  tf = 0;
784  for (++next; *next != ':'; ++next) {
785    tf *= 10;
786    tf += (*next - '0');
787  }
788 
789  // Get expansion frequency
790  ef = 0;
791  for (++next; *next != ':'; ++next) {
792    ef *= 10;
793    ef += (*next - '0');
794  }
795 
796  // Get document frequency
797  df = 0;
798  for (++next; *next != ':'; ++next) {
799    df *= 10;
800    df += (*next - '0');
801  }
802
803  /** DLConsulting 12-07-2004 */
804  return true;
805}
806
807// Print a list of expansions
808//
809// Given the textData and a list of phrase numbers, print out each of the
810// expansions.
811void phindaction::print_expansions(const text_t &collection, bool XMLmode,
812                   const text_t &body, TextData &textdata,
813                   const vector <unsigned long> &elist,
814                   unsigned long first, unsigned long last,
815                   displayclass &disp, outconvertclass &outconvert,
816                   ostream &textout) {
817 
818  UCArray word;
819  unsigned long phrase, tf, df, ef;
820
821  UCArray suffix, prefix, ucbody;
822 
823  toUCArray(body, ucbody);
824
825  for (unsigned long e = first; e < last; ++e) {
826
827    phrase = elist[e];
828    get_phrase_freq_data(textdata, phrase, word, tf, ef, df);
829
830    split_phrase(word, ucbody, prefix, suffix);
831   
832    if (XMLmode) {
833      // body is always the same as the text of the phrase, so no need to send it
834      textout << "<expansion num=\"" << e
835          << "\" id=\"" << phrase
836          << "\" tf=\"" << tf
837          << "\" df=\"" << df;
838      if (!prefix.empty()) {
839    textout << "\" prefix=\"" << prefix;
840      }
841      if (!suffix.empty()) {
842    textout << "\" suffix=\"" << suffix;
843      }
844      textout << "\"/>\n";
845    } else {
846      textout << outconvert << disp
847          << "<tr valign=top><td align=right><a href=\"_gwcgi_?"
848          << "c=" << collection << "&ppnum=" << phrase << "\">";
849      textout << prefix << "</a></td>";
850      textout <<outconvert << disp
851          << "<td align=center><a href=\"_gwcgi_?"
852          << "c=" << collection << "&ppnum=" << phrase << "\">"
853          << body << "</a></td>"
854          << "<td align=left><a href=\"_gwcgi_?"
855          << "c=" << collection << "&ppnum=" << phrase << "\">";
856      textout << suffix << "</a></td>"
857          << "<td>" << tf << "</td><td>" << df << "</td></tr>\n";
858    }
859  }
860}
861
862// split an expansion into prefix and suffix
863void phindaction::split_phrase(const UCArray &word, const UCArray &body,
864                   UCArray &prefix, UCArray &suffix) {
865
866  prefix.clear();
867  suffix.clear();
868
869  bool readingPrefix = true;
870  UCArray::const_iterator here = word.begin();
871  UCArray::const_iterator end = word.end();
872 
873  while (here != end) {
874
875    // if we've not read all the prefix, add the next char to the prefix
876    if (readingPrefix) {
877      if (phrase_match(body, here, end)) {
878    readingPrefix = false;
879    // trim whitespace from end of prefix & start of suffix
880    if (!prefix.empty()) {
881      prefix.pop_back();
882    }
883    if ((here != end) && (*here == ' ')) {
884      ++here;
885    }
886      } else {
887    prefix.push_back(*here);
888    ++here;
889      }
890    }
891    // if we've finished with the prefix, update the suffix
892    else {
893      suffix.push_back(*here);
894      ++here;
895    }
896  }
897}
898
899// phrase_match
900//
901// compare two strings, one represented as an UCArray, the other as two
902// UCArray iterators.
903//
904// Return true if the UCArray is the same as the phrase the iterators point
905// to for the length of the UCArray.
906bool phindaction::phrase_match(const UCArray &text, UCArray::const_iterator &here,
907                   UCArray::const_iterator end) {
908
909  UCArray::const_iterator one_here = text.begin();
910  UCArray::const_iterator one_end  = text.end();
911  UCArray::const_iterator two_here = here;
912
913  // iterate over the length of the first string, comparing each element to
914  // the corresponding element in the second string.
915  while (one_here != one_end) {
916   
917      if (two_here == end) {
918      return false;
919      } else if (*one_here != *two_here) {
920      return false;
921      }
922      ++one_here;
923      ++two_here;
924  }
925
926  here = two_here;
927  return true;
928}
929
930bool phindaction::print_documents(bool XMLmode, const text_t &basepath,
931                  const text_t &collection,
932                  const vector <unsigned long> &docNums,
933                  const vector <unsigned long> &docFreq,
934                  unsigned long first, unsigned long last,
935                  displayclass &disp, outconvertclass &outconvert,
936                  ostream &textout) {
937 
938  // Create a TextData object to read the document data
939  TextData docdata;
940
941  text_t fullpath = filename_cat(basepath, "docs");
942  char *fullpathc = fullpath.getcstr();
943#if defined __WIN32__
944  char *base = "";
945#else
946  char *base = "/";
947#endif
948
949  if (!docdata.LoadData (base, fullpathc)) {
950    //    FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
951    //exit(0);
952    /** DLConsulting 12-07-2004 */
953    return false;
954  }
955
956  delete []fullpathc;
957
958  UCArray title, hash;
959  unsigned long freq, doc;
960
961  for (unsigned long d = first; d < last; ++d) {
962    doc = docNums[d];
963    freq = docFreq[d];
964
965    /** DLConsulting 13-07-2004 */
966    if(!get_document_all_data(docdata, doc, title, hash)) {
967      return false;
968    }
969
970    if (XMLmode) {
971      textout << "<document num=\"" << d
972          << "\" hash=\"" << hash
973          << "\" freq=\"" << freq
974          << "\" title=\"" << title << "\"/>\n";
975    } else {
976      textout << outconvert << disp
977          << "<tr valign=top><td><a href=\"_gwcgi_?"
978          << "c=" << collection;
979      textout << "&a=d&d=" << hash << "\">" << title << "</a>"
980          << "</td><td>" << freq << "</td></tr>\n";
981    }
982  }
983
984  docdata.UnloadData();
985
986  /** DLConsulting 12-07-2004 */
987  return true;
988}
989
990// Get all the data about a docment
991//
992// The document's details are stored in docData as record docNum.
993// We retrieve:
994//   title - the document's title
995//   hash - the document's unique OID
996/** Returns:
997 *   false if a fatal error occured, true otherwise
998 *  DLConsulting 12-07-2004
999 */
1000bool phindaction::get_document_all_data(TextData &docdata, unsigned long docNum,
1001                    UCArray &title, UCArray &hash) {
1002
1003  UCArray text;
1004  UCArray docLevel;
1005  SetCStr(docLevel, "Document", 8);
1006
1007  // Look the word up in the textData
1008  if (!GetDocText (docdata, docLevel, docNum, text)) {
1009    //    FatalError (1, "Error while trying to get document %u", docNum);
1010    //exit(0);
1011    /** DLConsulting 13-07-2004 */
1012    return false;
1013  }
1014
1015  // Ignore everything up to the first colon
1016  UCArray::iterator next = text.begin();
1017  while (*next++ != '\t');
1018 
1019  // Get the document OID (hash)
1020  hash.clear();
1021  for (; *next != '\t'; ++next) {
1022    hash.push_back(*next);
1023  }
1024
1025  // Get the title
1026  text.push_back('\n');
1027  title.clear();
1028  for (++next; *next != '\n'; ++next) {
1029    title.push_back(*next);
1030  }
1031
1032  /** DLConsulting 13-07-2004 */
1033  return true;
1034}
1035
1036void phindaction::toUCArray(const text_t &in, UCArray &out) {
1037  out.clear();
1038  if (out.capacity() < in.size() + 1) {
1039    out.reserve(in.size() + 1);
1040  }
1041  text_t::const_iterator here = in.begin();
1042  text_t::const_iterator end = in.end();
1043  while (here != end) {
1044    out.push_back((unsigned char) *here);
1045    ++here;
1046  }
1047}
1048
1049void phindaction::output_error (const text_t &message, ostream &textout,
1050                outconvertclass &outconvert,
1051                displayclass & disp, ostream &logout,
1052                bool XMLmode) {
1053
1054  logout << outconvert << message << "\n";
1055  if (XMLmode) {
1056    textout << outconvert
1057        << "<phinddata>\n"
1058        << "<phinderror>" << message << "</phinderror>\n"
1059        << "</phinddata>\n";
1060  } else {
1061    textout << outconvert << disp
1062        << "_header_\n"
1063        << message
1064        << "_footer_\n";
1065  }
1066}
1067
1068#endif //GSDL_USE_PHIND_ACTION
1069
Note: See TracBrowser for help on using the browser.