root/gsdl/trunk/src/recpt/phindaction.cpp @ 16310

Revision 16310, 29.3 KB (checked in by davidb, 12 years ago)

Introduction of 'collecthome' which parallels 'gsdlhome' to allow the toplevel collect folder to be outside of the gsdlhome area

  • Property svn:keywords set to Author Date Id Revision
Line 
1/**********************************************************************
2 *
3 * phindaction.cpp --
4 *
5 * Copyright 2001 Gordon W. Paynter
6 * Copyright 2001 The New Zealand Digital Library Project
7 *
8 * A component of the Greenstone digital library software
9 * from the New Zealand Digital Library Project at the
10 * University of Waikato, New Zealand.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *
26 *********************************************************************/
27
28#include "gsdl_modules_cfg.h"
29#ifdef GSDL_USE_PHIND_ACTION
30
31// Note that this action uses mgpp to retrieve phind info, calling MGQuery
32// etc. directly, not through the protocol. This breaks our receptionist -
33// collection server separation and should be fixed some day I guess.
34
35#include "phindaction.h"
36#include "fileutil.h"
37
38phindaction::phindaction () {
39
40  cgiarginfo arg_ainfo;
41
42  arg_ainfo.shortname = "pc";
43  arg_ainfo.longname = "phind classifier";
44  arg_ainfo.multiplechar = true;
45  arg_ainfo.defaultstatus = cgiarginfo::weak;
46  arg_ainfo.argdefault = g_EmptyText;
47  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
48  argsinfo.addarginfo (NULL, arg_ainfo);
49
50  arg_ainfo.shortname = "pxml";
51  arg_ainfo.longname = "phind XML mode";
52  arg_ainfo.multiplechar = false;
53  arg_ainfo.defaultstatus = cgiarginfo::weak;
54  arg_ainfo.argdefault = "0";
55  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
56  argsinfo.addarginfo (NULL, arg_ainfo);
57
58  arg_ainfo.shortname = "ppnum";
59  arg_ainfo.longname = "phind phrase number";
60  arg_ainfo.multiplechar = true;
61  arg_ainfo.defaultstatus = cgiarginfo::weak;
62  arg_ainfo.argdefault = "0";
63  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
64  argsinfo.addarginfo (NULL, arg_ainfo);
65
66  arg_ainfo.shortname = "pptext";
67  arg_ainfo.longname = "phind phrase text";
68  arg_ainfo.multiplechar = true;
69  arg_ainfo.defaultstatus = cgiarginfo::weak;
70  arg_ainfo.argdefault = g_EmptyText;
71  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
72  argsinfo.addarginfo (NULL, arg_ainfo);
73
74  arg_ainfo.shortname = "pfe";
75  arg_ainfo.longname = "phind first_e";
76  arg_ainfo.multiplechar = true;
77  arg_ainfo.defaultstatus = cgiarginfo::weak;
78  arg_ainfo.argdefault = "0";
79  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
80  argsinfo.addarginfo (NULL, arg_ainfo);
81
82  arg_ainfo.shortname = "ple";
83  arg_ainfo.longname = "phind last_e";
84  arg_ainfo.multiplechar = true;
85  arg_ainfo.defaultstatus = cgiarginfo::weak;
86  arg_ainfo.argdefault = "10";
87  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
88  argsinfo.addarginfo (NULL, arg_ainfo);
89
90  arg_ainfo.shortname = "pfl";
91  arg_ainfo.longname = "phind first_l";
92  arg_ainfo.multiplechar = true;
93  arg_ainfo.defaultstatus = cgiarginfo::weak;
94  arg_ainfo.argdefault = "0";
95  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
96  argsinfo.addarginfo (NULL, arg_ainfo);
97
98  arg_ainfo.shortname = "pll";
99  arg_ainfo.longname = "phind last_l";
100  arg_ainfo.multiplechar = true;
101  arg_ainfo.defaultstatus = cgiarginfo::weak;
102  arg_ainfo.argdefault = "10";
103  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
104  argsinfo.addarginfo (NULL, arg_ainfo);
105
106  arg_ainfo.shortname = "pfd";
107  arg_ainfo.longname = "phind first_d";
108  arg_ainfo.multiplechar = true;
109  arg_ainfo.defaultstatus = cgiarginfo::weak;
110  arg_ainfo.argdefault = "0";
111  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
112  argsinfo.addarginfo (NULL, arg_ainfo);
113
114  arg_ainfo.shortname = "pld";
115  arg_ainfo.longname = "phind last_d";
116  arg_ainfo.multiplechar = true;
117  arg_ainfo.defaultstatus = cgiarginfo::weak;
118  arg_ainfo.argdefault = "10";
119  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
120  argsinfo.addarginfo (NULL, arg_ainfo);
121}
122
123phindaction::~phindaction () {
124}
125
126void phindaction::get_cgihead_info (cgiargsclass &args, recptprotolistclass * /*protos*/,
127                    response_t &response,text_t &response_data,
128                    ostream &/*logout*/) {
129  response = content;
130  if (args["pxml"] == "1") {
131    response_data = "text/xml";
132  } else {
133    response_data = "text/html";
134  }
135}
136
137bool phindaction::do_action (cgiargsclass &args, recptprotolistclass *protos,
138                 browsermapclass * /*browsers*/, displayclass &disp,
139                 outconvertclass &outconvert, ostream &textout,
140                 ostream &logout) {
141
142  unsigned long count_l, count_e, count_d;
143  unsigned long phrase = args["ppnum"].getulong();
144  text_t &word = args["pptext"];
145  unsigned long first_e = args["pfe"].getulong();
146  unsigned long last_e = args["ple"].getulong();
147  unsigned long first_l = args["pfl"].getulong();
148  unsigned long last_l = args["pll"].getulong();
149  unsigned long first_d = args["pfd"].getulong();
150  unsigned long last_d = args["pld"].getulong();
151  bool XMLmode = false;
152  if (args["pxml"] == "1") XMLmode = true;
153
154  // must have a valid collection server
155  recptproto *collectproto = protos->getrecptproto (args["c"], logout);
156  if (collectproto == NULL) {
157    output_error("phindaction: ERROR: collection not set", textout,
158         outconvert, disp, logout, XMLmode);
159    return true;
160  }
161
162  // the frequency and occurances of the phrase
163  unsigned long tf;
164  vector <unsigned long> el, linkdest, docNums, docfreq;
165  vector <UCArray> linktype;
166
167  // the number of occurances to display
168  unsigned long ef, lf, df;
169 
170  text_t basepath = filename_cat(collecthome, args["c"],
171                 "index", "phind" + args["pc"]);
172
173  // If we don't know the phrase number, look it up
174  if (phrase == 0) {
175   
176    if (word.empty()) {
177      output_error("phindaction: ERROR: no phrase number or word", textout,
178           outconvert, disp, logout, XMLmode);
179      return true;
180    }
181
182    DocNumArray result;
183    /** In order to prevent browser crashing problems, any method which
184     *  previously suffered a silent fatal error, now instead returns false
185     *  to indicate a fatal error has occured. We can then dispatch an
186     *  appropriate error tag to the Phind applet (rather than leave it
187     *  whiling away the milliseconds until the end of existence - or at
188     *  least your browser - in an infinite loop!)
189     *  DLConsulting 12-07-2004
190     */
191    if(!find_phrase_number_from_word(basepath, word, result)) {
192    output_error("phindaction: Fatal Error! Couldn't load index information in find_phrase_number_from_word()",
193             textout, outconvert, disp, logout, XMLmode);
194    return true;
195    }
196   
197    if (result.empty()) {
198      output_error("phindaction: The search term does not occur in the collection",
199           textout, outconvert, disp, logout, XMLmode);
200      return true;
201    } else {
202      phrase = result[0];
203    }
204  }
205
206  // Create a TextData object to read the phrase data (pdata)
207  TextData textdata;
208
209  text_t fullpath = filename_cat(basepath, "pdata");
210  char *fullpathc = fullpath.getcstr();
211#if defined __WIN32__
212  char *base = "";
213#else
214  char *base = "/";
215#endif
216
217  if (!textdata.LoadData (base, fullpathc)) {
218    //    FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
219    //exit(0);
220    /** We must return something to the client, whether this error is fatal or
221     *  no, otherwise we risk sending their browser into an infinite loop!
222     *  DLConsulting 12-07-2004
223     */
224    output_error("phindaction: Fatal Error! Couldn't load text information for collection",
225         textout, outconvert, disp, logout, XMLmode);
226    return true;
227  }
228
229  delete []fullpathc;
230
231  /** Another previously silent method can now cry out.
232   *  DLConsulting 12-07-2004
233   */
234  if(!get_phrase_all_data(textdata, phrase, word, tf, ef, lf, df, el,
235              linkdest, linktype, docNums, docfreq)) {
236    output_error(
237      "phindaction: Fatal Error! Couldn't parse phrase in get_phrase_all_data()",
238      textout, outconvert, disp, logout, XMLmode);
239      return true;   
240  }
241
242  // Output the header
243  if (XMLmode) {
244    textout << "<phinddata id=\"" << phrase
245        << "\" text=\"" << word
246        << "\" tf=\"" << tf
247        << "\" ef=\"" << ef
248        << "\" df=\"" << df
249        << "\" lf=\"" << lf
250        << "\">\n";
251  } else {
252    textout << "<html><head><title>" << word << "</title></head>\n"
253        << "<body><center>\n"
254        << "<p><h1>" << word << "</h1>\n"
255        << "<p><b>"<< word << "</b> occurs "
256        << tf << " times in " << df << " documents\n";
257  }
258
259  // Output the thesaurus links
260  if ((lf > 0) && (first_l < last_l)) {
261
262    // figure out the number of phrases to output
263    if (last_l > lf) {
264      last_l = lf;
265    }
266    count_l = last_l - first_l;
267   
268    if (XMLmode) {
269      textout << "<thesauruslist length=\"" << lf
270          << "\" start=\"" << first_l
271          << "\" end=\"" << last_l << "\">\n";
272      /** DLConsulting 12-07-2004 */
273      if(!print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
274                   first_l, last_l, disp, outconvert, textout)) {
275    output_error(
276             "phindaction: Fatal Error! Couldn't get phrase in get_phrase_freq_data()",
277             textout, outconvert, disp, logout, XMLmode);
278    return true;   
279      }
280      textout << "</thesauruslist>\n";
281    }
282
283    // output links as HTML
284    else {
285      if (count_l == lf) {
286    textout << "<p><b> " << count_l << " thesaurus links</b>\n";
287      } else {
288    textout << "<p><b>" << count_l << " of " << lf << " thesaurus links</b>\n";
289      }
290
291      textout << "<p><table border=1><tr><th>type</th><th>topic</th><th>freq</th><th>docs</th></tr>\n";
292      /** DLConsulting 12-07-2004 */
293      if(!print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
294                   first_l, last_l, disp, outconvert, textout)) {
295    output_error(
296             "phindaction: Fatal Error! Couldn't get phrase in get_phrase_freq_data()",
297             textout, outconvert, disp, logout, XMLmode);
298    return true;
299      }
300      textout << "</table>\n";
301
302      if (last_l < lf) {
303    if ((last_l + 10) < lf) {
304      textout << outconvert << disp
305          << "<br><a href=\"_gwcgi_?"
306          << "c=" << args["c"]
307          << "&ppnum=" << phrase
308          << "&pfe=" << first_e
309          << "&ple=" << last_e
310          << "&pfd=" << first_d
311          << "&pld=" << last_d
312          << "&pfl=" << first_l
313          << "&pll=" << (last_l + 10)
314          << "\">Get more thesaurus links</a>\n";
315    }
316    textout << outconvert << disp
317        << "<br><a href=\"_gwcgi_?"
318        << "c=" << args["c"]
319        << "&ppnum=" << phrase
320        << "&pfe=" << first_e
321        << "&ple=" << last_e
322        << "&pfd=" << first_d
323        << "&pld=" << last_d
324        << "&pfl=" << first_l
325        << "&pll=" << lf
326        << "\">Get every thesaurus link</a>\n" ;
327      }
328    }
329  }
330 
331  // Output the expansions
332  if ((ef > 0) && (first_e < last_e)) {
333
334    // figure out the number of phrases to output
335    if (last_e > el.size()) {
336      last_e = el.size();
337    }
338    count_e = last_e - first_e;
339
340    // output expansions as XML
341    if (XMLmode) {
342      textout << "<expansionlist length=\"" << ef
343          << "\" start=\"" << first_e
344          << "\" end=\"" << last_e << "\">" << endl;
345
346      print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
347               last_e, disp, outconvert, textout);
348
349      textout << "</expansionlist>\n";
350    }
351
352    // output expansions as HTML
353    else {
354      if (count_e == el.size()) {
355    textout << "<p><b> " << count_e << " expansions</b>\n";
356      } else {
357    textout << "<p><b>" << count_e << " of " << ef << " expansions</b>\n";
358      }
359
360      textout << "<p><table border=1><tr><th colspan=3>phrase</th><th>freq</th><th>docs</th></tr>\n";
361      print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
362               last_e, disp, outconvert, textout);
363      textout << "</table>\n";
364
365      if (last_e < ef) {
366    if ((last_e + 10) < ef) {
367      textout << outconvert << disp
368          << "<br><a href=\"_gwcgi_?"
369          << "c=" << args["c"]
370          << "&ppnum=" << phrase
371          << "&pfe=" << first_e
372          << "&ple=" << (last_e + 10)
373          << "&pfd=" << first_d
374          << "&pld=" << last_d
375          << "&pfl=" << first_l
376          << "&pll=" << last_l
377          << "\">Get more expansions</a>\n";
378    }
379    textout << outconvert << disp
380        << "<br><a href=\"_gwcgi_?"
381        << "c=" << args["c"]
382        << "&ppnum=" << phrase
383        << "&pfe=" << first_e
384        << "&ple=" << ef
385        << "&pfd=" << first_d
386        << "&pld=" << last_d
387        << "&pfl=" << first_l
388        << "&pll=" << last_l
389        << "\">Get every expansion</a>\n";
390      }
391    }
392  }
393
394  // Output the document occurances
395  if ((df > 0) && (first_d < last_d)) {
396
397    // figure out the phrases to output
398    if (last_d > docNums.size()) {
399      last_d = docNums.size();
400    }
401    count_d = last_d - first_d;
402
403    // output document list as XML
404    if (XMLmode) {
405      textout << "<documentlist length=\"" << df
406          << "\" start=\"" << first_d
407          << "\" end=\"" << last_d << "\">\n";
408     
409      if(!print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
410              first_d, last_d, disp, outconvert, textout)) {
411    output_error(
412             "phindaction: Fatal Error! Couldn't load text information in print_documents() or get_document_all_data()",
413             textout, outconvert, disp, logout, XMLmode);
414    return true;   
415      }
416
417      textout << "</documentlist>\n";
418    }
419
420    // output document list as HTML
421    else {
422     
423      if (count_d == docNums.size()) {
424    textout << "<p><b> " << count_d << " documents</b>\n";
425      } else {
426    textout << "<p><b>" << count_d << " of " << df << " documents</b>\n";
427      }
428
429      textout << "<p><table border=1><tr><th align=left>document</th><th>freq</th></tr>\n";
430      if(!print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
431              first_d, last_d, disp, outconvert, textout)) {
432    output_error(
433             "phindaction: Fatal Error! Couldn't load text information in print_documents()",
434             textout, outconvert, disp, logout, XMLmode);
435    return true;
436      }
437      textout << "</table>\n";
438     
439      if (last_d < df) {
440    if ((last_d + 10) < df) {
441      textout << outconvert << disp
442          << "<br><a href=\"_gwcgi_?"
443          << "c=" << args["c"]
444          << "&ppnum=" << phrase
445          << "&pfe=" << first_e
446          << "&ple=" << last_e
447          << "&pfd=" << first_d
448          << "&pld=" << (last_d + 10) 
449          << "&pfl=" << first_l
450          << "&pll=" << last_l
451          << "\">Get more documents</a>\n";
452    }
453    textout << outconvert << disp
454        << "<br><a href=\"_gwcgi_?"
455        << "c=" << args["c"]
456        << "&ppnum=" << phrase
457        << "&pfe=" << first_e
458        << "&ple=" << last_e
459        << "&pfd=" << first_d
460        << "&pld=" << df
461        << "&pfl=" << first_l
462        << "&pll=" << last_l
463        << "\">Get every document</a>\n";
464      }
465    }
466  }
467
468  // Close the document
469  if (XMLmode) {
470    textout << "</phinddata>\n";
471  } else {
472    textout << "</center></body></html>\n";
473  }
474
475  textdata.UnloadData ();
476
477  return true;
478}
479
480// Find the phrase number of a word in the index file
481bool phindaction::find_phrase_number_from_word(const text_t &basepath,
482                           const text_t &query,
483                           DocNumArray &result) {
484
485  // Open the index file for searching
486  IndexData indexData;
487
488  text_t fullpath = filename_cat(basepath, "pword");
489  char *fullpathc = fullpath.getcstr();
490#if defined __WIN32__
491  char *base = "";
492#else
493  char *base = "/";
494#endif
495
496  if (!indexData.LoadData (base, fullpathc)) {
497    //    FatalError (1, "Couldn't load index information for \"%s\"", fullpathc);
498    //exit(0);
499    /** Don't handle fatal errors here anymore.
500     *  DLConsulting 12-07-2004
501     */
502    return false; // Indicates something very bad has happened
503  }
504
505  delete []fullpathc;
506
507  // set up the query object
508  QueryInfo queryInfo;
509  SetCStr (queryInfo.docLevel, "Document", 8);
510  queryInfo.maxDocs = 5;
511  queryInfo.sortByRank = true;
512  queryInfo.exactWeights = false;
513  queryInfo.needRankInfo = true;
514  queryInfo.needTermFreqs = true;
515 
516  // mode 1 = casefolded, unstemmed search
517  UCArray ucquery;
518  toUCArray(query, ucquery);
519  QueryNode *queryTree = ParseQuery(ucquery, 1, 1, 4);
520     
521  // perform the query
522  ExtQueryResult queryResult;
523  MGQuery (indexData, queryInfo, queryTree, queryResult);
524  // cout << "-- word lookup result -- " << endl << queryResult << endl ;
525
526  result.clear();
527  result = queryResult.docs;
528
529  // delete the query
530  if (queryTree != NULL) delete queryTree;
531
532  indexData.UnloadData();
533
534  /** This method now returns a boolean, so...
535   *  DLConsulting 12-07-2004
536   */
537  return true; // Indicates that what happened is all good, baby.
538}
539
540// Get all the data about a phrase
541//
542// The phrase is stored in textData as record phrase.
543// We retrieve:
544//   word - the text of the phrase
545//   tf - the total frequency of the phrase
546//   ef - the expansion frequency of the phrase
547//   lf - the thesaurus link frequency of the phrase
548//   df - the document frequency of the phrase
549//   el - the list of phrases that are expansions of phrase
550//   ll - the list of phrases that are thesaurus links
551//   dl - the list of documents that contain phrase
552bool phindaction::get_phrase_all_data(TextData &textdata, unsigned long phrase,
553                      text_t &word, unsigned long &tf, unsigned long &ef,
554                      unsigned long &lf, unsigned long &df,
555                      vector <unsigned long> &el,
556                      vector <unsigned long> &linkdest,
557                      vector <UCArray> &linktype,
558                      vector <unsigned long> &docnum,
559                      vector <unsigned long> &docfrq) {
560  UCArray text;
561  UCArray docLevel;
562  SetCStr(docLevel, "Document", 8);
563
564  // Look the word up in the textData
565  if (!GetDocText (textdata, docLevel, phrase, text)) {
566    //    FatalError (1, "Error while trying to get phrase %u", phrase);
567    //exit(0);
568    return false; // Something very bad has happened.
569  }
570
571  // Ignore everything up to the first colon
572  UCArray::iterator next = text.begin();
573  while (*next++ != ':');
574
575  // ignore training carriage returns
576  while (text.back() == '\n') {
577    text.pop_back();
578  }
579 
580  // Get the word
581  word.clear();
582  for (; *next != ':'; ++next) {
583    word.push_back(*next);
584  }
585 
586  // Get total frequency
587  tf = 0;
588  for (++next; *next != ':'; ++next) {
589    tf *= 10;
590    tf += (*next - '0');
591  }
592 
593  // Get expansion frequency
594  ef = 0;
595  for (++next; *next != ':'; ++next) {
596    ef *= 10;
597    ef += (*next - '0');
598  }
599 
600  // Get document frequency
601  df = 0;
602  for (++next; *next != ':'; ++next) {
603    df *= 10;
604    df += (*next - '0');
605  }
606 
607  // Get expansion list
608  el.clear();
609  unsigned long e = 0;
610  for (++next; *next != ':'; ++next) {
611    if (*next == ',') {
612      el.push_back(e);
613      e = 0;
614    } else {
615      e *= 10;
616      e += (*next - '0');
617    }
618  }
619
620  // Get document list & the document frequency list
621  docnum.clear();
622  docfrq.clear();
623  bool readnum = false;
624  unsigned long d = 0;
625  for (++next; *next != ':'; ++next) {
626    if (*next == ',') {
627      docnum.push_back(d);
628      readnum = true;
629      d = 0;
630    } else if (*next == ';') {
631      if (readnum) {
632    docfrq.push_back(d);
633      } else {
634    docnum.push_back(d);
635    docfrq.push_back(1);
636      }
637      readnum = false;
638      d = 0;
639    } else {
640      d *= 10;
641      d += (*next - '0');
642    }
643  }
644
645  // Get thesaurus link frequency & link list
646  text.push_back(':');
647  text.push_back(':');
648
649  // link frequency
650  lf = 0;
651  for (++next; *next != ':'; ++next) {
652    lf *= 10;
653    lf += (*next - '0');
654  }
655
656  // two lists of link data
657  linkdest.clear();
658  linktype.clear();
659 
660  UCArray thistype;
661  thistype.clear();
662  bool typedone = false;
663  unsigned long l = 0;
664  for (++next; *next != ':'; ++next) {
665   
666    if (!typedone) {
667      // first read the link type, a charactor string
668      if (*next == ',') {
669    typedone = true;
670      } else {
671    thistype.push_back(*next);
672      }
673    } else {
674      // having read the link type, read the list of link destinations
675      if (*next == ',') {
676    linkdest.push_back(l);
677    linktype.push_back(thistype);
678    l = 0;
679      } else if (*next == ';') {
680    linkdest.push_back(l);
681    linktype.push_back(thistype);
682    l = 0;
683    thistype.clear();
684    typedone = false;
685      } else {
686    l *= 10;
687    l += (*next - '0');
688      }
689    }
690  }
691
692  return true; // Indicates that what happened is all good, baby.
693}
694
695bool phindaction::print_thesaurus_links(const text_t &collection, bool XMLmode,
696                    TextData &textdata, vector <unsigned long> &linkdest,
697                    vector <UCArray> &linktype, unsigned long first,
698                    unsigned long last, displayclass &disp,
699                    outconvertclass &outconvert, ostream &textout) {
700
701  // information describing each link in the list
702  unsigned long phrase, tf, ef, df;
703  UCArray type, text;
704 
705  for (unsigned long l = first; l < last; ++l) {
706
707    // get the phrase data
708    phrase = linkdest[l];
709    type = linktype[l];
710
711    /** DLConsulting 12-07-2004 */
712    if(!get_phrase_freq_data(textdata, phrase, text, tf, ef, df)) {
713      return false;
714    }
715   
716    if (XMLmode) {
717      textout << "<thesaurus num=\"" << l
718          << "\" id=\"" << phrase
719          << "\" tf=\"" << tf
720          << "\" df=\"" << df
721          << "\" type=\"" << type
722          << "\" text=\"" << text
723          << "\"/>\n";
724    } else {
725      textout << "<tr valign=top><td>" << type << "</td><td>";
726      textout << outconvert << disp
727          << "<a href=\"_gwcgi_?c=" << collection;
728      textout << "&ppnum=" << phrase << "\">" << text << "</a>"
729          << "</td><td>" << tf << "</td><td>" << df << "</td></tr>\n";
730    }
731  }
732
733  /** DLConsulting 12-07-2004 */
734  return true;
735}
736
737// Get the frequency data about a phrase
738//
739// The phrase is stored in textData as record phrase.
740// We retrieve:
741//   word - the text of the phrase
742//   tf - the total frequency of the phrase
743//   ef - the expansion frequency of the phrase
744//   df - the document frequency of the phrase
745/**
746 *   Returns:
747 *     false if the method suffered a fatal error, true otherwise
748 */
749bool phindaction::get_phrase_freq_data(TextData &textdata, unsigned long phrase,
750                       UCArray &word, unsigned long &tf,
751                       unsigned long &ef, unsigned long &df) {
752 
753  UCArray text;
754  UCArray docLevel;
755  SetCStr(docLevel, "Document", 8);
756
757  // Look the word up in the textData
758  if (!GetDocText (textdata, docLevel, phrase, text)) {
759    //    FatalError (1, "Error while trying to get phrase %u", phrase);
760    //exit(0);
761    /** DLConsulting 12-07-2004 */
762    return false;
763  }
764
765  // Ignore everything up to the first colon
766  UCArray::iterator next = text.begin();
767  while (*next++ != ':');
768 
769  // Get the word
770  word.clear();
771  for (; *next != ':'; ++next) {
772    word.push_back(*next);
773  }
774 
775  // Get total frequency
776  tf = 0;
777  for (++next; *next != ':'; ++next) {
778    tf *= 10;
779    tf += (*next - '0');
780  }
781 
782  // Get expansion frequency
783  ef = 0;
784  for (++next; *next != ':'; ++next) {
785    ef *= 10;
786    ef += (*next - '0');
787  }
788 
789  // Get document frequency
790  df = 0;
791  for (++next; *next != ':'; ++next) {
792    df *= 10;
793    df += (*next - '0');
794  }
795
796  /** DLConsulting 12-07-2004 */
797  return true;
798}
799
800// Print a list of expansions
801//
802// Given the textData and a list of phrase numbers, print out each of the
803// expansions.
804void phindaction::print_expansions(const text_t &collection, bool XMLmode,
805                   const text_t &body, TextData &textdata,
806                   const vector <unsigned long> &elist,
807                   unsigned long first, unsigned long last,
808                   displayclass &disp, outconvertclass &outconvert,
809                   ostream &textout) {
810 
811  UCArray word;
812  unsigned long phrase, tf, df, ef;
813
814  UCArray suffix, prefix, ucbody;
815 
816  toUCArray(body, ucbody);
817
818  for (unsigned long e = first; e < last; ++e) {
819
820    phrase = elist[e];
821    get_phrase_freq_data(textdata, phrase, word, tf, ef, df);
822
823    split_phrase(word, ucbody, prefix, suffix);
824   
825    if (XMLmode) {
826      // body is always the same as the text of the phrase, so no need to send it
827      textout << "<expansion num=\"" << e
828          << "\" id=\"" << phrase
829          << "\" tf=\"" << tf
830          << "\" df=\"" << df;
831      if (!prefix.empty()) {
832    textout << "\" prefix=\"" << prefix;
833      }
834      if (!suffix.empty()) {
835    textout << "\" suffix=\"" << suffix;
836      }
837      textout << "\"/>\n";
838    } else {
839      textout << outconvert << disp
840          << "<tr valign=top><td align=right><a href=\"_gwcgi_?"
841          << "c=" << collection << "&ppnum=" << phrase << "\">";
842      textout << prefix << "</a></td>";
843      textout <<outconvert << disp
844          << "<td align=center><a href=\"_gwcgi_?"
845          << "c=" << collection << "&ppnum=" << phrase << "\">"
846          << body << "</a></td>"
847          << "<td align=left><a href=\"_gwcgi_?"
848          << "c=" << collection << "&ppnum=" << phrase << "\">";
849      textout << suffix << "</a></td>"
850          << "<td>" << tf << "</td><td>" << df << "</td></tr>\n";
851    }
852  }
853}
854
855// split an expansion into prefix and suffix
856void phindaction::split_phrase(const UCArray &word, const UCArray &body,
857                   UCArray &prefix, UCArray &suffix) {
858
859  prefix.clear();
860  suffix.clear();
861
862  bool readingPrefix = true;
863  UCArray::const_iterator here = word.begin();
864  UCArray::const_iterator end = word.end();
865 
866  while (here != end) {
867
868    // if we've not read all the prefix, add the next char to the prefix
869    if (readingPrefix) {
870      if (phrase_match(body, here, end)) {
871    readingPrefix = false;
872    // trim whitespace from end of prefix & start of suffix
873    if (!prefix.empty()) {
874      prefix.pop_back();
875    }
876    if ((here != end) && (*here == ' ')) {
877      ++here;
878    }
879      } else {
880    prefix.push_back(*here);
881    ++here;
882      }
883    }
884    // if we've finished with the prefix, update the suffix
885    else {
886      suffix.push_back(*here);
887      ++here;
888    }
889  }
890}
891
892// phrase_match
893//
894// compare two strings, one represented as an UCArray, the other as two
895// UCArray iterators.
896//
897// Return true if the UCArray is the same as the phrase the iterators point
898// to for the length of the UCArray.
899bool phindaction::phrase_match(const UCArray &text, UCArray::const_iterator &here,
900                   UCArray::const_iterator end) {
901
902  UCArray::const_iterator one_here = text.begin();
903  UCArray::const_iterator one_end  = text.end();
904  UCArray::const_iterator two_here = here;
905
906  // iterate over the length of the first string, comparing each element to
907  // the corresponding element in the second string.
908  while (one_here != one_end) {
909   
910      if (two_here == end) {
911      return false;
912      } else if (*one_here != *two_here) {
913      return false;
914      }
915      ++one_here;
916      ++two_here;
917  }
918
919  here = two_here;
920  return true;
921}
922
923bool phindaction::print_documents(bool XMLmode, const text_t &basepath,
924                  const text_t &collection,
925                  const vector <unsigned long> &docNums,
926                  const vector <unsigned long> &docFreq,
927                  unsigned long first, unsigned long last,
928                  displayclass &disp, outconvertclass &outconvert,
929                  ostream &textout) {
930 
931  // Create a TextData object to read the document data
932  TextData docdata;
933
934  text_t fullpath = filename_cat(basepath, "docs");
935  char *fullpathc = fullpath.getcstr();
936#if defined __WIN32__
937  char *base = "";
938#else
939  char *base = "/";
940#endif
941
942  if (!docdata.LoadData (base, fullpathc)) {
943    //    FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
944    //exit(0);
945    /** DLConsulting 12-07-2004 */
946    return false;
947  }
948
949  delete []fullpathc;
950
951  UCArray title, hash;
952  unsigned long freq, doc;
953
954  for (unsigned long d = first; d < last; ++d) {
955    doc = docNums[d];
956    freq = docFreq[d];
957
958    /** DLConsulting 13-07-2004 */
959    if(!get_document_all_data(docdata, doc, title, hash)) {
960      return false;
961    }
962
963    if (XMLmode) {
964      textout << "<document num=\"" << d
965          << "\" hash=\"" << hash
966          << "\" freq=\"" << freq
967          << "\" title=\"" << title << "\"/>\n";
968    } else {
969      textout << outconvert << disp
970          << "<tr valign=top><td><a href=\"_gwcgi_?"
971          << "c=" << collection;
972      textout << "&a=d&d=" << hash << "\">" << title << "</a>"
973          << "</td><td>" << freq << "</td></tr>\n";
974    }
975  }
976
977  docdata.UnloadData();
978
979  /** DLConsulting 12-07-2004 */
980  return true;
981}
982
983// Get all the data about a docment
984//
985// The document's details are stored in docData as record docNum.
986// We retrieve:
987//   title - the document's title
988//   hash - the document's unique OID
989/** Returns:
990 *   false if a fatal error occured, true otherwise
991 *  DLConsulting 12-07-2004
992 */
993bool phindaction::get_document_all_data(TextData &docdata, unsigned long docNum,
994                    UCArray &title, UCArray &hash) {
995
996  UCArray text;
997  UCArray docLevel;
998  SetCStr(docLevel, "Document", 8);
999
1000  // Look the word up in the textData
1001  if (!GetDocText (docdata, docLevel, docNum, text)) {
1002    //    FatalError (1, "Error while trying to get document %u", docNum);
1003    //exit(0);
1004    /** DLConsulting 13-07-2004 */
1005    return false;
1006  }
1007
1008  // Ignore everything up to the first colon
1009  UCArray::iterator next = text.begin();
1010  while (*next++ != '\t');
1011 
1012  // Get the document OID (hash)
1013  hash.clear();
1014  for (; *next != '\t'; ++next) {
1015    hash.push_back(*next);
1016  }
1017
1018  // Get the title
1019  text.push_back('\n');
1020  title.clear();
1021  for (++next; *next != '\n'; ++next) {
1022    title.push_back(*next);
1023  }
1024
1025  /** DLConsulting 13-07-2004 */
1026  return true;
1027}
1028
1029void phindaction::toUCArray(const text_t &in, UCArray &out) {
1030  out.clear();
1031  if (out.capacity() < in.size() + 1) {
1032    out.reserve(in.size() + 1);
1033  }
1034  text_t::const_iterator here = in.begin();
1035  text_t::const_iterator end = in.end();
1036  while (here != end) {
1037    out.push_back((unsigned char) *here);
1038    ++here;
1039  }
1040}
1041
1042void phindaction::output_error (const text_t &message, ostream &textout,
1043                outconvertclass &outconvert,
1044                displayclass & disp, ostream &logout,
1045                bool XMLmode) {
1046
1047  logout << outconvert << message << "\n";
1048  if (XMLmode) {
1049    textout << outconvert
1050        << "<phinddata>\n"
1051        << "<phinderror>" << message << "</phinderror>\n"
1052        << "</phinddata>\n";
1053  } else {
1054    textout << outconvert << disp
1055        << "_header_\n"
1056        << message
1057        << "_footer_\n";
1058  }
1059}
1060
1061#endif //GSDL_USE_PHIND_ACTION
1062
Note: See TracBrowser for help on using the browser.