root/main/trunk/greenstone2/runtime-src/src/recpt/phindaction.cpp @ 22984

Revision 22984, 29.9 KB (checked in by ak19, 8 years ago)

1. Undoing the commit of revision 22934, where decode_commas was called on the stem and fold comma-separated lists (previously separated due to URL-encoding of commas). Now that the problem has been fixed at the source, the decode_commas hack is no longer necessary. 2. Commas in stem and fold are no longer URL-encoded, because the multiple_value field of the continuously reused struct arg_ainfo is now always set back to its default of false whenever it has been set to true. It therefore no longer subtly stays at true and affects Greenstone's functioning in unforeseen ways (such as suddenly and unnecessarily URL-encoding commas where this is not wanted).

  • Property svn:keywords set to Author Date Id Revision
Line 
1/**********************************************************************
2 *
3 * phindaction.cpp --
4 *
5 * Copyright 2001 Gordon W. Paynter
6 * Copyright 2001 The New Zealand Digital Library Project
7 *
8 * A component of the Greenstone digital library software
9 * from the New Zealand Digital Library Project at the
10 * University of Waikato, New Zealand.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *
26 *********************************************************************/
27
28#include "gsdl_modules_cfg.h"
29#ifdef GSDL_USE_PHIND_ACTION
30
31// Note that this action uses mgpp to retrieve phind info, calling MGQuery
32// etc. directly, not through the protocol. This breaks our receptionist -
33// collection server separation and should be fixed some day I guess.
34
35#include "phindaction.h"
36#include "fileutil.h"
37#include "gsdlunicode.h"
38
39phindaction::phindaction () {
40
41  cgiarginfo arg_ainfo;
42
43  arg_ainfo.shortname = "pc";
44  arg_ainfo.longname = "phind classifier";
45  arg_ainfo.multiplechar = true;
46  arg_ainfo.multiplevalue = false;
47  arg_ainfo.defaultstatus = cgiarginfo::weak;
48  arg_ainfo.argdefault = g_EmptyText;
49  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
50  argsinfo.addarginfo (NULL, arg_ainfo);
51
52  arg_ainfo.shortname = "pxml";
53  arg_ainfo.longname = "phind XML mode";
54  arg_ainfo.multiplechar = false;
55  arg_ainfo.multiplevalue = false;
56  arg_ainfo.defaultstatus = cgiarginfo::weak;
57  arg_ainfo.argdefault = "0";
58  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
59  argsinfo.addarginfo (NULL, arg_ainfo);
60
61  arg_ainfo.shortname = "ppnum";
62  arg_ainfo.longname = "phind phrase number";
63  arg_ainfo.multiplechar = true;
64  arg_ainfo.multiplevalue = false;
65  arg_ainfo.defaultstatus = cgiarginfo::weak;
66  arg_ainfo.argdefault = "0";
67  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
68  argsinfo.addarginfo (NULL, arg_ainfo);
69
70  arg_ainfo.shortname = "pptext";
71  arg_ainfo.longname = "phind phrase text";
72  arg_ainfo.multiplechar = true;
73  arg_ainfo.multiplevalue = false;
74  arg_ainfo.defaultstatus = cgiarginfo::weak;
75  arg_ainfo.argdefault = g_EmptyText;
76  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
77  argsinfo.addarginfo (NULL, arg_ainfo);
78
79  arg_ainfo.shortname = "pfe";
80  arg_ainfo.longname = "phind first_e";
81  arg_ainfo.multiplechar = true;
82  arg_ainfo.multiplevalue = false;
83  arg_ainfo.defaultstatus = cgiarginfo::weak;
84  arg_ainfo.argdefault = "0";
85  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
86  argsinfo.addarginfo (NULL, arg_ainfo);
87
88  arg_ainfo.shortname = "ple";
89  arg_ainfo.longname = "phind last_e";
90  arg_ainfo.multiplechar = true;
91  arg_ainfo.multiplevalue = false;
92  arg_ainfo.defaultstatus = cgiarginfo::weak;
93  arg_ainfo.argdefault = "10";
94  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
95  argsinfo.addarginfo (NULL, arg_ainfo);
96
97  arg_ainfo.shortname = "pfl";
98  arg_ainfo.longname = "phind first_l";
99  arg_ainfo.multiplechar = true;
100  arg_ainfo.multiplevalue = false;
101  arg_ainfo.defaultstatus = cgiarginfo::weak;
102  arg_ainfo.argdefault = "0";
103  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
104  argsinfo.addarginfo (NULL, arg_ainfo);
105
106  arg_ainfo.shortname = "pll";
107  arg_ainfo.longname = "phind last_l";
108  arg_ainfo.multiplechar = true;
109  arg_ainfo.multiplevalue = false;
110  arg_ainfo.defaultstatus = cgiarginfo::weak;
111  arg_ainfo.argdefault = "10";
112  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
113  argsinfo.addarginfo (NULL, arg_ainfo);
114
115  arg_ainfo.shortname = "pfd";
116  arg_ainfo.longname = "phind first_d";
117  arg_ainfo.multiplechar = true;
118  arg_ainfo.multiplevalue = false;
119  arg_ainfo.defaultstatus = cgiarginfo::weak;
120  arg_ainfo.argdefault = "0";
121  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
122  argsinfo.addarginfo (NULL, arg_ainfo);
123
124  arg_ainfo.shortname = "pld";
125  arg_ainfo.longname = "phind last_d";
126  arg_ainfo.multiplechar = true;
127  arg_ainfo.multiplevalue = false;
128  arg_ainfo.defaultstatus = cgiarginfo::weak;
129  arg_ainfo.argdefault = "10";
130  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
131  argsinfo.addarginfo (NULL, arg_ainfo);
132}
133
134phindaction::~phindaction () {
135}
136
137void phindaction::get_cgihead_info (cgiargsclass &args, recptprotolistclass * /*protos*/,
138                    response_t &response,text_t &response_data,
139                    ostream &/*logout*/) {
140  response = content;
141  if (args["pxml"] == "1") {
142    response_data = "text/xml";
143  } else {
144    response_data = "text/html";
145  }
146}
147
148bool phindaction::do_action (cgiargsclass &args, recptprotolistclass *protos,
149                 browsermapclass * /*browsers*/, displayclass &disp,
150                 outconvertclass &outconvert, ostream &textout,
151                 ostream &logout) {
152
153  unsigned long count_l, count_e, count_d;
154  unsigned long phrase = args["ppnum"].getulong();
155  text_t &word = args["pptext"];
156  unsigned long first_e = args["pfe"].getulong();
157  unsigned long last_e = args["ple"].getulong();
158  unsigned long first_l = args["pfl"].getulong();
159  unsigned long last_l = args["pll"].getulong();
160  unsigned long first_d = args["pfd"].getulong();
161  unsigned long last_d = args["pld"].getulong();
162  bool XMLmode = false;
163  if (args["pxml"] == "1") XMLmode = true;
164
165  // must have a valid collection server
166  recptproto *collectproto = protos->getrecptproto (args["c"], logout);
167  if (collectproto == NULL) {
168    output_error("phindaction: ERROR: collection not set", textout,
169         outconvert, disp, logout, XMLmode);
170    return true;
171  }
172
173  // the frequency and occurances of the phrase
174  unsigned long tf;
175  vector <unsigned long> el, linkdest, docNums, docfreq;
176  vector <UCArray> linktype;
177
178  // the number of occurances to display
179  unsigned long ef, lf, df;
180 
181  text_t basepath = filename_cat(collecthome, args["c"],
182                 "index", "phind" + args["pc"]);
183
184  // If we don't know the phrase number, look it up
185  if (phrase == 0) {
186   
187    if (word.empty()) {
188      output_error("phindaction: ERROR: no phrase number or word", textout,
189           outconvert, disp, logout, XMLmode);
190      return true;
191    }
192
193    DocNumArray result;
194    /** In order to prevent browser crashing problems, any method which
195     *  previously suffered a silent fatal error, now instead returns false
196     *  to indicate a fatal error has occured. We can then dispatch an
197     *  appropriate error tag to the Phind applet (rather than leave it
198     *  whiling away the milliseconds until the end of existence - or at
199     *  least your browser - in an infinite loop!)
200     *  DLConsulting 12-07-2004
201     */
202   
203    if(!find_phrase_number_from_word(basepath, word, result)) {
204    output_error("phindaction: Fatal Error! Couldn't load index information in find_phrase_number_from_word()",
205             textout, outconvert, disp, logout, XMLmode);
206    return true;
207    }
208   
209    if (result.empty()) {
210      output_error("phindaction: The search term ("+word+") does not occur in the collection",
211           textout, outconvert, disp, logout, XMLmode);
212      return true;
213    } else {
214      phrase = result[0];
215    }
216  }
217
218  // Create a TextData object to read the phrase data (pdata)
219  TextData textdata;
220
221  text_t fullpath = filename_cat(basepath, "pdata");
222  char *fullpathc = fullpath.getcstr();
223#if defined __WIN32__
224  char *base = "";
225#else
226  char *base = "/";
227#endif
228
229  if (!textdata.LoadData (base, fullpathc)) {
230    //    FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
231    //exit(0);
232    /** We must return something to the client, whether this error is fatal or
233     *  no, otherwise we risk sending their browser into an infinite loop!
234     *  DLConsulting 12-07-2004
235     */
236    output_error("phindaction: Fatal Error! Couldn't load text information for collection",
237         textout, outconvert, disp, logout, XMLmode);
238    return true;
239  }
240
241  delete []fullpathc;
242
243  /** Another previously silent method can now cry out.
244   *  DLConsulting 12-07-2004
245   */
246  if(!get_phrase_all_data(textdata, phrase, word, tf, ef, lf, df, el,
247              linkdest, linktype, docNums, docfreq)) {
248    output_error(
249      "phindaction: Fatal Error! Couldn't parse phrase in get_phrase_all_data()",
250      textout, outconvert, disp, logout, XMLmode);
251      return true;   
252  }
253
254  // Output the header
255  if (XMLmode) {
256    textout << "<phinddata id=\"" << phrase
257        << "\" text=\"" << word
258        << "\" tf=\"" << tf
259        << "\" ef=\"" << ef
260        << "\" df=\"" << df
261        << "\" lf=\"" << lf
262        << "\">\n";
263  } else {
264    textout << "<html><head><title>" << word << "</title></head>\n"
265        << "<body><center>\n"
266        << "<p><h1>" << word << "</h1>\n"
267        << "<p><b>"<< word << "</b> occurs "
268        << tf << " times in " << df << " documents\n";
269  }
270
271  // Output the thesaurus links
272  if ((lf > 0) && (first_l < last_l)) {
273
274    // figure out the number of phrases to output
275    if (last_l > lf) {
276      last_l = lf;
277    }
278    count_l = last_l - first_l;
279   
280    if (XMLmode) {
281      textout << "<thesauruslist length=\"" << lf
282          << "\" start=\"" << first_l
283          << "\" end=\"" << last_l << "\">\n";
284      /** DLConsulting 12-07-2004 */
285      if(!print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
286                   first_l, last_l, disp, outconvert, textout)) {
287    output_error(
288             "phindaction: Fatal Error! Couldn't get phrase in get_phrase_freq_data()",
289             textout, outconvert, disp, logout, XMLmode);
290    return true;   
291      }
292      textout << "</thesauruslist>\n";
293    }
294
295    // output links as HTML
296    else {
297      if (count_l == lf) {
298    textout << "<p><b> " << count_l << " thesaurus links</b>\n";
299      } else {
300    textout << "<p><b>" << count_l << " of " << lf << " thesaurus links</b>\n";
301      }
302
303      textout << "<p><table border=1><tr><th>type</th><th>topic</th><th>freq</th><th>docs</th></tr>\n";
304      /** DLConsulting 12-07-2004 */
305      if(!print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
306                   first_l, last_l, disp, outconvert, textout)) {
307    output_error(
308             "phindaction: Fatal Error! Couldn't get phrase in get_phrase_freq_data()",
309             textout, outconvert, disp, logout, XMLmode);
310    return true;
311      }
312      textout << "</table>\n";
313
314      if (last_l < lf) {
315    if ((last_l + 10) < lf) {
316      textout << outconvert << disp
317          << "<br><a href=\"_gwcgi_?"
318          << "c=" << args["c"]
319          << "&ppnum=" << phrase
320          << "&pfe=" << first_e
321          << "&ple=" << last_e
322          << "&pfd=" << first_d
323          << "&pld=" << last_d
324          << "&pfl=" << first_l
325          << "&pll=" << (last_l + 10)
326          << "\">Get more thesaurus links</a>\n";
327    }
328    textout << outconvert << disp
329        << "<br><a href=\"_gwcgi_?"
330        << "c=" << args["c"]
331        << "&ppnum=" << phrase
332        << "&pfe=" << first_e
333        << "&ple=" << last_e
334        << "&pfd=" << first_d
335        << "&pld=" << last_d
336        << "&pfl=" << first_l
337        << "&pll=" << lf
338        << "\">Get every thesaurus link</a>\n" ;
339      }
340    }
341  }
342 
343  // Output the expansions
344  if ((ef > 0) && (first_e < last_e)) {
345
346    // figure out the number of phrases to output
347    if (last_e > el.size()) {
348      last_e = el.size();
349    }
350    count_e = last_e - first_e;
351
352    // output expansions as XML
353    if (XMLmode) {
354      textout << "<expansionlist length=\"" << ef
355          << "\" start=\"" << first_e
356          << "\" end=\"" << last_e << "\">" << endl;
357
358      print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
359               last_e, disp, outconvert, textout);
360
361      textout << "</expansionlist>\n";
362    }
363
364    // output expansions as HTML
365    else {
366      if (count_e == el.size()) {
367    textout << "<p><b> " << count_e << " expansions</b>\n";
368      } else {
369    textout << "<p><b>" << count_e << " of " << ef << " expansions</b>\n";
370      }
371
372      textout << "<p><table border=1><tr><th colspan=3>phrase</th><th>freq</th><th>docs</th></tr>\n";
373      print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
374               last_e, disp, outconvert, textout);
375      textout << "</table>\n";
376
377      if (last_e < ef) {
378    if ((last_e + 10) < ef) {
379      textout << outconvert << disp
380          << "<br><a href=\"_gwcgi_?"
381          << "c=" << args["c"]
382          << "&ppnum=" << phrase
383          << "&pfe=" << first_e
384          << "&ple=" << (last_e + 10)
385          << "&pfd=" << first_d
386          << "&pld=" << last_d
387          << "&pfl=" << first_l
388          << "&pll=" << last_l
389          << "\">Get more expansions</a>\n";
390    }
391    textout << outconvert << disp
392        << "<br><a href=\"_gwcgi_?"
393        << "c=" << args["c"]
394        << "&ppnum=" << phrase
395        << "&pfe=" << first_e
396        << "&ple=" << ef
397        << "&pfd=" << first_d
398        << "&pld=" << last_d
399        << "&pfl=" << first_l
400        << "&pll=" << last_l
401        << "\">Get every expansion</a>\n";
402      }
403    }
404  }
405
406  // Output the document occurances
407  if ((df > 0) && (first_d < last_d)) {
408
409    // figure out the phrases to output
410    if (last_d > docNums.size()) {
411      last_d = docNums.size();
412    }
413    count_d = last_d - first_d;
414
415    // output document list as XML
416    if (XMLmode) {
417      textout << "<documentlist length=\"" << df
418          << "\" start=\"" << first_d
419          << "\" end=\"" << last_d << "\">\n";
420     
421      if(!print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
422              first_d, last_d, disp, outconvert, textout)) {
423    output_error(
424             "phindaction: Fatal Error! Couldn't load text information in print_documents() or get_document_all_data()",
425             textout, outconvert, disp, logout, XMLmode);
426    return true;   
427      }
428
429      textout << "</documentlist>\n";
430    }
431
432    // output document list as HTML
433    else {
434     
435      if (count_d == docNums.size()) {
436    textout << "<p><b> " << count_d << " documents</b>\n";
437      } else {
438    textout << "<p><b>" << count_d << " of " << df << " documents</b>\n";
439      }
440
441      textout << "<p><table border=1><tr><th align=left>document</th><th>freq</th></tr>\n";
442      if(!print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
443              first_d, last_d, disp, outconvert, textout)) {
444    output_error(
445             "phindaction: Fatal Error! Couldn't load text information in print_documents()",
446             textout, outconvert, disp, logout, XMLmode);
447    return true;
448      }
449      textout << "</table>\n";
450     
451      if (last_d < df) {
452    if ((last_d + 10) < df) {
453      textout << outconvert << disp
454          << "<br><a href=\"_gwcgi_?"
455          << "c=" << args["c"]
456          << "&ppnum=" << phrase
457          << "&pfe=" << first_e
458          << "&ple=" << last_e
459          << "&pfd=" << first_d
460          << "&pld=" << (last_d + 10) 
461          << "&pfl=" << first_l
462          << "&pll=" << last_l
463          << "\">Get more documents</a>\n";
464    }
465    textout << outconvert << disp
466        << "<br><a href=\"_gwcgi_?"
467        << "c=" << args["c"]
468        << "&ppnum=" << phrase
469        << "&pfe=" << first_e
470        << "&ple=" << last_e
471        << "&pfd=" << first_d
472        << "&pld=" << df
473        << "&pfl=" << first_l
474        << "&pll=" << last_l
475        << "\">Get every document</a>\n";
476      }
477    }
478  }
479
480  // Close the document
481  if (XMLmode) {
482    textout << "</phinddata>\n";
483  } else {
484    textout << "</center></body></html>\n";
485  }
486
487  textdata.UnloadData ();
488
489  return true;
490}
491
492// Find the phrase number of a word in the index file
493bool phindaction::find_phrase_number_from_word(const text_t &basepath,
494                           const text_t &query,
495                           DocNumArray &result) {
496
497  // Open the index file for searching
498  IndexData indexData;
499
500  text_t fullpath = filename_cat(basepath, "pword");
501  char *fullpathc = fullpath.getcstr();
502#if defined __WIN32__
503  char *base = "";
504#else
505  char *base = "/";
506#endif
507
508  if (!indexData.LoadData (base, fullpathc)) {
509    //    FatalError (1, "Couldn't load index information for \"%s\"", fullpathc);
510    //exit(0);
511    /** Don't handle fatal errors here anymore.
512     *  DLConsulting 12-07-2004
513     */
514    return false; // Indicates something very bad has happened
515  }
516
517  delete []fullpathc;
518
519  // set up the query object
520  QueryInfo queryInfo;
521  SetCStr (queryInfo.docLevel, "Document", 8);
522  queryInfo.maxDocs = 5;
523  queryInfo.sortByRank = true;
524  queryInfo.exactWeights = false;
525  queryInfo.needRankInfo = true;
526  queryInfo.needTermFreqs = true;
527 
528  // mode 1 = casefolded, unstemmed search
529  UCArray ucquery;
530  // greenstone gives us the query encoded in unicode. We want utf8.
531  char* utf8querystring=to_utf8(query).getcstr();
532  SetCStr(ucquery, utf8querystring);
533  delete []utf8querystring;
534
535  //toUCArray(query, ucquery);
536  QueryNode *queryTree = ParseQuery(ucquery, 1, 1, 4);
537     
538  // perform the query
539  ExtQueryResult queryResult;
540  MGQuery (indexData, queryInfo, queryTree, queryResult);
541  // cout << "-- word lookup result -- " << endl << queryResult << endl ;
542
543  result.clear();
544  result = queryResult.docs;
545
546  // delete the query
547  if (queryTree != NULL) delete queryTree;
548
549  indexData.UnloadData();
550
551  /** This method now returns a boolean, so...
552   *  DLConsulting 12-07-2004
553   */
554  return true; // Indicates that what happened is all good, baby.
555}
556
557// Get all the data about a phrase
558//
559// The phrase is stored in textData as record phrase.
560// We retrieve:
561//   word - the text of the phrase
562//   tf - the total frequency of the phrase
563//   ef - the expansion frequency of the phrase
564//   lf - the thesaurus link frequency of the phrase
565//   df - the document frequency of the phrase
566//   el - the list of phrases that are expansions of phrase
567//   ll - the list of phrases that are thesaurus links
568//   dl - the list of documents that contain phrase
569bool phindaction::get_phrase_all_data(TextData &textdata, unsigned long phrase,
570                      text_t &word, unsigned long &tf, unsigned long &ef,
571                      unsigned long &lf, unsigned long &df,
572                      vector <unsigned long> &el,
573                      vector <unsigned long> &linkdest,
574                      vector <UCArray> &linktype,
575                      vector <unsigned long> &docnum,
576                      vector <unsigned long> &docfrq) {
577  UCArray text;
578  UCArray docLevel;
579  SetCStr(docLevel, "Document", 8);
580
581  // Look the word up in the textData
582  if (!GetDocText (textdata, docLevel, phrase, text)) {
583    //    FatalError (1, "Error while trying to get phrase %u", phrase);
584    //exit(0);
585    return false; // Something very bad has happened.
586  }
587
588  // Ignore everything up to the first colon
589  UCArray::iterator next = text.begin();
590  while (*next++ != ':');
591
592  // ignore training carriage returns
593  while (text.back() == '\n') {
594    text.pop_back();
595  }
596 
597  // Get the word
598  word.clear();
599  for (; *next != ':'; ++next) {
600    word.push_back(*next);
601  }
602 
603  // Get total frequency
604  tf = 0;
605  for (++next; *next != ':'; ++next) {
606    tf *= 10;
607    tf += (*next - '0');
608  }
609 
610  // Get expansion frequency
611  ef = 0;
612  for (++next; *next != ':'; ++next) {
613    ef *= 10;
614    ef += (*next - '0');
615  }
616 
617  // Get document frequency
618  df = 0;
619  for (++next; *next != ':'; ++next) {
620    df *= 10;
621    df += (*next - '0');
622  }
623 
624  // Get expansion list
625  el.clear();
626  unsigned long e = 0;
627  for (++next; *next != ':'; ++next) {
628    if (*next == ',') {
629      el.push_back(e);
630      e = 0;
631    } else {
632      e *= 10;
633      e += (*next - '0');
634    }
635  }
636
637  // Get document list & the document frequency list
638  docnum.clear();
639  docfrq.clear();
640  bool readnum = false;
641  unsigned long d = 0;
642  for (++next; *next != ':'; ++next) {
643    if (*next == ',') {
644      docnum.push_back(d);
645      readnum = true;
646      d = 0;
647    } else if (*next == ';') {
648      if (readnum) {
649    docfrq.push_back(d);
650      } else {
651    docnum.push_back(d);
652    docfrq.push_back(1);
653      }
654      readnum = false;
655      d = 0;
656    } else {
657      d *= 10;
658      d += (*next - '0');
659    }
660  }
661
662  // Get thesaurus link frequency & link list
663  text.push_back(':');
664  text.push_back(':');
665
666  // link frequency
667  lf = 0;
668  for (++next; *next != ':'; ++next) {
669    lf *= 10;
670    lf += (*next - '0');
671  }
672
673  // two lists of link data
674  linkdest.clear();
675  linktype.clear();
676 
677  UCArray thistype;
678  thistype.clear();
679  bool typedone = false;
680  unsigned long l = 0;
681  for (++next; *next != ':'; ++next) {
682   
683    if (!typedone) {
684      // first read the link type, a charactor string
685      if (*next == ',') {
686    typedone = true;
687      } else {
688    thistype.push_back(*next);
689      }
690    } else {
691      // having read the link type, read the list of link destinations
692      if (*next == ',') {
693    linkdest.push_back(l);
694    linktype.push_back(thistype);
695    l = 0;
696      } else if (*next == ';') {
697    linkdest.push_back(l);
698    linktype.push_back(thistype);
699    l = 0;
700    thistype.clear();
701    typedone = false;
702      } else {
703    l *= 10;
704    l += (*next - '0');
705      }
706    }
707  }
708
709  return true; // Indicates that what happened is all good, baby.
710}
711
712bool phindaction::print_thesaurus_links(const text_t &collection, bool XMLmode,
713                    TextData &textdata, vector <unsigned long> &linkdest,
714                    vector <UCArray> &linktype, unsigned long first,
715                    unsigned long last, displayclass &disp,
716                    outconvertclass &outconvert, ostream &textout) {
717
718  // information describing each link in the list
719  unsigned long phrase, tf, ef, df;
720  UCArray type, text;
721 
722  for (unsigned long l = first; l < last; ++l) {
723
724    // get the phrase data
725    phrase = linkdest[l];
726    type = linktype[l];
727
728    /** DLConsulting 12-07-2004 */
729    if(!get_phrase_freq_data(textdata, phrase, text, tf, ef, df)) {
730      return false;
731    }
732   
733    if (XMLmode) {
734      textout << "<thesaurus num=\"" << l
735          << "\" id=\"" << phrase
736          << "\" tf=\"" << tf
737          << "\" df=\"" << df
738          << "\" type=\"" << type
739          << "\" text=\"" << text
740          << "\"/>\n";
741    } else {
742      textout << "<tr valign=top><td>" << type << "</td><td>";
743      textout << outconvert << disp
744          << "<a href=\"_gwcgi_?c=" << collection;
745      textout << "&ppnum=" << phrase << "\">" << text << "</a>"
746          << "</td><td>" << tf << "</td><td>" << df << "</td></tr>\n";
747    }
748  }
749
750  /** DLConsulting 12-07-2004 */
751  return true;
752}
753
754// Get the frequency data about a phrase
755//
756// The phrase is stored in textData as record phrase.
757// We retrieve:
758//   word - the text of the phrase
759//   tf - the total frequency of the phrase
760//   ef - the expansion frequency of the phrase
761//   df - the document frequency of the phrase
762/**
763 *   Returns:
764 *     false if the method suffered a fatal error, true otherwise
765 */
766bool phindaction::get_phrase_freq_data(TextData &textdata, unsigned long phrase,
767                       UCArray &word, unsigned long &tf,
768                       unsigned long &ef, unsigned long &df) {
769 
770  UCArray text;
771  UCArray docLevel;
772  SetCStr(docLevel, "Document", 8);
773
774  // Look the word up in the textData
775  if (!GetDocText (textdata, docLevel, phrase, text)) {
776    //    FatalError (1, "Error while trying to get phrase %u", phrase);
777    //exit(0);
778    /** DLConsulting 12-07-2004 */
779    return false;
780  }
781
782  // Ignore everything up to the first colon
783  UCArray::iterator next = text.begin();
784  while (*next++ != ':');
785 
786  // Get the word
787  word.clear();
788  for (; *next != ':'; ++next) {
789    word.push_back(*next);
790  }
791 
792  // Get total frequency
793  tf = 0;
794  for (++next; *next != ':'; ++next) {
795    tf *= 10;
796    tf += (*next - '0');
797  }
798 
799  // Get expansion frequency
800  ef = 0;
801  for (++next; *next != ':'; ++next) {
802    ef *= 10;
803    ef += (*next - '0');
804  }
805 
806  // Get document frequency
807  df = 0;
808  for (++next; *next != ':'; ++next) {
809    df *= 10;
810    df += (*next - '0');
811  }
812
813  /** DLConsulting 12-07-2004 */
814  return true;
815}
816
817// Print a list of expansions
818//
819// Given the textData and a list of phrase numbers, print out each of the
820// expansions.
821void phindaction::print_expansions(const text_t &collection, bool XMLmode,
822                   const text_t &body, TextData &textdata,
823                   const vector <unsigned long> &elist,
824                   unsigned long first, unsigned long last,
825                   displayclass &disp, outconvertclass &outconvert,
826                   ostream &textout) {
827 
828  UCArray word;
829  unsigned long phrase, tf, df, ef;
830
831  UCArray suffix, prefix, ucbody;
832 
833  toUCArray(body, ucbody);
834
835  for (unsigned long e = first; e < last; ++e) {
836
837    phrase = elist[e];
838    get_phrase_freq_data(textdata, phrase, word, tf, ef, df);
839
840    split_phrase(word, ucbody, prefix, suffix);
841   
842    if (XMLmode) {
843      // body is always the same as the text of the phrase, so no need to send it
844      textout << "<expansion num=\"" << e
845          << "\" id=\"" << phrase
846          << "\" tf=\"" << tf
847          << "\" df=\"" << df;
848      if (!prefix.empty()) {
849    textout << "\" prefix=\"" << prefix;
850      }
851      if (!suffix.empty()) {
852    textout << "\" suffix=\"" << suffix;
853      }
854      textout << "\"/>\n";
855    } else {
856      textout << outconvert << disp
857          << "<tr valign=top><td align=right><a href=\"_gwcgi_?"
858          << "c=" << collection << "&ppnum=" << phrase << "\">";
859      textout << prefix << "</a></td>";
860      textout <<outconvert << disp
861          << "<td align=center><a href=\"_gwcgi_?"
862          << "c=" << collection << "&ppnum=" << phrase << "\">"
863          << body << "</a></td>"
864          << "<td align=left><a href=\"_gwcgi_?"
865          << "c=" << collection << "&ppnum=" << phrase << "\">";
866      textout << suffix << "</a></td>"
867          << "<td>" << tf << "</td><td>" << df << "</td></tr>\n";
868    }
869  }
870}
871
872// split an expansion into prefix and suffix
873void phindaction::split_phrase(const UCArray &word, const UCArray &body,
874                   UCArray &prefix, UCArray &suffix) {
875
876  prefix.clear();
877  suffix.clear();
878
879  bool readingPrefix = true;
880  UCArray::const_iterator here = word.begin();
881  UCArray::const_iterator end = word.end();
882 
883  while (here != end) {
884
885    // if we've not read all the prefix, add the next char to the prefix
886    if (readingPrefix) {
887      if (phrase_match(body, here, end)) {
888    readingPrefix = false;
889    // trim whitespace from end of prefix & start of suffix
890    if (!prefix.empty()) {
891      prefix.pop_back();
892    }
893    if ((here != end) && (*here == ' ')) {
894      ++here;
895    }
896      } else {
897    prefix.push_back(*here);
898    ++here;
899      }
900    }
901    // if we've finished with the prefix, update the suffix
902    else {
903      suffix.push_back(*here);
904      ++here;
905    }
906  }
907}
908
909// phrase_match
910//
911// compare two strings, one represented as an UCArray, the other as two
912// UCArray iterators.
913//
914// Return true if the UCArray is the same as the phrase the iterators point
915// to for the length of the UCArray.
916bool phindaction::phrase_match(const UCArray &text, UCArray::const_iterator &here,
917                   UCArray::const_iterator end) {
918
919  UCArray::const_iterator one_here = text.begin();
920  UCArray::const_iterator one_end  = text.end();
921  UCArray::const_iterator two_here = here;
922
923  // iterate over the length of the first string, comparing each element to
924  // the corresponding element in the second string.
925  while (one_here != one_end) {
926   
927      if (two_here == end) {
928      return false;
929      } else if (*one_here != *two_here) {
930      return false;
931      }
932      ++one_here;
933      ++two_here;
934  }
935
936  here = two_here;
937  return true;
938}
939
940bool phindaction::print_documents(bool XMLmode, const text_t &basepath,
941                  const text_t &collection,
942                  const vector <unsigned long> &docNums,
943                  const vector <unsigned long> &docFreq,
944                  unsigned long first, unsigned long last,
945                  displayclass &disp, outconvertclass &outconvert,
946                  ostream &textout) {
947 
948  // Create a TextData object to read the document data
949  TextData docdata;
950
951  text_t fullpath = filename_cat(basepath, "docs");
952  char *fullpathc = fullpath.getcstr();
953#if defined __WIN32__
954  char *base = "";
955#else
956  char *base = "/";
957#endif
958
959  if (!docdata.LoadData (base, fullpathc)) {
960    //    FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
961    //exit(0);
962    /** DLConsulting 12-07-2004 */
963    return false;
964  }
965
966  delete []fullpathc;
967
968  UCArray title, hash;
969  unsigned long freq, doc;
970
971  for (unsigned long d = first; d < last; ++d) {
972    doc = docNums[d];
973    freq = docFreq[d];
974
975    /** DLConsulting 13-07-2004 */
976    if(!get_document_all_data(docdata, doc, title, hash)) {
977      return false;
978    }
979
980    if (XMLmode) {
981      textout << "<document num=\"" << d
982          << "\" hash=\"" << hash
983          << "\" freq=\"" << freq
984          << "\" title=\"" << title << "\"/>\n";
985    } else {
986      textout << outconvert << disp
987          << "<tr valign=top><td><a href=\"_gwcgi_?"
988          << "c=" << collection;
989      textout << "&a=d&d=" << hash << "\">" << title << "</a>"
990          << "</td><td>" << freq << "</td></tr>\n";
991    }
992  }
993
994  docdata.UnloadData();
995
996  /** DLConsulting 12-07-2004 */
997  return true;
998}
999
1000// Get all the data about a docment
1001//
1002// The document's details are stored in docData as record docNum.
1003// We retrieve:
1004//   title - the document's title
1005//   hash - the document's unique OID
1006/** Returns:
1007 *   false if a fatal error occured, true otherwise
1008 *  DLConsulting 12-07-2004
1009 */
1010bool phindaction::get_document_all_data(TextData &docdata, unsigned long docNum,
1011                    UCArray &title, UCArray &hash) {
1012
1013  UCArray text;
1014  UCArray docLevel;
1015  SetCStr(docLevel, "Document", 8);
1016
1017  // Look the word up in the textData
1018  if (!GetDocText (docdata, docLevel, docNum, text)) {
1019    //    FatalError (1, "Error while trying to get document %u", docNum);
1020    //exit(0);
1021    /** DLConsulting 13-07-2004 */
1022    return false;
1023  }
1024
1025  // Ignore everything up to the first colon
1026  UCArray::iterator next = text.begin();
1027  while (*next++ != '\t');
1028 
1029  // Get the document OID (hash)
1030  hash.clear();
1031  for (; *next != '\t'; ++next) {
1032    hash.push_back(*next);
1033  }
1034
1035  // Get the title
1036  text.push_back('\n');
1037  title.clear();
1038  for (++next; *next != '\n'; ++next) {
1039    title.push_back(*next);
1040  }
1041
1042  /** DLConsulting 13-07-2004 */
1043  return true;
1044}
1045
1046void phindaction::toUCArray(const text_t &in, UCArray &out) {
1047  out.clear();
1048  if (out.capacity() < in.size() + 1) {
1049    out.reserve(in.size() + 1);
1050  }
1051  text_t::const_iterator here = in.begin();
1052  text_t::const_iterator end = in.end();
1053  while (here != end) {
1054    out.push_back((unsigned char) *here);
1055    ++here;
1056  }
1057}
1058
1059void phindaction::output_error (const text_t &message, ostream &textout,
1060                outconvertclass &outconvert,
1061                displayclass & disp, ostream &logout,
1062                bool XMLmode) {
1063
1064  logout << outconvert << message << "\n";
1065  if (XMLmode) {
1066    textout << outconvert
1067        << "<phinddata>\n"
1068        << "<phinderror>" << message << "</phinderror>\n"
1069        << "</phinddata>\n";
1070  } else {
1071    textout << outconvert << disp
1072        << "_header_\n"
1073        << message
1074        << "_footer_\n";
1075  }
1076}
1077
1078#endif //GSDL_USE_PHIND_ACTION
1079
Note: See TracBrowser for help on using the browser.