root/main/trunk/greenstone2/runtime-src/src/recpt/phindaction.cpp @ 28899

Revision 28899, 30.8 KB (checked in by ak19, 5 years ago)

Third commit for security, for ensuring cgiargs macros are websafe. This time all the changes to the runtime action classes.

  • Property svn:keywords set to Author Date Id Revision
Line 
1/**********************************************************************
2 *
3 * phindaction.cpp --
4 *
5 * Copyright 2001 Gordon W. Paynter
6 * Copyright 2001 The New Zealand Digital Library Project
7 *
8 * A component of the Greenstone digital library software
9 * from the New Zealand Digital Library Project at the
10 * University of Waikato, New Zealand.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *
26 *********************************************************************/
27
28#include "gsdl_modules_cfg.h"
29#ifdef GSDL_USE_PHIND_ACTION
30
31// Note that this action uses mgpp to retrieve phind info, calling MGQuery
32// etc. directly, not through the protocol. This breaks our receptionist -
33// collection server separation and should be fixed some day I guess.
34
35#include "phindaction.h"
36#include "fileutil.h"
37#include "gsdlunicode.h"
38
39phindaction::phindaction () {
40
41  cgiarginfo arg_ainfo;
42
43  arg_ainfo.shortname = "pc";
44  arg_ainfo.longname = "phind classifier";
45  arg_ainfo.multiplechar = true;
46  arg_ainfo.multiplevalue = false;
47  arg_ainfo.defaultstatus = cgiarginfo::weak;
48  arg_ainfo.argdefault = g_EmptyText;
49  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
50  argsinfo.addarginfo (NULL, arg_ainfo);
51
52  arg_ainfo.shortname = "pxml";
53  arg_ainfo.longname = "phind XML mode";
54  arg_ainfo.multiplechar = false;
55  arg_ainfo.multiplevalue = false;
56  arg_ainfo.defaultstatus = cgiarginfo::weak;
57  arg_ainfo.argdefault = "0";
58  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
59  argsinfo.addarginfo (NULL, arg_ainfo);
60
61  arg_ainfo.shortname = "ppnum";
62  arg_ainfo.longname = "phind phrase number";
63  arg_ainfo.multiplechar = true;
64  arg_ainfo.multiplevalue = false;
65  arg_ainfo.defaultstatus = cgiarginfo::weak;
66  arg_ainfo.argdefault = "0";
67  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
68  argsinfo.addarginfo (NULL, arg_ainfo);
69
70  arg_ainfo.shortname = "pptext";
71  arg_ainfo.longname = "phind phrase text";
72  arg_ainfo.multiplechar = true;
73  arg_ainfo.multiplevalue = false;
74  arg_ainfo.defaultstatus = cgiarginfo::weak;
75  arg_ainfo.argdefault = g_EmptyText;
76  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
77  argsinfo.addarginfo (NULL, arg_ainfo);
78
79  arg_ainfo.shortname = "pfe";
80  arg_ainfo.longname = "phind first_e";
81  arg_ainfo.multiplechar = true;
82  arg_ainfo.multiplevalue = false;
83  arg_ainfo.defaultstatus = cgiarginfo::weak;
84  arg_ainfo.argdefault = "0";
85  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
86  argsinfo.addarginfo (NULL, arg_ainfo);
87
88  arg_ainfo.shortname = "ple";
89  arg_ainfo.longname = "phind last_e";
90  arg_ainfo.multiplechar = true;
91  arg_ainfo.multiplevalue = false;
92  arg_ainfo.defaultstatus = cgiarginfo::weak;
93  arg_ainfo.argdefault = "10";
94  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
95  argsinfo.addarginfo (NULL, arg_ainfo);
96
97  arg_ainfo.shortname = "pfl";
98  arg_ainfo.longname = "phind first_l";
99  arg_ainfo.multiplechar = true;
100  arg_ainfo.multiplevalue = false;
101  arg_ainfo.defaultstatus = cgiarginfo::weak;
102  arg_ainfo.argdefault = "0";
103  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
104  argsinfo.addarginfo (NULL, arg_ainfo);
105
106  arg_ainfo.shortname = "pll";
107  arg_ainfo.longname = "phind last_l";
108  arg_ainfo.multiplechar = true;
109  arg_ainfo.multiplevalue = false;
110  arg_ainfo.defaultstatus = cgiarginfo::weak;
111  arg_ainfo.argdefault = "10";
112  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
113  argsinfo.addarginfo (NULL, arg_ainfo);
114
115  arg_ainfo.shortname = "pfd";
116  arg_ainfo.longname = "phind first_d";
117  arg_ainfo.multiplechar = true;
118  arg_ainfo.multiplevalue = false;
119  arg_ainfo.defaultstatus = cgiarginfo::weak;
120  arg_ainfo.argdefault = "0";
121  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
122  argsinfo.addarginfo (NULL, arg_ainfo);
123
124  arg_ainfo.shortname = "pld";
125  arg_ainfo.longname = "phind last_d";
126  arg_ainfo.multiplechar = true;
127  arg_ainfo.multiplevalue = false;
128  arg_ainfo.defaultstatus = cgiarginfo::weak;
129  arg_ainfo.argdefault = "10";
130  arg_ainfo.savedarginfo = cgiarginfo::mustnot;
131  argsinfo.addarginfo (NULL, arg_ainfo);
132}
133
134phindaction::~phindaction () {
135}
136
137void phindaction::get_cgihead_info (cgiargsclass &args, recptprotolistclass * /*protos*/,
138                    response_t &response,text_t &response_data,
139                    ostream &/*logout*/) {
140  response = content;
141  if (args["pxml"] == "1") {
142    response_data = "text/xml";
143  } else {
144    response_data = "text/html";
145  }
146}
147
148bool phindaction::do_action (cgiargsclass &args, recptprotolistclass *protos,
149                 browsermapclass * /*browsers*/, displayclass &disp,
150                 outconvertclass &outconvert, ostream &textout,
151                 ostream &logout) {
152
153  unsigned long count_l, count_e, count_d;
154  unsigned long phrase = args["ppnum"].getulong(); // needn't encodeFor<web> on vars which have getulong() applied
155  text_t &word = args["pptext"];
156  unsigned long first_e = args["pfe"].getulong();
157  unsigned long last_e = args["ple"].getulong();
158  unsigned long first_l = args["pfl"].getulong();
159  unsigned long last_l = args["pll"].getulong();
160  unsigned long first_d = args["pfd"].getulong();
161  unsigned long last_d = args["pld"].getulong();
162  bool XMLmode = false;
163  if (args["pxml"] == "1") XMLmode = true;
164
165  // must have a valid collection server
166  recptproto *collectproto = protos->getrecptproto (args["c"], logout);
167  if (collectproto == NULL) {
168    output_error("phindaction: ERROR: collection not set", textout,
169         outconvert, disp, logout, XMLmode);
170    return true;
171  }
172
173  // the frequency and occurances of the phrase
174  unsigned long tf;
175  vector <unsigned long> el, linkdest, docNums, docfreq;
176  vector <UCArray> linktype;
177
178  // the number of occurances to display
179  unsigned long ef, lf, df;
180 
181  text_t basepath = filename_cat(collecthome, args["c"],
182                 "index", "phind" + args["pc"]);
183
184  // If we don't know the phrase number, look it up
185  if (phrase == 0) {
186   
187    if (word.empty()) {
188      output_error("phindaction: ERROR: no phrase number or word", textout,
189           outconvert, disp, logout, XMLmode);
190      return true;
191    }
192
193    DocNumArray result;
194    /** In order to prevent browser crashing problems, any method which
195     *  previously suffered a silent fatal error, now instead returns false
196     *  to indicate a fatal error has occured. We can then dispatch an
197     *  appropriate error tag to the Phind applet (rather than leave it
198     *  whiling away the milliseconds until the end of existence - or at
199     *  least your browser - in an infinite loop!)
200     *  DLConsulting 12-07-2004
201     */
202   
203    if(!find_phrase_number_from_word(basepath, word, result)) {
204    output_error("phindaction: Fatal Error! Couldn't load index information in find_phrase_number_from_word()",
205             textout, outconvert, disp, logout, XMLmode);
206    return true;
207    }
208   
209    if (result.empty()) {
210      output_error("phindaction: The search term ("+encodeForHTML(word)+") does not occur in the collection",
211           textout, outconvert, disp, logout, XMLmode);
212      return true;
213    } else {
214      phrase = result[0];
215    }
216  }
217
218  // Create a TextData object to read the phrase data (pdata)
219  TextData textdata;
220
221  text_t fullpath = filename_cat(basepath, "pdata");
222  char *fullpathc = fullpath.getcstr();
223#if defined __WIN32__
224  char *base = "";
225#else
226  char *base = "/";
227#endif
228
229  if (!textdata.LoadData (base, fullpathc)) {
230    //    FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
231    //exit(0);
232    /** We must return something to the client, whether this error is fatal or
233     *  no, otherwise we risk sending their browser into an infinite loop!
234     *  DLConsulting 12-07-2004
235     */
236    output_error("phindaction: Fatal Error! Couldn't load text information for collection",
237         textout, outconvert, disp, logout, XMLmode);
238    return true;
239  }
240
241  delete []fullpathc;
242
243  /** Another previously silent method can now cry out.
244   *  DLConsulting 12-07-2004
245   */
246  if(!get_phrase_all_data(textdata, phrase, word, tf, ef, lf, df, el,
247              linkdest, linktype, docNums, docfreq)) {
248    output_error(
249      "phindaction: Fatal Error! Couldn't parse phrase in get_phrase_all_data()",
250      textout, outconvert, disp, logout, XMLmode);
251      return true;   
252  }
253
254  // Output the header
255  if (XMLmode) {
256    textout << "<phinddata id=\"" << phrase
257        << "\" text=\"" << encodeForHTMLAttr(word)
258        << "\" tf=\"" << tf
259        << "\" ef=\"" << ef
260        << "\" df=\"" << df
261        << "\" lf=\"" << lf
262        << "\">\n";
263  } else {
264    textout << "<html><head><title>" << encodeForHTML(word) << "</title></head>\n"
265        << "<body><center>\n"
266        << "<p><h1>" << encodeForHTML(word) << "</h1>\n"
267        << "<p><b>"<< encodeForHTML(word) << "</b> occurs "
268        << tf << " times in " << df << " documents\n";
269  }
270
271  // Output the thesaurus links
272  if ((lf > 0) && (first_l < last_l)) {
273
274    // figure out the number of phrases to output
275    if (last_l > lf) {
276      last_l = lf;
277    }
278    count_l = last_l - first_l;
279   
280    if (XMLmode) {
281      textout << "<thesauruslist length=\"" << lf
282          << "\" start=\"" << first_l
283          << "\" end=\"" << last_l << "\">\n";
284      /** DLConsulting 12-07-2004 */
285      if(!print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
286                   first_l, last_l, disp, outconvert, textout)) {
287    output_error(
288             "phindaction: Fatal Error! Couldn't get phrase in get_phrase_freq_data()",
289             textout, outconvert, disp, logout, XMLmode);
290    return true;   
291      }
292      textout << "</thesauruslist>\n";
293    }
294
295    // output links as HTML
296    else {
297      if (count_l == lf) {
298    textout << "<p><b> " << count_l << " thesaurus links</b>\n";
299      } else {
300    textout << "<p><b>" << count_l << " of " << lf << " thesaurus links</b>\n";
301      }
302
303      textout << "<p><table border=1><tr><th>type</th><th>topic</th><th>freq</th><th>docs</th></tr>\n";
304      /** DLConsulting 12-07-2004 */
305      if(!print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
306                   first_l, last_l, disp, outconvert, textout)) {
307    output_error(
308             "phindaction: Fatal Error! Couldn't get phrase in get_phrase_freq_data()",
309             textout, outconvert, disp, logout, XMLmode);
310    return true;
311      }
312      textout << "</table>\n";
313
314      if (last_l < lf) {
315    if ((last_l + 10) < lf) {
316      textout << outconvert << disp
317          << "<br><a href=\"_gwcgi_?"
318          << "c=" << encodeForURL(args["c"])
319          << "&ppnum=" << phrase
320          << "&pfe=" << first_e
321          << "&ple=" << last_e
322          << "&pfd=" << first_d
323          << "&pld=" << last_d
324          << "&pfl=" << first_l
325          << "&pll=" << (last_l + 10)
326          << "\">Get more thesaurus links</a>\n";
327    }
328    textout << outconvert << disp
329        << "<br><a href=\"_gwcgi_?"
330        << "c=" << encodeForURL(args["c"])
331        << "&ppnum=" << phrase
332        << "&pfe=" << first_e
333        << "&ple=" << last_e
334        << "&pfd=" << first_d
335        << "&pld=" << last_d
336        << "&pfl=" << first_l
337        << "&pll=" << lf
338        << "\">Get every thesaurus link</a>\n" ;
339      }
340    }
341  }
342 
343  // Output the expansions
344  if ((ef > 0) && (first_e < last_e)) {
345
346    // figure out the number of phrases to output
347    if (last_e > el.size()) {
348      last_e = el.size();
349    }
350    count_e = last_e - first_e;
351
352    // output expansions as XML
353    if (XMLmode) {
354      textout << "<expansionlist length=\"" << ef
355          << "\" start=\"" << first_e
356          << "\" end=\"" << last_e << "\">" << endl;
357
358      print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
359               last_e, disp, outconvert, textout);
360
361      textout << "</expansionlist>\n";
362    }
363
364    // output expansions as HTML
365    else {
366      if (count_e == el.size()) {
367    textout << "<p><b> " << count_e << " expansions</b>\n";
368      } else {
369    textout << "<p><b>" << count_e << " of " << ef << " expansions</b>\n";
370      }
371
372      textout << "<p><table border=1><tr><th colspan=3>phrase</th><th>freq</th><th>docs</th></tr>\n";
373      print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
374               last_e, disp, outconvert, textout);
375      textout << "</table>\n";
376
377      if (last_e < ef) {
378    if ((last_e + 10) < ef) {
379      textout << outconvert << disp
380          << "<br><a href=\"_gwcgi_?"
381          << "c=" << encodeForURL(args["c"])
382          << "&ppnum=" << phrase
383          << "&pfe=" << first_e
384          << "&ple=" << (last_e + 10)
385          << "&pfd=" << first_d
386          << "&pld=" << last_d
387          << "&pfl=" << first_l
388          << "&pll=" << last_l
389          << "\">Get more expansions</a>\n";
390    }
391    textout << outconvert << disp
392        << "<br><a href=\"_gwcgi_?"
393        << "c=" << encodeForURL(args["c"])
394        << "&ppnum=" << phrase
395        << "&pfe=" << first_e
396        << "&ple=" << ef
397        << "&pfd=" << first_d
398        << "&pld=" << last_d
399        << "&pfl=" << first_l
400        << "&pll=" << last_l
401        << "\">Get every expansion</a>\n";
402      }
403    }
404  }
405
406  // Output the document occurances
407  if ((df > 0) && (first_d < last_d)) {
408
409    // figure out the phrases to output
410    if (last_d > docNums.size()) {
411      last_d = docNums.size();
412    }
413    count_d = last_d - first_d;
414
415    // output document list as XML
416    if (XMLmode) {
417      textout << "<documentlist length=\"" << df
418          << "\" start=\"" << first_d
419          << "\" end=\"" << last_d << "\">\n";
420     
421      if(!print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
422              first_d, last_d, disp, outconvert, textout)) {
423    output_error(
424             "phindaction: Fatal Error! Couldn't load text information in print_documents() or get_document_all_data()",
425             textout, outconvert, disp, logout, XMLmode);
426    return true;   
427      }
428
429      textout << "</documentlist>\n";
430    }
431
432    // output document list as HTML
433    else {
434     
435      if (count_d == docNums.size()) {
436    textout << "<p><b> " << count_d << " documents</b>\n";
437      } else {
438    textout << "<p><b>" << count_d << " of " << df << " documents</b>\n";
439      }
440
441      textout << "<p><table border=1><tr><th align=left>document</th><th>freq</th></tr>\n";
442      if(!print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
443              first_d, last_d, disp, outconvert, textout)) {
444    output_error(
445             "phindaction: Fatal Error! Couldn't load text information in print_documents()",
446             textout, outconvert, disp, logout, XMLmode);
447    return true;
448      }
449      textout << "</table>\n";
450     
451      if (last_d < df) {
452    if ((last_d + 10) < df) {
453      textout << outconvert << disp
454          << "<br><a href=\"_gwcgi_?"
455          << "c=" << encodeForURL(args["c"])
456          << "&ppnum=" << phrase
457          << "&pfe=" << first_e
458          << "&ple=" << last_e
459          << "&pfd=" << first_d
460          << "&pld=" << (last_d + 10) 
461          << "&pfl=" << first_l
462          << "&pll=" << last_l
463          << "\">Get more documents</a>\n";
464    }
465    textout << outconvert << disp
466        << "<br><a href=\"_gwcgi_?"
467        << "c=" << encodeForURL(args["c"])
468        << "&ppnum=" << phrase
469        << "&pfe=" << first_e
470        << "&ple=" << last_e
471        << "&pfd=" << first_d
472        << "&pld=" << df
473        << "&pfl=" << first_l
474        << "&pll=" << last_l
475        << "\">Get every document</a>\n";
476      }
477    }
478  }
479
480  // Close the document
481  if (XMLmode) {
482    textout << "</phinddata>\n";
483  } else {
484    textout << "</center></body></html>\n";
485  }
486
487  textdata.UnloadData ();
488
489  return true;
490}
491
492// Find the phrase number of a word in the index file
493bool phindaction::find_phrase_number_from_word(const text_t &basepath,
494                           const text_t &query,
495                           DocNumArray &result) {
496
497  // Open the index file for searching
498  IndexData indexData;
499
500  text_t fullpath = filename_cat(basepath, "pword");
501  char *fullpathc = fullpath.getcstr();
502#if defined __WIN32__
503  char *base = "";
504#else
505  char *base = "/";
506#endif
507
508  if (!indexData.LoadData (base, fullpathc)) {
509    //    FatalError (1, "Couldn't load index information for \"%s\"", fullpathc);
510    //exit(0);
511    /** Don't handle fatal errors here anymore.
512     *  DLConsulting 12-07-2004
513     */
514    return false; // Indicates something very bad has happened
515  }
516
517  delete []fullpathc;
518
519  // set up the query object
520  QueryInfo queryInfo;
521  SetCStr (queryInfo.docLevel, "Document", 8);
522  queryInfo.maxDocs = 5;
523  queryInfo.sortByRank = true;
524  queryInfo.exactWeights = false;
525  queryInfo.needRankInfo = true;
526  queryInfo.needTermFreqs = true;
527 
528  // mode 1 = casefolded, unstemmed search
529  UCArray ucquery;
530  // greenstone gives us the query encoded in unicode. We want utf8.
531  char* utf8querystring=to_utf8(query).getcstr();
532  SetCStr(ucquery, utf8querystring);
533  delete []utf8querystring;
534
535  //toUCArray(query, ucquery);
536  QueryNode *queryTree = ParseQuery(ucquery, 1, 1, 4);
537     
538  // perform the query
539  ExtQueryResult queryResult;
540  MGQuery (indexData, queryInfo, queryTree, queryResult);
541  // cout << "-- word lookup result -- " << endl << queryResult << endl ;
542
543  result.clear();
544  result = queryResult.docs;
545
546  // delete the query
547  if (queryTree != NULL) delete queryTree;
548
549  indexData.UnloadData();
550
551  /** This method now returns a boolean, so...
552   *  DLConsulting 12-07-2004
553   */
554  return true; // Indicates that what happened is all good, baby.
555}
556
557// Get all the data about a phrase
558//
559// The phrase is stored in textData as record phrase.
560// We retrieve:
561//   word - the text of the phrase
562//   tf - the total frequency of the phrase
563//   ef - the expansion frequency of the phrase
564//   lf - the thesaurus link frequency of the phrase
565//   df - the document frequency of the phrase
566//   el - the list of phrases that are expansions of phrase
567//   ll - the list of phrases that are thesaurus links
568//   dl - the list of documents that contain phrase
569bool phindaction::get_phrase_all_data(TextData &textdata, unsigned long phrase,
570                      text_t &word, unsigned long &tf, unsigned long &ef,
571                      unsigned long &lf, unsigned long &df,
572                      vector <unsigned long> &el,
573                      vector <unsigned long> &linkdest,
574                      vector <UCArray> &linktype,
575                      vector <unsigned long> &docnum,
576                      vector <unsigned long> &docfrq) {
577  UCArray text;
578  UCArray docLevel;
579  SetCStr(docLevel, "Document", 8);
580
581  // Look the word up in the textData
582  if (!GetDocText (textdata, docLevel, phrase, text)) {
583    //    FatalError (1, "Error while trying to get phrase %u", phrase);
584    //exit(0);
585    return false; // Something very bad has happened.
586  }
587
588  // Ignore everything up to the first colon
589  UCArray::iterator next = text.begin();
590  while (*next++ != ':');
591
592  // ignore training carriage returns
593  while (text.back() == '\n') {
594    text.pop_back();
595  }
596 
597  // Get the word
598  word.clear();
599  for (; *next != ':'; ++next) {
600    word.push_back(*next);
601  }
602 
603  // Get total frequency
604  tf = 0;
605  for (++next; *next != ':'; ++next) {
606    tf *= 10;
607    tf += (*next - '0');
608  }
609 
610  // Get expansion frequency
611  ef = 0;
612  for (++next; *next != ':'; ++next) {
613    ef *= 10;
614    ef += (*next - '0');
615  }
616 
617  // Get document frequency
618  df = 0;
619  for (++next; *next != ':'; ++next) {
620    df *= 10;
621    df += (*next - '0');
622  }
623 
624  // Get expansion list
625  el.clear();
626  unsigned long e = 0;
627  for (++next; *next != ':'; ++next) {
628    if (*next == ',') {
629      el.push_back(e);
630      e = 0;
631    } else {
632      e *= 10;
633      e += (*next - '0');
634    }
635  }
636
637  // Get document list & the document frequency list
638  docnum.clear();
639  docfrq.clear();
640  bool readnum = false;
641  unsigned long d = 0;
642  for (++next; *next != ':'; ++next) {
643    if (*next == ',') {
644      docnum.push_back(d);
645      readnum = true;
646      d = 0;
647    } else if (*next == ';') {
648      if (readnum) {
649    docfrq.push_back(d);
650      } else {
651    docnum.push_back(d);
652    docfrq.push_back(1);
653      }
654      readnum = false;
655      d = 0;
656    } else {
657      d *= 10;
658      d += (*next - '0');
659    }
660  }
661
662  // Get thesaurus link frequency & link list
663  text.push_back(':');
664  text.push_back(':');
665
666  // link frequency
667  lf = 0;
668  for (++next; *next != ':'; ++next) {
669    lf *= 10;
670    lf += (*next - '0');
671  }
672
673  // two lists of link data
674  linkdest.clear();
675  linktype.clear();
676 
677  UCArray thistype;
678  thistype.clear();
679  bool typedone = false;
680  unsigned long l = 0;
681  for (++next; *next != ':'; ++next) {
682   
683    if (!typedone) {
684      // first read the link type, a charactor string
685      if (*next == ',') {
686    typedone = true;
687      } else {
688    thistype.push_back(*next);
689      }
690    } else {
691      // having read the link type, read the list of link destinations
692      if (*next == ',') {
693    linkdest.push_back(l);
694    linktype.push_back(thistype);
695    l = 0;
696      } else if (*next == ';') {
697    linkdest.push_back(l);
698    linktype.push_back(thistype);
699    l = 0;
700    thistype.clear();
701    typedone = false;
702      } else {
703    l *= 10;
704    l += (*next - '0');
705      }
706    }
707  }
708
709  return true; // Indicates that what happened is all good, baby.
710}
711
712bool phindaction::print_thesaurus_links(const text_t &collection, bool XMLmode,
713                    TextData &textdata, vector <unsigned long> &linkdest,
714                    vector <UCArray> &linktype, unsigned long first,
715                    unsigned long last, displayclass &disp,
716                    outconvertclass &outconvert, ostream &textout) {
717
718  // information describing each link in the list
719  unsigned long phrase, tf, ef, df;
720  UCArray type, text;
721 
722  for (unsigned long l = first; l < last; ++l) {
723
724    // get the phrase data
725    phrase = linkdest[l];
726    type = linktype[l];
727
728    /** DLConsulting 12-07-2004 */
729    if(!get_phrase_freq_data(textdata, phrase, text, tf, ef, df)) {
730      return false;
731    }
732   
733    if (XMLmode) {
734      textout << "<thesaurus num=\"" << l
735          << "\" id=\"" << phrase
736          << "\" tf=\"" << tf
737          << "\" df=\"" << df
738          << "\" type=\"" << type
739          << "\" text=\"" << text
740          << "\"/>\n";
741    } else {
742      textout << "<tr valign=top><td>" << type << "</td><td>";
743      textout << outconvert << disp
744          << "<a href=\"_gwcgi_?c=" << encodeForURL(collection);
745      textout << "&ppnum=" << phrase << "\">" << text << "</a>"
746          << "</td><td>" << tf << "</td><td>" << df << "</td></tr>\n";
747    }
748  }
749
750  /** DLConsulting 12-07-2004 */
751  return true;
752}
753
754// Get the frequency data about a phrase
755//
756// The phrase is stored in textData as record phrase.
757// We retrieve:
758//   word - the text of the phrase
759//   tf - the total frequency of the phrase
760//   ef - the expansion frequency of the phrase
761//   df - the document frequency of the phrase
762/**
763 *   Returns:
764 *     false if the method suffered a fatal error, true otherwise
765 */
766bool phindaction::get_phrase_freq_data(TextData &textdata, unsigned long phrase,
767                       UCArray &word, unsigned long &tf,
768                       unsigned long &ef, unsigned long &df) {
769 
770  UCArray text;
771  UCArray docLevel;
772  SetCStr(docLevel, "Document", 8);
773
774  // Look the word up in the textData
775  if (!GetDocText (textdata, docLevel, phrase, text)) {
776    //    FatalError (1, "Error while trying to get phrase %u", phrase);
777    //exit(0);
778    /** DLConsulting 12-07-2004 */
779    return false;
780  }
781
782  // Ignore everything up to the first colon
783  UCArray::iterator next = text.begin();
784  while (*next++ != ':');
785 
786  // Get the word
787  word.clear();
788  for (; *next != ':'; ++next) {
789    word.push_back(*next);
790  }
791 
792  // Get total frequency
793  tf = 0;
794  for (++next; *next != ':'; ++next) {
795    tf *= 10;
796    tf += (*next - '0');
797  }
798 
799  // Get expansion frequency
800  ef = 0;
801  for (++next; *next != ':'; ++next) {
802    ef *= 10;
803    ef += (*next - '0');
804  }
805 
806  // Get document frequency
807  df = 0;
808  for (++next; *next != ':'; ++next) {
809    df *= 10;
810    df += (*next - '0');
811  }
812
813  /** DLConsulting 12-07-2004 */
814  return true;
815}
816
817// Print a list of expansions
818//
819// Given the textData and a list of phrase numbers, print out each of the
820// expansions.
821void phindaction::print_expansions(const text_t &collection, bool XMLmode,
822                   const text_t &body, TextData &textdata,
823                   const vector <unsigned long> &elist,
824                   unsigned long first, unsigned long last,
825                   displayclass &disp, outconvertclass &outconvert,
826                   ostream &textout) {
827 
828  UCArray word;
829  unsigned long phrase, tf, df, ef;
830
831  UCArray suffix, prefix, ucbody;
832 
833  toUCArray(body, ucbody);
834
835  for (unsigned long e = first; e < last; ++e) {
836
837    phrase = elist[e];
838    get_phrase_freq_data(textdata, phrase, word, tf, ef, df);
839
840    split_phrase(word, ucbody, prefix, suffix);
841   
842    if (XMLmode) {
843      // body is always the same as the text of the phrase, so no need to send it
844      textout << "<expansion num=\"" << e
845          << "\" id=\"" << phrase
846          << "\" tf=\"" << tf
847          << "\" df=\"" << df;
848      if (!prefix.empty()) {
849    text_t prefix_txt;
850    fromUCArray(prefix, prefix_txt);
851    textout << "\" prefix=\"" << encodeForHTMLAttr(prefix_txt);
852      }
853      if (!suffix.empty()) {
854    text_t suffix_txt;
855    fromUCArray(suffix, suffix_txt);
856    textout << "\" suffix=\"" << encodeForHTMLAttr(suffix_txt);
857      }
858      textout << "\"/>\n";
859    } else {
860      textout << outconvert << disp
861          << "<tr valign=top><td align=right><a href=\"_gwcgi_?"
862          << "c=" << encodeForURL(collection) << "&ppnum=" << phrase << "\">";
863      textout << prefix << "</a></td>";
864      textout <<outconvert << disp
865          << "<td align=center><a href=\"_gwcgi_?"
866          << "c=" << encodeForURL(collection) << "&ppnum=" << phrase << "\">"
867          << encodeForHTML(body) << "</a></td>"
868          << "<td align=left><a href=\"_gwcgi_?"
869          << "c=" << encodeForURL(collection) << "&ppnum=" << phrase << "\">";
870      textout << suffix << "</a></td>"
871          << "<td>" << tf << "</td><td>" << df << "</td></tr>\n";
872    }
873  }
874}
875
876// split an expansion into prefix and suffix
877void phindaction::split_phrase(const UCArray &word, const UCArray &body,
878                   UCArray &prefix, UCArray &suffix) {
879
880  prefix.clear();
881  suffix.clear();
882
883  bool readingPrefix = true;
884  UCArray::const_iterator here = word.begin();
885  UCArray::const_iterator end = word.end();
886 
887  while (here != end) {
888
889    // if we've not read all the prefix, add the next char to the prefix
890    if (readingPrefix) {
891      if (phrase_match(body, here, end)) {
892    readingPrefix = false;
893    // trim whitespace from end of prefix & start of suffix
894    if (!prefix.empty()) {
895      prefix.pop_back();
896    }
897    if ((here != end) && (*here == ' ')) {
898      ++here;
899    }
900      } else {
901    prefix.push_back(*here);
902    ++here;
903      }
904    }
905    // if we've finished with the prefix, update the suffix
906    else {
907      suffix.push_back(*here);
908      ++here;
909    }
910  }
911}
912
913// phrase_match
914//
915// compare two strings, one represented as an UCArray, the other as two
916// UCArray iterators.
917//
918// Return true if the UCArray is the same as the phrase the iterators point
919// to for the length of the UCArray.
920bool phindaction::phrase_match(const UCArray &text, UCArray::const_iterator &here,
921                   UCArray::const_iterator end) {
922
923  UCArray::const_iterator one_here = text.begin();
924  UCArray::const_iterator one_end  = text.end();
925  UCArray::const_iterator two_here = here;
926
927  // iterate over the length of the first string, comparing each element to
928  // the corresponding element in the second string.
929  while (one_here != one_end) {
930   
931      if (two_here == end) {
932      return false;
933      } else if (*one_here != *two_here) {
934      return false;
935      }
936      ++one_here;
937      ++two_here;
938  }
939
940  here = two_here;
941  return true;
942}
943
944bool phindaction::print_documents(bool XMLmode, const text_t &basepath,
945                  const text_t &collection,
946                  const vector <unsigned long> &docNums,
947                  const vector <unsigned long> &docFreq,
948                  unsigned long first, unsigned long last,
949                  displayclass &disp, outconvertclass &outconvert,
950                  ostream &textout) {
951 
952  // Create a TextData object to read the document data
953  TextData docdata;
954
955  text_t fullpath = filename_cat(basepath, "docs");
956  char *fullpathc = fullpath.getcstr();
957#if defined __WIN32__
958  char *base = "";
959#else
960  char *base = "/";
961#endif
962
963  if (!docdata.LoadData (base, fullpathc)) {
964    //    FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
965    //exit(0);
966    /** DLConsulting 12-07-2004 */
967    return false;
968  }
969
970  delete []fullpathc;
971
972  UCArray title, hash;
973  unsigned long freq, doc;
974
975  for (unsigned long d = first; d < last; ++d) {
976    doc = docNums[d];
977    freq = docFreq[d];
978
979    /** DLConsulting 13-07-2004 */
980    if(!get_document_all_data(docdata, doc, title, hash)) {
981      return false;
982    }
983
984    if (XMLmode) {
985      textout << "<document num=\"" << d
986          << "\" hash=\"" << hash
987          << "\" freq=\"" << freq
988          << "\" title=\"" << title << "\"/>\n";
989    } else {
990      textout << outconvert << disp
991          << "<tr valign=top><td><a href=\"_gwcgi_?"
992          << "c=" << encodeForURL(collection);
993      textout << "&a=d&d=" << hash << "\">" << title << "</a>"
994          << "</td><td>" << freq << "</td></tr>\n";
995    }
996  }
997
998  docdata.UnloadData();
999
1000  /** DLConsulting 12-07-2004 */
1001  return true;
1002}
1003
1004// Get all the data about a docment
1005//
1006// The document's details are stored in docData as record docNum.
1007// We retrieve:
1008//   title - the document's title
1009//   hash - the document's unique OID
1010/** Returns:
1011 *   false if a fatal error occured, true otherwise
1012 *  DLConsulting 12-07-2004
1013 */
1014bool phindaction::get_document_all_data(TextData &docdata, unsigned long docNum,
1015                    UCArray &title, UCArray &hash) {
1016
1017  UCArray text;
1018  UCArray docLevel;
1019  SetCStr(docLevel, "Document", 8);
1020
1021  // Look the word up in the textData
1022  if (!GetDocText (docdata, docLevel, docNum, text)) {
1023    //    FatalError (1, "Error while trying to get document %u", docNum);
1024    //exit(0);
1025    /** DLConsulting 13-07-2004 */
1026    return false;
1027  }
1028
1029  // Ignore everything up to the first colon
1030  UCArray::iterator next = text.begin();
1031  while (*next++ != '\t');
1032 
1033  // Get the document OID (hash)
1034  hash.clear();
1035  for (; *next != '\t'; ++next) {
1036    hash.push_back(*next);
1037  }
1038
1039  // Get the title
1040  text.push_back('\n');
1041  title.clear();
1042  for (++next; *next != '\n'; ++next) {
1043    title.push_back(*next);
1044  }
1045
1046  /** DLConsulting 13-07-2004 */
1047  return true;
1048}
1049
1050void phindaction::toUCArray(const text_t &in, UCArray &out) {
1051  out.clear();
1052  if (out.capacity() < in.size() + 1) {
1053    out.reserve(in.size() + 1);
1054  }
1055  text_t::const_iterator here = in.begin();
1056  text_t::const_iterator end = in.end();
1057  while (here != end) {
1058    out.push_back((unsigned char) *here);
1059    ++here;
1060  }
1061}
1062
1063void phindaction::fromUCArray(const UCArray &arrin, text_t &txtout) {
1064  txtout.clear();
1065  if (txtout.capacity() < arrin.size() + 1) {
1066    txtout.reserve(arrin.size() + 1);
1067  }
1068  vector<unsigned char>::const_iterator here = arrin.begin();
1069  vector<unsigned char>::const_iterator end = arrin.end();
1070  while (here != end) {
1071    txtout.push_back(*here); // don't need to cast unsigned char to unsigned short
1072    ++here;
1073  }
1074}
1075
1076
1077void phindaction::output_error (const text_t &message, ostream &textout,
1078                outconvertclass &outconvert,
1079                displayclass & disp, ostream &logout,
1080                bool XMLmode) {
1081
1082  logout << outconvert << message << "\n";
1083  if (XMLmode) {
1084    textout << outconvert
1085        << "<phinddata>\n"
1086        << "<phinderror>" << message << "</phinderror>\n"
1087        << "</phinddata>\n";
1088  } else {
1089    textout << outconvert << disp
1090        << "_header_\n"
1091        << message
1092        << "_footer_\n";
1093  }
1094}
1095
1096#endif //GSDL_USE_PHIND_ACTION
1097
Note: See TracBrowser for help on using the browser.