source: main/trunk/greenstone2/runtime-src/src/recpt/phindaction.cpp@ 28899

Last change on this file since 28899 was 28899, checked in by ak19, 7 years ago

Third commit for security, for ensuring cgiargs macros are websafe. This time all the changes to the runtime action classes.

  • Property svn:keywords set to Author Date Id Revision
File size: 30.8 KB
Line 
1/**********************************************************************
2 *
3 * phindaction.cpp --
4 *
5 * Copyright 2001 Gordon W. Paynter
6 * Copyright 2001 The New Zealand Digital Library Project
7 *
8 * A component of the Greenstone digital library software
9 * from the New Zealand Digital Library Project at the
10 * University of Waikato, New Zealand.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *
26 *********************************************************************/
27
28#include "gsdl_modules_cfg.h"
29#ifdef GSDL_USE_PHIND_ACTION
30
31// Note that this action uses mgpp to retrieve phind info, calling MGQuery
32// etc. directly, not through the protocol. This breaks our receptionist -
33// collection server separation and should be fixed some day I guess.
34
35#include "phindaction.h"
36#include "fileutil.h"
37#include "gsdlunicode.h"
38
39phindaction::phindaction () {
40
41 cgiarginfo arg_ainfo;
42
43 arg_ainfo.shortname = "pc";
44 arg_ainfo.longname = "phind classifier";
45 arg_ainfo.multiplechar = true;
46 arg_ainfo.multiplevalue = false;
47 arg_ainfo.defaultstatus = cgiarginfo::weak;
48 arg_ainfo.argdefault = g_EmptyText;
49 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
50 argsinfo.addarginfo (NULL, arg_ainfo);
51
52 arg_ainfo.shortname = "pxml";
53 arg_ainfo.longname = "phind XML mode";
54 arg_ainfo.multiplechar = false;
55 arg_ainfo.multiplevalue = false;
56 arg_ainfo.defaultstatus = cgiarginfo::weak;
57 arg_ainfo.argdefault = "0";
58 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
59 argsinfo.addarginfo (NULL, arg_ainfo);
60
61 arg_ainfo.shortname = "ppnum";
62 arg_ainfo.longname = "phind phrase number";
63 arg_ainfo.multiplechar = true;
64 arg_ainfo.multiplevalue = false;
65 arg_ainfo.defaultstatus = cgiarginfo::weak;
66 arg_ainfo.argdefault = "0";
67 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
68 argsinfo.addarginfo (NULL, arg_ainfo);
69
70 arg_ainfo.shortname = "pptext";
71 arg_ainfo.longname = "phind phrase text";
72 arg_ainfo.multiplechar = true;
73 arg_ainfo.multiplevalue = false;
74 arg_ainfo.defaultstatus = cgiarginfo::weak;
75 arg_ainfo.argdefault = g_EmptyText;
76 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
77 argsinfo.addarginfo (NULL, arg_ainfo);
78
79 arg_ainfo.shortname = "pfe";
80 arg_ainfo.longname = "phind first_e";
81 arg_ainfo.multiplechar = true;
82 arg_ainfo.multiplevalue = false;
83 arg_ainfo.defaultstatus = cgiarginfo::weak;
84 arg_ainfo.argdefault = "0";
85 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
86 argsinfo.addarginfo (NULL, arg_ainfo);
87
88 arg_ainfo.shortname = "ple";
89 arg_ainfo.longname = "phind last_e";
90 arg_ainfo.multiplechar = true;
91 arg_ainfo.multiplevalue = false;
92 arg_ainfo.defaultstatus = cgiarginfo::weak;
93 arg_ainfo.argdefault = "10";
94 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
95 argsinfo.addarginfo (NULL, arg_ainfo);
96
97 arg_ainfo.shortname = "pfl";
98 arg_ainfo.longname = "phind first_l";
99 arg_ainfo.multiplechar = true;
100 arg_ainfo.multiplevalue = false;
101 arg_ainfo.defaultstatus = cgiarginfo::weak;
102 arg_ainfo.argdefault = "0";
103 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
104 argsinfo.addarginfo (NULL, arg_ainfo);
105
106 arg_ainfo.shortname = "pll";
107 arg_ainfo.longname = "phind last_l";
108 arg_ainfo.multiplechar = true;
109 arg_ainfo.multiplevalue = false;
110 arg_ainfo.defaultstatus = cgiarginfo::weak;
111 arg_ainfo.argdefault = "10";
112 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
113 argsinfo.addarginfo (NULL, arg_ainfo);
114
115 arg_ainfo.shortname = "pfd";
116 arg_ainfo.longname = "phind first_d";
117 arg_ainfo.multiplechar = true;
118 arg_ainfo.multiplevalue = false;
119 arg_ainfo.defaultstatus = cgiarginfo::weak;
120 arg_ainfo.argdefault = "0";
121 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
122 argsinfo.addarginfo (NULL, arg_ainfo);
123
124 arg_ainfo.shortname = "pld";
125 arg_ainfo.longname = "phind last_d";
126 arg_ainfo.multiplechar = true;
127 arg_ainfo.multiplevalue = false;
128 arg_ainfo.defaultstatus = cgiarginfo::weak;
129 arg_ainfo.argdefault = "10";
130 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
131 argsinfo.addarginfo (NULL, arg_ainfo);
132}
133
134phindaction::~phindaction () {
135}
136
137void phindaction::get_cgihead_info (cgiargsclass &args, recptprotolistclass * /*protos*/,
138 response_t &response,text_t &response_data,
139 ostream &/*logout*/) {
140 response = content;
141 if (args["pxml"] == "1") {
142 response_data = "text/xml";
143 } else {
144 response_data = "text/html";
145 }
146}
147
148bool phindaction::do_action (cgiargsclass &args, recptprotolistclass *protos,
149 browsermapclass * /*browsers*/, displayclass &disp,
150 outconvertclass &outconvert, ostream &textout,
151 ostream &logout) {
152
153 unsigned long count_l, count_e, count_d;
154 unsigned long phrase = args["ppnum"].getulong(); // needn't encodeFor<web> on vars which have getulong() applied
155 text_t &word = args["pptext"];
156 unsigned long first_e = args["pfe"].getulong();
157 unsigned long last_e = args["ple"].getulong();
158 unsigned long first_l = args["pfl"].getulong();
159 unsigned long last_l = args["pll"].getulong();
160 unsigned long first_d = args["pfd"].getulong();
161 unsigned long last_d = args["pld"].getulong();
162 bool XMLmode = false;
163 if (args["pxml"] == "1") XMLmode = true;
164
165 // must have a valid collection server
166 recptproto *collectproto = protos->getrecptproto (args["c"], logout);
167 if (collectproto == NULL) {
168 output_error("phindaction: ERROR: collection not set", textout,
169 outconvert, disp, logout, XMLmode);
170 return true;
171 }
172
173 // the frequency and occurances of the phrase
174 unsigned long tf;
175 vector <unsigned long> el, linkdest, docNums, docfreq;
176 vector <UCArray> linktype;
177
178 // the number of occurances to display
179 unsigned long ef, lf, df;
180
181 text_t basepath = filename_cat(collecthome, args["c"],
182 "index", "phind" + args["pc"]);
183
184 // If we don't know the phrase number, look it up
185 if (phrase == 0) {
186
187 if (word.empty()) {
188 output_error("phindaction: ERROR: no phrase number or word", textout,
189 outconvert, disp, logout, XMLmode);
190 return true;
191 }
192
193 DocNumArray result;
194 /** In order to prevent browser crashing problems, any method which
195 * previously suffered a silent fatal error, now instead returns false
196 * to indicate a fatal error has occured. We can then dispatch an
197 * appropriate error tag to the Phind applet (rather than leave it
198 * whiling away the milliseconds until the end of existence - or at
199 * least your browser - in an infinite loop!)
200 * DLConsulting 12-07-2004
201 */
202
203 if(!find_phrase_number_from_word(basepath, word, result)) {
204 output_error("phindaction: Fatal Error! Couldn't load index information in find_phrase_number_from_word()",
205 textout, outconvert, disp, logout, XMLmode);
206 return true;
207 }
208
209 if (result.empty()) {
210 output_error("phindaction: The search term ("+encodeForHTML(word)+") does not occur in the collection",
211 textout, outconvert, disp, logout, XMLmode);
212 return true;
213 } else {
214 phrase = result[0];
215 }
216 }
217
218 // Create a TextData object to read the phrase data (pdata)
219 TextData textdata;
220
221 text_t fullpath = filename_cat(basepath, "pdata");
222 char *fullpathc = fullpath.getcstr();
223#if defined __WIN32__
224 char *base = "";
225#else
226 char *base = "/";
227#endif
228
229 if (!textdata.LoadData (base, fullpathc)) {
230 // FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
231 //exit(0);
232 /** We must return something to the client, whether this error is fatal or
233 * no, otherwise we risk sending their browser into an infinite loop!
234 * DLConsulting 12-07-2004
235 */
236 output_error("phindaction: Fatal Error! Couldn't load text information for collection",
237 textout, outconvert, disp, logout, XMLmode);
238 return true;
239 }
240
241 delete []fullpathc;
242
243 /** Another previously silent method can now cry out.
244 * DLConsulting 12-07-2004
245 */
246 if(!get_phrase_all_data(textdata, phrase, word, tf, ef, lf, df, el,
247 linkdest, linktype, docNums, docfreq)) {
248 output_error(
249 "phindaction: Fatal Error! Couldn't parse phrase in get_phrase_all_data()",
250 textout, outconvert, disp, logout, XMLmode);
251 return true;
252 }
253
254 // Output the header
255 if (XMLmode) {
256 textout << "<phinddata id=\"" << phrase
257 << "\" text=\"" << encodeForHTMLAttr(word)
258 << "\" tf=\"" << tf
259 << "\" ef=\"" << ef
260 << "\" df=\"" << df
261 << "\" lf=\"" << lf
262 << "\">\n";
263 } else {
264 textout << "<html><head><title>" << encodeForHTML(word) << "</title></head>\n"
265 << "<body><center>\n"
266 << "<p><h1>" << encodeForHTML(word) << "</h1>\n"
267 << "<p><b>"<< encodeForHTML(word) << "</b> occurs "
268 << tf << " times in " << df << " documents\n";
269 }
270
271 // Output the thesaurus links
272 if ((lf > 0) && (first_l < last_l)) {
273
274 // figure out the number of phrases to output
275 if (last_l > lf) {
276 last_l = lf;
277 }
278 count_l = last_l - first_l;
279
280 if (XMLmode) {
281 textout << "<thesauruslist length=\"" << lf
282 << "\" start=\"" << first_l
283 << "\" end=\"" << last_l << "\">\n";
284 /** DLConsulting 12-07-2004 */
285 if(!print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
286 first_l, last_l, disp, outconvert, textout)) {
287 output_error(
288 "phindaction: Fatal Error! Couldn't get phrase in get_phrase_freq_data()",
289 textout, outconvert, disp, logout, XMLmode);
290 return true;
291 }
292 textout << "</thesauruslist>\n";
293 }
294
295 // output links as HTML
296 else {
297 if (count_l == lf) {
298 textout << "<p><b> " << count_l << " thesaurus links</b>\n";
299 } else {
300 textout << "<p><b>" << count_l << " of " << lf << " thesaurus links</b>\n";
301 }
302
303 textout << "<p><table border=1><tr><th>type</th><th>topic</th><th>freq</th><th>docs</th></tr>\n";
304 /** DLConsulting 12-07-2004 */
305 if(!print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
306 first_l, last_l, disp, outconvert, textout)) {
307 output_error(
308 "phindaction: Fatal Error! Couldn't get phrase in get_phrase_freq_data()",
309 textout, outconvert, disp, logout, XMLmode);
310 return true;
311 }
312 textout << "</table>\n";
313
314 if (last_l < lf) {
315 if ((last_l + 10) < lf) {
316 textout << outconvert << disp
317 << "<br><a href=\"_gwcgi_?"
318 << "c=" << encodeForURL(args["c"])
319 << "&ppnum=" << phrase
320 << "&pfe=" << first_e
321 << "&ple=" << last_e
322 << "&pfd=" << first_d
323 << "&pld=" << last_d
324 << "&pfl=" << first_l
325 << "&pll=" << (last_l + 10)
326 << "\">Get more thesaurus links</a>\n";
327 }
328 textout << outconvert << disp
329 << "<br><a href=\"_gwcgi_?"
330 << "c=" << encodeForURL(args["c"])
331 << "&ppnum=" << phrase
332 << "&pfe=" << first_e
333 << "&ple=" << last_e
334 << "&pfd=" << first_d
335 << "&pld=" << last_d
336 << "&pfl=" << first_l
337 << "&pll=" << lf
338 << "\">Get every thesaurus link</a>\n" ;
339 }
340 }
341 }
342
343 // Output the expansions
344 if ((ef > 0) && (first_e < last_e)) {
345
346 // figure out the number of phrases to output
347 if (last_e > el.size()) {
348 last_e = el.size();
349 }
350 count_e = last_e - first_e;
351
352 // output expansions as XML
353 if (XMLmode) {
354 textout << "<expansionlist length=\"" << ef
355 << "\" start=\"" << first_e
356 << "\" end=\"" << last_e << "\">" << endl;
357
358 print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
359 last_e, disp, outconvert, textout);
360
361 textout << "</expansionlist>\n";
362 }
363
364 // output expansions as HTML
365 else {
366 if (count_e == el.size()) {
367 textout << "<p><b> " << count_e << " expansions</b>\n";
368 } else {
369 textout << "<p><b>" << count_e << " of " << ef << " expansions</b>\n";
370 }
371
372 textout << "<p><table border=1><tr><th colspan=3>phrase</th><th>freq</th><th>docs</th></tr>\n";
373 print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
374 last_e, disp, outconvert, textout);
375 textout << "</table>\n";
376
377 if (last_e < ef) {
378 if ((last_e + 10) < ef) {
379 textout << outconvert << disp
380 << "<br><a href=\"_gwcgi_?"
381 << "c=" << encodeForURL(args["c"])
382 << "&ppnum=" << phrase
383 << "&pfe=" << first_e
384 << "&ple=" << (last_e + 10)
385 << "&pfd=" << first_d
386 << "&pld=" << last_d
387 << "&pfl=" << first_l
388 << "&pll=" << last_l
389 << "\">Get more expansions</a>\n";
390 }
391 textout << outconvert << disp
392 << "<br><a href=\"_gwcgi_?"
393 << "c=" << encodeForURL(args["c"])
394 << "&ppnum=" << phrase
395 << "&pfe=" << first_e
396 << "&ple=" << ef
397 << "&pfd=" << first_d
398 << "&pld=" << last_d
399 << "&pfl=" << first_l
400 << "&pll=" << last_l
401 << "\">Get every expansion</a>\n";
402 }
403 }
404 }
405
406 // Output the document occurances
407 if ((df > 0) && (first_d < last_d)) {
408
409 // figure out the phrases to output
410 if (last_d > docNums.size()) {
411 last_d = docNums.size();
412 }
413 count_d = last_d - first_d;
414
415 // output document list as XML
416 if (XMLmode) {
417 textout << "<documentlist length=\"" << df
418 << "\" start=\"" << first_d
419 << "\" end=\"" << last_d << "\">\n";
420
421 if(!print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
422 first_d, last_d, disp, outconvert, textout)) {
423 output_error(
424 "phindaction: Fatal Error! Couldn't load text information in print_documents() or get_document_all_data()",
425 textout, outconvert, disp, logout, XMLmode);
426 return true;
427 }
428
429 textout << "</documentlist>\n";
430 }
431
432 // output document list as HTML
433 else {
434
435 if (count_d == docNums.size()) {
436 textout << "<p><b> " << count_d << " documents</b>\n";
437 } else {
438 textout << "<p><b>" << count_d << " of " << df << " documents</b>\n";
439 }
440
441 textout << "<p><table border=1><tr><th align=left>document</th><th>freq</th></tr>\n";
442 if(!print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
443 first_d, last_d, disp, outconvert, textout)) {
444 output_error(
445 "phindaction: Fatal Error! Couldn't load text information in print_documents()",
446 textout, outconvert, disp, logout, XMLmode);
447 return true;
448 }
449 textout << "</table>\n";
450
451 if (last_d < df) {
452 if ((last_d + 10) < df) {
453 textout << outconvert << disp
454 << "<br><a href=\"_gwcgi_?"
455 << "c=" << encodeForURL(args["c"])
456 << "&ppnum=" << phrase
457 << "&pfe=" << first_e
458 << "&ple=" << last_e
459 << "&pfd=" << first_d
460 << "&pld=" << (last_d + 10)
461 << "&pfl=" << first_l
462 << "&pll=" << last_l
463 << "\">Get more documents</a>\n";
464 }
465 textout << outconvert << disp
466 << "<br><a href=\"_gwcgi_?"
467 << "c=" << encodeForURL(args["c"])
468 << "&ppnum=" << phrase
469 << "&pfe=" << first_e
470 << "&ple=" << last_e
471 << "&pfd=" << first_d
472 << "&pld=" << df
473 << "&pfl=" << first_l
474 << "&pll=" << last_l
475 << "\">Get every document</a>\n";
476 }
477 }
478 }
479
480 // Close the document
481 if (XMLmode) {
482 textout << "</phinddata>\n";
483 } else {
484 textout << "</center></body></html>\n";
485 }
486
487 textdata.UnloadData ();
488
489 return true;
490}
491
492// Find the phrase number of a word in the index file
493bool phindaction::find_phrase_number_from_word(const text_t &basepath,
494 const text_t &query,
495 DocNumArray &result) {
496
497 // Open the index file for searching
498 IndexData indexData;
499
500 text_t fullpath = filename_cat(basepath, "pword");
501 char *fullpathc = fullpath.getcstr();
502#if defined __WIN32__
503 char *base = "";
504#else
505 char *base = "/";
506#endif
507
508 if (!indexData.LoadData (base, fullpathc)) {
509 // FatalError (1, "Couldn't load index information for \"%s\"", fullpathc);
510 //exit(0);
511 /** Don't handle fatal errors here anymore.
512 * DLConsulting 12-07-2004
513 */
514 return false; // Indicates something very bad has happened
515 }
516
517 delete []fullpathc;
518
519 // set up the query object
520 QueryInfo queryInfo;
521 SetCStr (queryInfo.docLevel, "Document", 8);
522 queryInfo.maxDocs = 5;
523 queryInfo.sortByRank = true;
524 queryInfo.exactWeights = false;
525 queryInfo.needRankInfo = true;
526 queryInfo.needTermFreqs = true;
527
528 // mode 1 = casefolded, unstemmed search
529 UCArray ucquery;
530 // greenstone gives us the query encoded in unicode. We want utf8.
531 char* utf8querystring=to_utf8(query).getcstr();
532 SetCStr(ucquery, utf8querystring);
533 delete []utf8querystring;
534
535 //toUCArray(query, ucquery);
536 QueryNode *queryTree = ParseQuery(ucquery, 1, 1, 4);
537
538 // perform the query
539 ExtQueryResult queryResult;
540 MGQuery (indexData, queryInfo, queryTree, queryResult);
541 // cout << "-- word lookup result -- " << endl << queryResult << endl ;
542
543 result.clear();
544 result = queryResult.docs;
545
546 // delete the query
547 if (queryTree != NULL) delete queryTree;
548
549 indexData.UnloadData();
550
551 /** This method now returns a boolean, so...
552 * DLConsulting 12-07-2004
553 */
554 return true; // Indicates that what happened is all good, baby.
555}
556
557// Get all the data about a phrase
558//
559// The phrase is stored in textData as record phrase.
560// We retrieve:
561// word - the text of the phrase
562// tf - the total frequency of the phrase
563// ef - the expansion frequency of the phrase
564// lf - the thesaurus link frequency of the phrase
565// df - the document frequency of the phrase
566// el - the list of phrases that are expansions of phrase
567// ll - the list of phrases that are thesaurus links
568// dl - the list of documents that contain phrase
569bool phindaction::get_phrase_all_data(TextData &textdata, unsigned long phrase,
570 text_t &word, unsigned long &tf, unsigned long &ef,
571 unsigned long &lf, unsigned long &df,
572 vector <unsigned long> &el,
573 vector <unsigned long> &linkdest,
574 vector <UCArray> &linktype,
575 vector <unsigned long> &docnum,
576 vector <unsigned long> &docfrq) {
577 UCArray text;
578 UCArray docLevel;
579 SetCStr(docLevel, "Document", 8);
580
581 // Look the word up in the textData
582 if (!GetDocText (textdata, docLevel, phrase, text)) {
583 // FatalError (1, "Error while trying to get phrase %u", phrase);
584 //exit(0);
585 return false; // Something very bad has happened.
586 }
587
588 // Ignore everything up to the first colon
589 UCArray::iterator next = text.begin();
590 while (*next++ != ':');
591
592 // ignore training carriage returns
593 while (text.back() == '\n') {
594 text.pop_back();
595 }
596
597 // Get the word
598 word.clear();
599 for (; *next != ':'; ++next) {
600 word.push_back(*next);
601 }
602
603 // Get total frequency
604 tf = 0;
605 for (++next; *next != ':'; ++next) {
606 tf *= 10;
607 tf += (*next - '0');
608 }
609
610 // Get expansion frequency
611 ef = 0;
612 for (++next; *next != ':'; ++next) {
613 ef *= 10;
614 ef += (*next - '0');
615 }
616
617 // Get document frequency
618 df = 0;
619 for (++next; *next != ':'; ++next) {
620 df *= 10;
621 df += (*next - '0');
622 }
623
624 // Get expansion list
625 el.clear();
626 unsigned long e = 0;
627 for (++next; *next != ':'; ++next) {
628 if (*next == ',') {
629 el.push_back(e);
630 e = 0;
631 } else {
632 e *= 10;
633 e += (*next - '0');
634 }
635 }
636
637 // Get document list & the document frequency list
638 docnum.clear();
639 docfrq.clear();
640 bool readnum = false;
641 unsigned long d = 0;
642 for (++next; *next != ':'; ++next) {
643 if (*next == ',') {
644 docnum.push_back(d);
645 readnum = true;
646 d = 0;
647 } else if (*next == ';') {
648 if (readnum) {
649 docfrq.push_back(d);
650 } else {
651 docnum.push_back(d);
652 docfrq.push_back(1);
653 }
654 readnum = false;
655 d = 0;
656 } else {
657 d *= 10;
658 d += (*next - '0');
659 }
660 }
661
662 // Get thesaurus link frequency & link list
663 text.push_back(':');
664 text.push_back(':');
665
666 // link frequency
667 lf = 0;
668 for (++next; *next != ':'; ++next) {
669 lf *= 10;
670 lf += (*next - '0');
671 }
672
673 // two lists of link data
674 linkdest.clear();
675 linktype.clear();
676
677 UCArray thistype;
678 thistype.clear();
679 bool typedone = false;
680 unsigned long l = 0;
681 for (++next; *next != ':'; ++next) {
682
683 if (!typedone) {
684 // first read the link type, a charactor string
685 if (*next == ',') {
686 typedone = true;
687 } else {
688 thistype.push_back(*next);
689 }
690 } else {
691 // having read the link type, read the list of link destinations
692 if (*next == ',') {
693 linkdest.push_back(l);
694 linktype.push_back(thistype);
695 l = 0;
696 } else if (*next == ';') {
697 linkdest.push_back(l);
698 linktype.push_back(thistype);
699 l = 0;
700 thistype.clear();
701 typedone = false;
702 } else {
703 l *= 10;
704 l += (*next - '0');
705 }
706 }
707 }
708
709 return true; // Indicates that what happened is all good, baby.
710}
711
712bool phindaction::print_thesaurus_links(const text_t &collection, bool XMLmode,
713 TextData &textdata, vector <unsigned long> &linkdest,
714 vector <UCArray> &linktype, unsigned long first,
715 unsigned long last, displayclass &disp,
716 outconvertclass &outconvert, ostream &textout) {
717
718 // information describing each link in the list
719 unsigned long phrase, tf, ef, df;
720 UCArray type, text;
721
722 for (unsigned long l = first; l < last; ++l) {
723
724 // get the phrase data
725 phrase = linkdest[l];
726 type = linktype[l];
727
728 /** DLConsulting 12-07-2004 */
729 if(!get_phrase_freq_data(textdata, phrase, text, tf, ef, df)) {
730 return false;
731 }
732
733 if (XMLmode) {
734 textout << "<thesaurus num=\"" << l
735 << "\" id=\"" << phrase
736 << "\" tf=\"" << tf
737 << "\" df=\"" << df
738 << "\" type=\"" << type
739 << "\" text=\"" << text
740 << "\"/>\n";
741 } else {
742 textout << "<tr valign=top><td>" << type << "</td><td>";
743 textout << outconvert << disp
744 << "<a href=\"_gwcgi_?c=" << encodeForURL(collection);
745 textout << "&ppnum=" << phrase << "\">" << text << "</a>"
746 << "</td><td>" << tf << "</td><td>" << df << "</td></tr>\n";
747 }
748 }
749
750 /** DLConsulting 12-07-2004 */
751 return true;
752}
753
754// Get the frequency data about a phrase
755//
756// The phrase is stored in textData as record phrase.
757// We retrieve:
758// word - the text of the phrase
759// tf - the total frequency of the phrase
760// ef - the expansion frequency of the phrase
761// df - the document frequency of the phrase
762/**
763 * Returns:
764 * false if the method suffered a fatal error, true otherwise
765 */
766bool phindaction::get_phrase_freq_data(TextData &textdata, unsigned long phrase,
767 UCArray &word, unsigned long &tf,
768 unsigned long &ef, unsigned long &df) {
769
770 UCArray text;
771 UCArray docLevel;
772 SetCStr(docLevel, "Document", 8);
773
774 // Look the word up in the textData
775 if (!GetDocText (textdata, docLevel, phrase, text)) {
776 // FatalError (1, "Error while trying to get phrase %u", phrase);
777 //exit(0);
778 /** DLConsulting 12-07-2004 */
779 return false;
780 }
781
782 // Ignore everything up to the first colon
783 UCArray::iterator next = text.begin();
784 while (*next++ != ':');
785
786 // Get the word
787 word.clear();
788 for (; *next != ':'; ++next) {
789 word.push_back(*next);
790 }
791
792 // Get total frequency
793 tf = 0;
794 for (++next; *next != ':'; ++next) {
795 tf *= 10;
796 tf += (*next - '0');
797 }
798
799 // Get expansion frequency
800 ef = 0;
801 for (++next; *next != ':'; ++next) {
802 ef *= 10;
803 ef += (*next - '0');
804 }
805
806 // Get document frequency
807 df = 0;
808 for (++next; *next != ':'; ++next) {
809 df *= 10;
810 df += (*next - '0');
811 }
812
813 /** DLConsulting 12-07-2004 */
814 return true;
815}
816
817// Print a list of expansions
818//
819// Given the textData and a list of phrase numbers, print out each of the
820// expansions.
821void phindaction::print_expansions(const text_t &collection, bool XMLmode,
822 const text_t &body, TextData &textdata,
823 const vector <unsigned long> &elist,
824 unsigned long first, unsigned long last,
825 displayclass &disp, outconvertclass &outconvert,
826 ostream &textout) {
827
828 UCArray word;
829 unsigned long phrase, tf, df, ef;
830
831 UCArray suffix, prefix, ucbody;
832
833 toUCArray(body, ucbody);
834
835 for (unsigned long e = first; e < last; ++e) {
836
837 phrase = elist[e];
838 get_phrase_freq_data(textdata, phrase, word, tf, ef, df);
839
840 split_phrase(word, ucbody, prefix, suffix);
841
842 if (XMLmode) {
843 // body is always the same as the text of the phrase, so no need to send it
844 textout << "<expansion num=\"" << e
845 << "\" id=\"" << phrase
846 << "\" tf=\"" << tf
847 << "\" df=\"" << df;
848 if (!prefix.empty()) {
849 text_t prefix_txt;
850 fromUCArray(prefix, prefix_txt);
851 textout << "\" prefix=\"" << encodeForHTMLAttr(prefix_txt);
852 }
853 if (!suffix.empty()) {
854 text_t suffix_txt;
855 fromUCArray(suffix, suffix_txt);
856 textout << "\" suffix=\"" << encodeForHTMLAttr(suffix_txt);
857 }
858 textout << "\"/>\n";
859 } else {
860 textout << outconvert << disp
861 << "<tr valign=top><td align=right><a href=\"_gwcgi_?"
862 << "c=" << encodeForURL(collection) << "&ppnum=" << phrase << "\">";
863 textout << prefix << "</a></td>";
864 textout <<outconvert << disp
865 << "<td align=center><a href=\"_gwcgi_?"
866 << "c=" << encodeForURL(collection) << "&ppnum=" << phrase << "\">"
867 << encodeForHTML(body) << "</a></td>"
868 << "<td align=left><a href=\"_gwcgi_?"
869 << "c=" << encodeForURL(collection) << "&ppnum=" << phrase << "\">";
870 textout << suffix << "</a></td>"
871 << "<td>" << tf << "</td><td>" << df << "</td></tr>\n";
872 }
873 }
874}
875
876// split an expansion into prefix and suffix
877void phindaction::split_phrase(const UCArray &word, const UCArray &body,
878 UCArray &prefix, UCArray &suffix) {
879
880 prefix.clear();
881 suffix.clear();
882
883 bool readingPrefix = true;
884 UCArray::const_iterator here = word.begin();
885 UCArray::const_iterator end = word.end();
886
887 while (here != end) {
888
889 // if we've not read all the prefix, add the next char to the prefix
890 if (readingPrefix) {
891 if (phrase_match(body, here, end)) {
892 readingPrefix = false;
893 // trim whitespace from end of prefix & start of suffix
894 if (!prefix.empty()) {
895 prefix.pop_back();
896 }
897 if ((here != end) && (*here == ' ')) {
898 ++here;
899 }
900 } else {
901 prefix.push_back(*here);
902 ++here;
903 }
904 }
905 // if we've finished with the prefix, update the suffix
906 else {
907 suffix.push_back(*here);
908 ++here;
909 }
910 }
911}
912
913// phrase_match
914//
915// compare two strings, one represented as an UCArray, the other as two
916// UCArray iterators.
917//
918// Return true if the UCArray is the same as the phrase the iterators point
919// to for the length of the UCArray.
920bool phindaction::phrase_match(const UCArray &text, UCArray::const_iterator &here,
921 UCArray::const_iterator end) {
922
923 UCArray::const_iterator one_here = text.begin();
924 UCArray::const_iterator one_end = text.end();
925 UCArray::const_iterator two_here = here;
926
927 // iterate over the length of the first string, comparing each element to
928 // the corresponding element in the second string.
929 while (one_here != one_end) {
930
931 if (two_here == end) {
932 return false;
933 } else if (*one_here != *two_here) {
934 return false;
935 }
936 ++one_here;
937 ++two_here;
938 }
939
940 here = two_here;
941 return true;
942}
943
944bool phindaction::print_documents(bool XMLmode, const text_t &basepath,
945 const text_t &collection,
946 const vector <unsigned long> &docNums,
947 const vector <unsigned long> &docFreq,
948 unsigned long first, unsigned long last,
949 displayclass &disp, outconvertclass &outconvert,
950 ostream &textout) {
951
952 // Create a TextData object to read the document data
953 TextData docdata;
954
955 text_t fullpath = filename_cat(basepath, "docs");
956 char *fullpathc = fullpath.getcstr();
957#if defined __WIN32__
958 char *base = "";
959#else
960 char *base = "/";
961#endif
962
963 if (!docdata.LoadData (base, fullpathc)) {
964 // FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
965 //exit(0);
966 /** DLConsulting 12-07-2004 */
967 return false;
968 }
969
970 delete []fullpathc;
971
972 UCArray title, hash;
973 unsigned long freq, doc;
974
975 for (unsigned long d = first; d < last; ++d) {
976 doc = docNums[d];
977 freq = docFreq[d];
978
979 /** DLConsulting 13-07-2004 */
980 if(!get_document_all_data(docdata, doc, title, hash)) {
981 return false;
982 }
983
984 if (XMLmode) {
985 textout << "<document num=\"" << d
986 << "\" hash=\"" << hash
987 << "\" freq=\"" << freq
988 << "\" title=\"" << title << "\"/>\n";
989 } else {
990 textout << outconvert << disp
991 << "<tr valign=top><td><a href=\"_gwcgi_?"
992 << "c=" << encodeForURL(collection);
993 textout << "&a=d&d=" << hash << "\">" << title << "</a>"
994 << "</td><td>" << freq << "</td></tr>\n";
995 }
996 }
997
998 docdata.UnloadData();
999
1000 /** DLConsulting 12-07-2004 */
1001 return true;
1002}
1003
1004// Get all the data about a docment
1005//
1006// The document's details are stored in docData as record docNum.
1007// We retrieve:
1008// title - the document's title
1009// hash - the document's unique OID
1010/** Returns:
1011 * false if a fatal error occured, true otherwise
1012 * DLConsulting 12-07-2004
1013 */
1014bool phindaction::get_document_all_data(TextData &docdata, unsigned long docNum,
1015 UCArray &title, UCArray &hash) {
1016
1017 UCArray text;
1018 UCArray docLevel;
1019 SetCStr(docLevel, "Document", 8);
1020
1021 // Look the word up in the textData
1022 if (!GetDocText (docdata, docLevel, docNum, text)) {
1023 // FatalError (1, "Error while trying to get document %u", docNum);
1024 //exit(0);
1025 /** DLConsulting 13-07-2004 */
1026 return false;
1027 }
1028
1029 // Ignore everything up to the first colon
1030 UCArray::iterator next = text.begin();
1031 while (*next++ != '\t');
1032
1033 // Get the document OID (hash)
1034 hash.clear();
1035 for (; *next != '\t'; ++next) {
1036 hash.push_back(*next);
1037 }
1038
1039 // Get the title
1040 text.push_back('\n');
1041 title.clear();
1042 for (++next; *next != '\n'; ++next) {
1043 title.push_back(*next);
1044 }
1045
1046 /** DLConsulting 13-07-2004 */
1047 return true;
1048}
1049
1050void phindaction::toUCArray(const text_t &in, UCArray &out) {
1051 out.clear();
1052 if (out.capacity() < in.size() + 1) {
1053 out.reserve(in.size() + 1);
1054 }
1055 text_t::const_iterator here = in.begin();
1056 text_t::const_iterator end = in.end();
1057 while (here != end) {
1058 out.push_back((unsigned char) *here);
1059 ++here;
1060 }
1061}
1062
1063void phindaction::fromUCArray(const UCArray &arrin, text_t &txtout) {
1064 txtout.clear();
1065 if (txtout.capacity() < arrin.size() + 1) {
1066 txtout.reserve(arrin.size() + 1);
1067 }
1068 vector<unsigned char>::const_iterator here = arrin.begin();
1069 vector<unsigned char>::const_iterator end = arrin.end();
1070 while (here != end) {
1071 txtout.push_back(*here); // don't need to cast unsigned char to unsigned short
1072 ++here;
1073 }
1074}
1075
1076
1077void phindaction::output_error (const text_t &message, ostream &textout,
1078 outconvertclass &outconvert,
1079 displayclass & disp, ostream &logout,
1080 bool XMLmode) {
1081
1082 logout << outconvert << message << "\n";
1083 if (XMLmode) {
1084 textout << outconvert
1085 << "<phinddata>\n"
1086 << "<phinderror>" << message << "</phinderror>\n"
1087 << "</phinddata>\n";
1088 } else {
1089 textout << outconvert << disp
1090 << "_header_\n"
1091 << message
1092 << "_footer_\n";
1093 }
1094}
1095
1096#endif //GSDL_USE_PHIND_ACTION
1097
Note: See TracBrowser for help on using the repository browser.