source: main/trunk/greenstone2/runtime-src/src/recpt/phindaction.cpp@ 27065

Last change on this file since 27065 was 22984, checked in by ak19, 14 years ago
  1. Undoing commit of 22934 where decode_commas was called on stem and fold comma separated list: previously separated due to url-encoding of commas. Now that the problem has been fixed at the source, the decode_commas hack is no longer necessary. 2. Commas in stem and fold are no longer url-encoded because the multiple_value field of the continuously-reused struct arg_ainfo is always set back to the default false after ever being set to true. So it no longer subtly stays at true to affect Greenstone functioning in unforeseen ways (such as suddenly and unnecessarily URL-encoding commas where this is not wanted).
  • Property svn:keywords set to Author Date Id Revision
File size: 29.9 KB
Line 
1/**********************************************************************
2 *
3 * phindaction.cpp --
4 *
5 * Copyright 2001 Gordon W. Paynter
6 * Copyright 2001 The New Zealand Digital Library Project
7 *
8 * A component of the Greenstone digital library software
9 * from the New Zealand Digital Library Project at the
10 * University of Waikato, New Zealand.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *
26 *********************************************************************/
27
28#include "gsdl_modules_cfg.h"
29#ifdef GSDL_USE_PHIND_ACTION
30
31// Note that this action uses mgpp to retrieve phind info, calling MGQuery
32// etc. directly, not through the protocol. This breaks our receptionist -
33// collection server separation and should be fixed some day I guess.
34
35#include "phindaction.h"
36#include "fileutil.h"
37#include "gsdlunicode.h"
38
39phindaction::phindaction () {
40
41 cgiarginfo arg_ainfo;
42
43 arg_ainfo.shortname = "pc";
44 arg_ainfo.longname = "phind classifier";
45 arg_ainfo.multiplechar = true;
46 arg_ainfo.multiplevalue = false;
47 arg_ainfo.defaultstatus = cgiarginfo::weak;
48 arg_ainfo.argdefault = g_EmptyText;
49 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
50 argsinfo.addarginfo (NULL, arg_ainfo);
51
52 arg_ainfo.shortname = "pxml";
53 arg_ainfo.longname = "phind XML mode";
54 arg_ainfo.multiplechar = false;
55 arg_ainfo.multiplevalue = false;
56 arg_ainfo.defaultstatus = cgiarginfo::weak;
57 arg_ainfo.argdefault = "0";
58 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
59 argsinfo.addarginfo (NULL, arg_ainfo);
60
61 arg_ainfo.shortname = "ppnum";
62 arg_ainfo.longname = "phind phrase number";
63 arg_ainfo.multiplechar = true;
64 arg_ainfo.multiplevalue = false;
65 arg_ainfo.defaultstatus = cgiarginfo::weak;
66 arg_ainfo.argdefault = "0";
67 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
68 argsinfo.addarginfo (NULL, arg_ainfo);
69
70 arg_ainfo.shortname = "pptext";
71 arg_ainfo.longname = "phind phrase text";
72 arg_ainfo.multiplechar = true;
73 arg_ainfo.multiplevalue = false;
74 arg_ainfo.defaultstatus = cgiarginfo::weak;
75 arg_ainfo.argdefault = g_EmptyText;
76 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
77 argsinfo.addarginfo (NULL, arg_ainfo);
78
79 arg_ainfo.shortname = "pfe";
80 arg_ainfo.longname = "phind first_e";
81 arg_ainfo.multiplechar = true;
82 arg_ainfo.multiplevalue = false;
83 arg_ainfo.defaultstatus = cgiarginfo::weak;
84 arg_ainfo.argdefault = "0";
85 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
86 argsinfo.addarginfo (NULL, arg_ainfo);
87
88 arg_ainfo.shortname = "ple";
89 arg_ainfo.longname = "phind last_e";
90 arg_ainfo.multiplechar = true;
91 arg_ainfo.multiplevalue = false;
92 arg_ainfo.defaultstatus = cgiarginfo::weak;
93 arg_ainfo.argdefault = "10";
94 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
95 argsinfo.addarginfo (NULL, arg_ainfo);
96
97 arg_ainfo.shortname = "pfl";
98 arg_ainfo.longname = "phind first_l";
99 arg_ainfo.multiplechar = true;
100 arg_ainfo.multiplevalue = false;
101 arg_ainfo.defaultstatus = cgiarginfo::weak;
102 arg_ainfo.argdefault = "0";
103 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
104 argsinfo.addarginfo (NULL, arg_ainfo);
105
106 arg_ainfo.shortname = "pll";
107 arg_ainfo.longname = "phind last_l";
108 arg_ainfo.multiplechar = true;
109 arg_ainfo.multiplevalue = false;
110 arg_ainfo.defaultstatus = cgiarginfo::weak;
111 arg_ainfo.argdefault = "10";
112 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
113 argsinfo.addarginfo (NULL, arg_ainfo);
114
115 arg_ainfo.shortname = "pfd";
116 arg_ainfo.longname = "phind first_d";
117 arg_ainfo.multiplechar = true;
118 arg_ainfo.multiplevalue = false;
119 arg_ainfo.defaultstatus = cgiarginfo::weak;
120 arg_ainfo.argdefault = "0";
121 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
122 argsinfo.addarginfo (NULL, arg_ainfo);
123
124 arg_ainfo.shortname = "pld";
125 arg_ainfo.longname = "phind last_d";
126 arg_ainfo.multiplechar = true;
127 arg_ainfo.multiplevalue = false;
128 arg_ainfo.defaultstatus = cgiarginfo::weak;
129 arg_ainfo.argdefault = "10";
130 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
131 argsinfo.addarginfo (NULL, arg_ainfo);
132}
133
134phindaction::~phindaction () {
135}
136
137void phindaction::get_cgihead_info (cgiargsclass &args, recptprotolistclass * /*protos*/,
138 response_t &response,text_t &response_data,
139 ostream &/*logout*/) {
140 response = content;
141 if (args["pxml"] == "1") {
142 response_data = "text/xml";
143 } else {
144 response_data = "text/html";
145 }
146}
147
148bool phindaction::do_action (cgiargsclass &args, recptprotolistclass *protos,
149 browsermapclass * /*browsers*/, displayclass &disp,
150 outconvertclass &outconvert, ostream &textout,
151 ostream &logout) {
152
153 unsigned long count_l, count_e, count_d;
154 unsigned long phrase = args["ppnum"].getulong();
155 text_t &word = args["pptext"];
156 unsigned long first_e = args["pfe"].getulong();
157 unsigned long last_e = args["ple"].getulong();
158 unsigned long first_l = args["pfl"].getulong();
159 unsigned long last_l = args["pll"].getulong();
160 unsigned long first_d = args["pfd"].getulong();
161 unsigned long last_d = args["pld"].getulong();
162 bool XMLmode = false;
163 if (args["pxml"] == "1") XMLmode = true;
164
165 // must have a valid collection server
166 recptproto *collectproto = protos->getrecptproto (args["c"], logout);
167 if (collectproto == NULL) {
168 output_error("phindaction: ERROR: collection not set", textout,
169 outconvert, disp, logout, XMLmode);
170 return true;
171 }
172
173 // the frequency and occurances of the phrase
174 unsigned long tf;
175 vector <unsigned long> el, linkdest, docNums, docfreq;
176 vector <UCArray> linktype;
177
178 // the number of occurances to display
179 unsigned long ef, lf, df;
180
181 text_t basepath = filename_cat(collecthome, args["c"],
182 "index", "phind" + args["pc"]);
183
184 // If we don't know the phrase number, look it up
185 if (phrase == 0) {
186
187 if (word.empty()) {
188 output_error("phindaction: ERROR: no phrase number or word", textout,
189 outconvert, disp, logout, XMLmode);
190 return true;
191 }
192
193 DocNumArray result;
194 /** In order to prevent browser crashing problems, any method which
195 * previously suffered a silent fatal error, now instead returns false
196 * to indicate a fatal error has occured. We can then dispatch an
197 * appropriate error tag to the Phind applet (rather than leave it
198 * whiling away the milliseconds until the end of existence - or at
199 * least your browser - in an infinite loop!)
200 * DLConsulting 12-07-2004
201 */
202
203 if(!find_phrase_number_from_word(basepath, word, result)) {
204 output_error("phindaction: Fatal Error! Couldn't load index information in find_phrase_number_from_word()",
205 textout, outconvert, disp, logout, XMLmode);
206 return true;
207 }
208
209 if (result.empty()) {
210 output_error("phindaction: The search term ("+word+") does not occur in the collection",
211 textout, outconvert, disp, logout, XMLmode);
212 return true;
213 } else {
214 phrase = result[0];
215 }
216 }
217
218 // Create a TextData object to read the phrase data (pdata)
219 TextData textdata;
220
221 text_t fullpath = filename_cat(basepath, "pdata");
222 char *fullpathc = fullpath.getcstr();
223#if defined __WIN32__
224 char *base = "";
225#else
226 char *base = "/";
227#endif
228
229 if (!textdata.LoadData (base, fullpathc)) {
230 // FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
231 //exit(0);
232 /** We must return something to the client, whether this error is fatal or
233 * no, otherwise we risk sending their browser into an infinite loop!
234 * DLConsulting 12-07-2004
235 */
236 output_error("phindaction: Fatal Error! Couldn't load text information for collection",
237 textout, outconvert, disp, logout, XMLmode);
238 return true;
239 }
240
241 delete []fullpathc;
242
243 /** Another previously silent method can now cry out.
244 * DLConsulting 12-07-2004
245 */
246 if(!get_phrase_all_data(textdata, phrase, word, tf, ef, lf, df, el,
247 linkdest, linktype, docNums, docfreq)) {
248 output_error(
249 "phindaction: Fatal Error! Couldn't parse phrase in get_phrase_all_data()",
250 textout, outconvert, disp, logout, XMLmode);
251 return true;
252 }
253
254 // Output the header
255 if (XMLmode) {
256 textout << "<phinddata id=\"" << phrase
257 << "\" text=\"" << word
258 << "\" tf=\"" << tf
259 << "\" ef=\"" << ef
260 << "\" df=\"" << df
261 << "\" lf=\"" << lf
262 << "\">\n";
263 } else {
264 textout << "<html><head><title>" << word << "</title></head>\n"
265 << "<body><center>\n"
266 << "<p><h1>" << word << "</h1>\n"
267 << "<p><b>"<< word << "</b> occurs "
268 << tf << " times in " << df << " documents\n";
269 }
270
271 // Output the thesaurus links
272 if ((lf > 0) && (first_l < last_l)) {
273
274 // figure out the number of phrases to output
275 if (last_l > lf) {
276 last_l = lf;
277 }
278 count_l = last_l - first_l;
279
280 if (XMLmode) {
281 textout << "<thesauruslist length=\"" << lf
282 << "\" start=\"" << first_l
283 << "\" end=\"" << last_l << "\">\n";
284 /** DLConsulting 12-07-2004 */
285 if(!print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
286 first_l, last_l, disp, outconvert, textout)) {
287 output_error(
288 "phindaction: Fatal Error! Couldn't get phrase in get_phrase_freq_data()",
289 textout, outconvert, disp, logout, XMLmode);
290 return true;
291 }
292 textout << "</thesauruslist>\n";
293 }
294
295 // output links as HTML
296 else {
297 if (count_l == lf) {
298 textout << "<p><b> " << count_l << " thesaurus links</b>\n";
299 } else {
300 textout << "<p><b>" << count_l << " of " << lf << " thesaurus links</b>\n";
301 }
302
303 textout << "<p><table border=1><tr><th>type</th><th>topic</th><th>freq</th><th>docs</th></tr>\n";
304 /** DLConsulting 12-07-2004 */
305 if(!print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
306 first_l, last_l, disp, outconvert, textout)) {
307 output_error(
308 "phindaction: Fatal Error! Couldn't get phrase in get_phrase_freq_data()",
309 textout, outconvert, disp, logout, XMLmode);
310 return true;
311 }
312 textout << "</table>\n";
313
314 if (last_l < lf) {
315 if ((last_l + 10) < lf) {
316 textout << outconvert << disp
317 << "<br><a href=\"_gwcgi_?"
318 << "c=" << args["c"]
319 << "&ppnum=" << phrase
320 << "&pfe=" << first_e
321 << "&ple=" << last_e
322 << "&pfd=" << first_d
323 << "&pld=" << last_d
324 << "&pfl=" << first_l
325 << "&pll=" << (last_l + 10)
326 << "\">Get more thesaurus links</a>\n";
327 }
328 textout << outconvert << disp
329 << "<br><a href=\"_gwcgi_?"
330 << "c=" << args["c"]
331 << "&ppnum=" << phrase
332 << "&pfe=" << first_e
333 << "&ple=" << last_e
334 << "&pfd=" << first_d
335 << "&pld=" << last_d
336 << "&pfl=" << first_l
337 << "&pll=" << lf
338 << "\">Get every thesaurus link</a>\n" ;
339 }
340 }
341 }
342
343 // Output the expansions
344 if ((ef > 0) && (first_e < last_e)) {
345
346 // figure out the number of phrases to output
347 if (last_e > el.size()) {
348 last_e = el.size();
349 }
350 count_e = last_e - first_e;
351
352 // output expansions as XML
353 if (XMLmode) {
354 textout << "<expansionlist length=\"" << ef
355 << "\" start=\"" << first_e
356 << "\" end=\"" << last_e << "\">" << endl;
357
358 print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
359 last_e, disp, outconvert, textout);
360
361 textout << "</expansionlist>\n";
362 }
363
364 // output expansions as HTML
365 else {
366 if (count_e == el.size()) {
367 textout << "<p><b> " << count_e << " expansions</b>\n";
368 } else {
369 textout << "<p><b>" << count_e << " of " << ef << " expansions</b>\n";
370 }
371
372 textout << "<p><table border=1><tr><th colspan=3>phrase</th><th>freq</th><th>docs</th></tr>\n";
373 print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
374 last_e, disp, outconvert, textout);
375 textout << "</table>\n";
376
377 if (last_e < ef) {
378 if ((last_e + 10) < ef) {
379 textout << outconvert << disp
380 << "<br><a href=\"_gwcgi_?"
381 << "c=" << args["c"]
382 << "&ppnum=" << phrase
383 << "&pfe=" << first_e
384 << "&ple=" << (last_e + 10)
385 << "&pfd=" << first_d
386 << "&pld=" << last_d
387 << "&pfl=" << first_l
388 << "&pll=" << last_l
389 << "\">Get more expansions</a>\n";
390 }
391 textout << outconvert << disp
392 << "<br><a href=\"_gwcgi_?"
393 << "c=" << args["c"]
394 << "&ppnum=" << phrase
395 << "&pfe=" << first_e
396 << "&ple=" << ef
397 << "&pfd=" << first_d
398 << "&pld=" << last_d
399 << "&pfl=" << first_l
400 << "&pll=" << last_l
401 << "\">Get every expansion</a>\n";
402 }
403 }
404 }
405
406 // Output the document occurances
407 if ((df > 0) && (first_d < last_d)) {
408
409 // figure out the phrases to output
410 if (last_d > docNums.size()) {
411 last_d = docNums.size();
412 }
413 count_d = last_d - first_d;
414
415 // output document list as XML
416 if (XMLmode) {
417 textout << "<documentlist length=\"" << df
418 << "\" start=\"" << first_d
419 << "\" end=\"" << last_d << "\">\n";
420
421 if(!print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
422 first_d, last_d, disp, outconvert, textout)) {
423 output_error(
424 "phindaction: Fatal Error! Couldn't load text information in print_documents() or get_document_all_data()",
425 textout, outconvert, disp, logout, XMLmode);
426 return true;
427 }
428
429 textout << "</documentlist>\n";
430 }
431
432 // output document list as HTML
433 else {
434
435 if (count_d == docNums.size()) {
436 textout << "<p><b> " << count_d << " documents</b>\n";
437 } else {
438 textout << "<p><b>" << count_d << " of " << df << " documents</b>\n";
439 }
440
441 textout << "<p><table border=1><tr><th align=left>document</th><th>freq</th></tr>\n";
442 if(!print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
443 first_d, last_d, disp, outconvert, textout)) {
444 output_error(
445 "phindaction: Fatal Error! Couldn't load text information in print_documents()",
446 textout, outconvert, disp, logout, XMLmode);
447 return true;
448 }
449 textout << "</table>\n";
450
451 if (last_d < df) {
452 if ((last_d + 10) < df) {
453 textout << outconvert << disp
454 << "<br><a href=\"_gwcgi_?"
455 << "c=" << args["c"]
456 << "&ppnum=" << phrase
457 << "&pfe=" << first_e
458 << "&ple=" << last_e
459 << "&pfd=" << first_d
460 << "&pld=" << (last_d + 10)
461 << "&pfl=" << first_l
462 << "&pll=" << last_l
463 << "\">Get more documents</a>\n";
464 }
465 textout << outconvert << disp
466 << "<br><a href=\"_gwcgi_?"
467 << "c=" << args["c"]
468 << "&ppnum=" << phrase
469 << "&pfe=" << first_e
470 << "&ple=" << last_e
471 << "&pfd=" << first_d
472 << "&pld=" << df
473 << "&pfl=" << first_l
474 << "&pll=" << last_l
475 << "\">Get every document</a>\n";
476 }
477 }
478 }
479
480 // Close the document
481 if (XMLmode) {
482 textout << "</phinddata>\n";
483 } else {
484 textout << "</center></body></html>\n";
485 }
486
487 textdata.UnloadData ();
488
489 return true;
490}
491
492// Find the phrase number of a word in the index file
493bool phindaction::find_phrase_number_from_word(const text_t &basepath,
494 const text_t &query,
495 DocNumArray &result) {
496
497 // Open the index file for searching
498 IndexData indexData;
499
500 text_t fullpath = filename_cat(basepath, "pword");
501 char *fullpathc = fullpath.getcstr();
502#if defined __WIN32__
503 char *base = "";
504#else
505 char *base = "/";
506#endif
507
508 if (!indexData.LoadData (base, fullpathc)) {
509 // FatalError (1, "Couldn't load index information for \"%s\"", fullpathc);
510 //exit(0);
511 /** Don't handle fatal errors here anymore.
512 * DLConsulting 12-07-2004
513 */
514 return false; // Indicates something very bad has happened
515 }
516
517 delete []fullpathc;
518
519 // set up the query object
520 QueryInfo queryInfo;
521 SetCStr (queryInfo.docLevel, "Document", 8);
522 queryInfo.maxDocs = 5;
523 queryInfo.sortByRank = true;
524 queryInfo.exactWeights = false;
525 queryInfo.needRankInfo = true;
526 queryInfo.needTermFreqs = true;
527
528 // mode 1 = casefolded, unstemmed search
529 UCArray ucquery;
530 // greenstone gives us the query encoded in unicode. We want utf8.
531 char* utf8querystring=to_utf8(query).getcstr();
532 SetCStr(ucquery, utf8querystring);
533 delete []utf8querystring;
534
535 //toUCArray(query, ucquery);
536 QueryNode *queryTree = ParseQuery(ucquery, 1, 1, 4);
537
538 // perform the query
539 ExtQueryResult queryResult;
540 MGQuery (indexData, queryInfo, queryTree, queryResult);
541 // cout << "-- word lookup result -- " << endl << queryResult << endl ;
542
543 result.clear();
544 result = queryResult.docs;
545
546 // delete the query
547 if (queryTree != NULL) delete queryTree;
548
549 indexData.UnloadData();
550
551 /** This method now returns a boolean, so...
552 * DLConsulting 12-07-2004
553 */
554 return true; // Indicates that what happened is all good, baby.
555}
556
557// Get all the data about a phrase
558//
559// The phrase is stored in textData as record phrase.
560// We retrieve:
561// word - the text of the phrase
562// tf - the total frequency of the phrase
563// ef - the expansion frequency of the phrase
564// lf - the thesaurus link frequency of the phrase
565// df - the document frequency of the phrase
566// el - the list of phrases that are expansions of phrase
567// ll - the list of phrases that are thesaurus links
568// dl - the list of documents that contain phrase
569bool phindaction::get_phrase_all_data(TextData &textdata, unsigned long phrase,
570 text_t &word, unsigned long &tf, unsigned long &ef,
571 unsigned long &lf, unsigned long &df,
572 vector <unsigned long> &el,
573 vector <unsigned long> &linkdest,
574 vector <UCArray> &linktype,
575 vector <unsigned long> &docnum,
576 vector <unsigned long> &docfrq) {
577 UCArray text;
578 UCArray docLevel;
579 SetCStr(docLevel, "Document", 8);
580
581 // Look the word up in the textData
582 if (!GetDocText (textdata, docLevel, phrase, text)) {
583 // FatalError (1, "Error while trying to get phrase %u", phrase);
584 //exit(0);
585 return false; // Something very bad has happened.
586 }
587
588 // Ignore everything up to the first colon
589 UCArray::iterator next = text.begin();
590 while (*next++ != ':');
591
592 // ignore training carriage returns
593 while (text.back() == '\n') {
594 text.pop_back();
595 }
596
597 // Get the word
598 word.clear();
599 for (; *next != ':'; ++next) {
600 word.push_back(*next);
601 }
602
603 // Get total frequency
604 tf = 0;
605 for (++next; *next != ':'; ++next) {
606 tf *= 10;
607 tf += (*next - '0');
608 }
609
610 // Get expansion frequency
611 ef = 0;
612 for (++next; *next != ':'; ++next) {
613 ef *= 10;
614 ef += (*next - '0');
615 }
616
617 // Get document frequency
618 df = 0;
619 for (++next; *next != ':'; ++next) {
620 df *= 10;
621 df += (*next - '0');
622 }
623
624 // Get expansion list
625 el.clear();
626 unsigned long e = 0;
627 for (++next; *next != ':'; ++next) {
628 if (*next == ',') {
629 el.push_back(e);
630 e = 0;
631 } else {
632 e *= 10;
633 e += (*next - '0');
634 }
635 }
636
637 // Get document list & the document frequency list
638 docnum.clear();
639 docfrq.clear();
640 bool readnum = false;
641 unsigned long d = 0;
642 for (++next; *next != ':'; ++next) {
643 if (*next == ',') {
644 docnum.push_back(d);
645 readnum = true;
646 d = 0;
647 } else if (*next == ';') {
648 if (readnum) {
649 docfrq.push_back(d);
650 } else {
651 docnum.push_back(d);
652 docfrq.push_back(1);
653 }
654 readnum = false;
655 d = 0;
656 } else {
657 d *= 10;
658 d += (*next - '0');
659 }
660 }
661
662 // Get thesaurus link frequency & link list
663 text.push_back(':');
664 text.push_back(':');
665
666 // link frequency
667 lf = 0;
668 for (++next; *next != ':'; ++next) {
669 lf *= 10;
670 lf += (*next - '0');
671 }
672
673 // two lists of link data
674 linkdest.clear();
675 linktype.clear();
676
677 UCArray thistype;
678 thistype.clear();
679 bool typedone = false;
680 unsigned long l = 0;
681 for (++next; *next != ':'; ++next) {
682
683 if (!typedone) {
684 // first read the link type, a charactor string
685 if (*next == ',') {
686 typedone = true;
687 } else {
688 thistype.push_back(*next);
689 }
690 } else {
691 // having read the link type, read the list of link destinations
692 if (*next == ',') {
693 linkdest.push_back(l);
694 linktype.push_back(thistype);
695 l = 0;
696 } else if (*next == ';') {
697 linkdest.push_back(l);
698 linktype.push_back(thistype);
699 l = 0;
700 thistype.clear();
701 typedone = false;
702 } else {
703 l *= 10;
704 l += (*next - '0');
705 }
706 }
707 }
708
709 return true; // Indicates that what happened is all good, baby.
710}
711
712bool phindaction::print_thesaurus_links(const text_t &collection, bool XMLmode,
713 TextData &textdata, vector <unsigned long> &linkdest,
714 vector <UCArray> &linktype, unsigned long first,
715 unsigned long last, displayclass &disp,
716 outconvertclass &outconvert, ostream &textout) {
717
718 // information describing each link in the list
719 unsigned long phrase, tf, ef, df;
720 UCArray type, text;
721
722 for (unsigned long l = first; l < last; ++l) {
723
724 // get the phrase data
725 phrase = linkdest[l];
726 type = linktype[l];
727
728 /** DLConsulting 12-07-2004 */
729 if(!get_phrase_freq_data(textdata, phrase, text, tf, ef, df)) {
730 return false;
731 }
732
733 if (XMLmode) {
734 textout << "<thesaurus num=\"" << l
735 << "\" id=\"" << phrase
736 << "\" tf=\"" << tf
737 << "\" df=\"" << df
738 << "\" type=\"" << type
739 << "\" text=\"" << text
740 << "\"/>\n";
741 } else {
742 textout << "<tr valign=top><td>" << type << "</td><td>";
743 textout << outconvert << disp
744 << "<a href=\"_gwcgi_?c=" << collection;
745 textout << "&ppnum=" << phrase << "\">" << text << "</a>"
746 << "</td><td>" << tf << "</td><td>" << df << "</td></tr>\n";
747 }
748 }
749
750 /** DLConsulting 12-07-2004 */
751 return true;
752}
753
754// Get the frequency data about a phrase
755//
756// The phrase is stored in textData as record phrase.
757// We retrieve:
758// word - the text of the phrase
759// tf - the total frequency of the phrase
760// ef - the expansion frequency of the phrase
761// df - the document frequency of the phrase
762/**
763 * Returns:
764 * false if the method suffered a fatal error, true otherwise
765 */
766bool phindaction::get_phrase_freq_data(TextData &textdata, unsigned long phrase,
767 UCArray &word, unsigned long &tf,
768 unsigned long &ef, unsigned long &df) {
769
770 UCArray text;
771 UCArray docLevel;
772 SetCStr(docLevel, "Document", 8);
773
774 // Look the word up in the textData
775 if (!GetDocText (textdata, docLevel, phrase, text)) {
776 // FatalError (1, "Error while trying to get phrase %u", phrase);
777 //exit(0);
778 /** DLConsulting 12-07-2004 */
779 return false;
780 }
781
782 // Ignore everything up to the first colon
783 UCArray::iterator next = text.begin();
784 while (*next++ != ':');
785
786 // Get the word
787 word.clear();
788 for (; *next != ':'; ++next) {
789 word.push_back(*next);
790 }
791
792 // Get total frequency
793 tf = 0;
794 for (++next; *next != ':'; ++next) {
795 tf *= 10;
796 tf += (*next - '0');
797 }
798
799 // Get expansion frequency
800 ef = 0;
801 for (++next; *next != ':'; ++next) {
802 ef *= 10;
803 ef += (*next - '0');
804 }
805
806 // Get document frequency
807 df = 0;
808 for (++next; *next != ':'; ++next) {
809 df *= 10;
810 df += (*next - '0');
811 }
812
813 /** DLConsulting 12-07-2004 */
814 return true;
815}
816
817// Print a list of expansions
818//
819// Given the textData and a list of phrase numbers, print out each of the
820// expansions.
821void phindaction::print_expansions(const text_t &collection, bool XMLmode,
822 const text_t &body, TextData &textdata,
823 const vector <unsigned long> &elist,
824 unsigned long first, unsigned long last,
825 displayclass &disp, outconvertclass &outconvert,
826 ostream &textout) {
827
828 UCArray word;
829 unsigned long phrase, tf, df, ef;
830
831 UCArray suffix, prefix, ucbody;
832
833 toUCArray(body, ucbody);
834
835 for (unsigned long e = first; e < last; ++e) {
836
837 phrase = elist[e];
838 get_phrase_freq_data(textdata, phrase, word, tf, ef, df);
839
840 split_phrase(word, ucbody, prefix, suffix);
841
842 if (XMLmode) {
843 // body is always the same as the text of the phrase, so no need to send it
844 textout << "<expansion num=\"" << e
845 << "\" id=\"" << phrase
846 << "\" tf=\"" << tf
847 << "\" df=\"" << df;
848 if (!prefix.empty()) {
849 textout << "\" prefix=\"" << prefix;
850 }
851 if (!suffix.empty()) {
852 textout << "\" suffix=\"" << suffix;
853 }
854 textout << "\"/>\n";
855 } else {
856 textout << outconvert << disp
857 << "<tr valign=top><td align=right><a href=\"_gwcgi_?"
858 << "c=" << collection << "&ppnum=" << phrase << "\">";
859 textout << prefix << "</a></td>";
860 textout <<outconvert << disp
861 << "<td align=center><a href=\"_gwcgi_?"
862 << "c=" << collection << "&ppnum=" << phrase << "\">"
863 << body << "</a></td>"
864 << "<td align=left><a href=\"_gwcgi_?"
865 << "c=" << collection << "&ppnum=" << phrase << "\">";
866 textout << suffix << "</a></td>"
867 << "<td>" << tf << "</td><td>" << df << "</td></tr>\n";
868 }
869 }
870}
871
872// split an expansion into prefix and suffix
873void phindaction::split_phrase(const UCArray &word, const UCArray &body,
874 UCArray &prefix, UCArray &suffix) {
875
876 prefix.clear();
877 suffix.clear();
878
879 bool readingPrefix = true;
880 UCArray::const_iterator here = word.begin();
881 UCArray::const_iterator end = word.end();
882
883 while (here != end) {
884
885 // if we've not read all the prefix, add the next char to the prefix
886 if (readingPrefix) {
887 if (phrase_match(body, here, end)) {
888 readingPrefix = false;
889 // trim whitespace from end of prefix & start of suffix
890 if (!prefix.empty()) {
891 prefix.pop_back();
892 }
893 if ((here != end) && (*here == ' ')) {
894 ++here;
895 }
896 } else {
897 prefix.push_back(*here);
898 ++here;
899 }
900 }
901 // if we've finished with the prefix, update the suffix
902 else {
903 suffix.push_back(*here);
904 ++here;
905 }
906 }
907}
908
909// phrase_match
910//
911// compare two strings, one represented as an UCArray, the other as two
912// UCArray iterators.
913//
914// Return true if the UCArray is the same as the phrase the iterators point
915// to for the length of the UCArray.
916bool phindaction::phrase_match(const UCArray &text, UCArray::const_iterator &here,
917 UCArray::const_iterator end) {
918
919 UCArray::const_iterator one_here = text.begin();
920 UCArray::const_iterator one_end = text.end();
921 UCArray::const_iterator two_here = here;
922
923 // iterate over the length of the first string, comparing each element to
924 // the corresponding element in the second string.
925 while (one_here != one_end) {
926
927 if (two_here == end) {
928 return false;
929 } else if (*one_here != *two_here) {
930 return false;
931 }
932 ++one_here;
933 ++two_here;
934 }
935
936 here = two_here;
937 return true;
938}
939
940bool phindaction::print_documents(bool XMLmode, const text_t &basepath,
941 const text_t &collection,
942 const vector <unsigned long> &docNums,
943 const vector <unsigned long> &docFreq,
944 unsigned long first, unsigned long last,
945 displayclass &disp, outconvertclass &outconvert,
946 ostream &textout) {
947
948 // Create a TextData object to read the document data
949 TextData docdata;
950
951 text_t fullpath = filename_cat(basepath, "docs");
952 char *fullpathc = fullpath.getcstr();
953#if defined __WIN32__
954 char *base = "";
955#else
956 char *base = "/";
957#endif
958
959 if (!docdata.LoadData (base, fullpathc)) {
960 // FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
961 //exit(0);
962 /** DLConsulting 12-07-2004 */
963 return false;
964 }
965
966 delete []fullpathc;
967
968 UCArray title, hash;
969 unsigned long freq, doc;
970
971 for (unsigned long d = first; d < last; ++d) {
972 doc = docNums[d];
973 freq = docFreq[d];
974
975 /** DLConsulting 13-07-2004 */
976 if(!get_document_all_data(docdata, doc, title, hash)) {
977 return false;
978 }
979
980 if (XMLmode) {
981 textout << "<document num=\"" << d
982 << "\" hash=\"" << hash
983 << "\" freq=\"" << freq
984 << "\" title=\"" << title << "\"/>\n";
985 } else {
986 textout << outconvert << disp
987 << "<tr valign=top><td><a href=\"_gwcgi_?"
988 << "c=" << collection;
989 textout << "&a=d&d=" << hash << "\">" << title << "</a>"
990 << "</td><td>" << freq << "</td></tr>\n";
991 }
992 }
993
994 docdata.UnloadData();
995
996 /** DLConsulting 12-07-2004 */
997 return true;
998}
999
1000// Get all the data about a docment
1001//
1002// The document's details are stored in docData as record docNum.
1003// We retrieve:
1004// title - the document's title
1005// hash - the document's unique OID
1006/** Returns:
1007 * false if a fatal error occured, true otherwise
1008 * DLConsulting 12-07-2004
1009 */
1010bool phindaction::get_document_all_data(TextData &docdata, unsigned long docNum,
1011 UCArray &title, UCArray &hash) {
1012
1013 UCArray text;
1014 UCArray docLevel;
1015 SetCStr(docLevel, "Document", 8);
1016
1017 // Look the word up in the textData
1018 if (!GetDocText (docdata, docLevel, docNum, text)) {
1019 // FatalError (1, "Error while trying to get document %u", docNum);
1020 //exit(0);
1021 /** DLConsulting 13-07-2004 */
1022 return false;
1023 }
1024
1025 // Ignore everything up to the first colon
1026 UCArray::iterator next = text.begin();
1027 while (*next++ != '\t');
1028
1029 // Get the document OID (hash)
1030 hash.clear();
1031 for (; *next != '\t'; ++next) {
1032 hash.push_back(*next);
1033 }
1034
1035 // Get the title
1036 text.push_back('\n');
1037 title.clear();
1038 for (++next; *next != '\n'; ++next) {
1039 title.push_back(*next);
1040 }
1041
1042 /** DLConsulting 13-07-2004 */
1043 return true;
1044}
1045
1046void phindaction::toUCArray(const text_t &in, UCArray &out) {
1047 out.clear();
1048 if (out.capacity() < in.size() + 1) {
1049 out.reserve(in.size() + 1);
1050 }
1051 text_t::const_iterator here = in.begin();
1052 text_t::const_iterator end = in.end();
1053 while (here != end) {
1054 out.push_back((unsigned char) *here);
1055 ++here;
1056 }
1057}
1058
1059void phindaction::output_error (const text_t &message, ostream &textout,
1060 outconvertclass &outconvert,
1061 displayclass & disp, ostream &logout,
1062 bool XMLmode) {
1063
1064 logout << outconvert << message << "\n";
1065 if (XMLmode) {
1066 textout << outconvert
1067 << "<phinddata>\n"
1068 << "<phinderror>" << message << "</phinderror>\n"
1069 << "</phinddata>\n";
1070 } else {
1071 textout << outconvert << disp
1072 << "_header_\n"
1073 << message
1074 << "_footer_\n";
1075 }
1076}
1077
1078#endif //GSDL_USE_PHIND_ACTION
1079
Note: See TracBrowser for help on using the repository browser.