source: gsdl/trunk/src/recpt/phindaction.cpp@ 16310

Last change on this file since 16310 was 16310, checked in by davidb, 13 years ago

Introduction of 'collecthome' which parallels 'gsdlhome' to allow the toplevel collect folder to be outside of the gsdlhome area

  • Property svn:keywords set to Author Date Id Revision
File size: 29.3 KB
Line 
1/**********************************************************************
2 *
3 * phindaction.cpp --
4 *
5 * Copyright 2001 Gordon W. Paynter
6 * Copyright 2001 The New Zealand Digital Library Project
7 *
8 * A component of the Greenstone digital library software
9 * from the New Zealand Digital Library Project at the
10 * University of Waikato, New Zealand.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *
26 *********************************************************************/
27
28#include "gsdl_modules_cfg.h"
29#ifdef GSDL_USE_PHIND_ACTION
30
31// Note that this action uses mgpp to retrieve phind info, calling MGQuery
32// etc. directly, not through the protocol. This breaks our receptionist -
33// collection server separation and should be fixed some day I guess.
34
35#include "phindaction.h"
36#include "fileutil.h"
37
38phindaction::phindaction () {
39
40 cgiarginfo arg_ainfo;
41
42 arg_ainfo.shortname = "pc";
43 arg_ainfo.longname = "phind classifier";
44 arg_ainfo.multiplechar = true;
45 arg_ainfo.defaultstatus = cgiarginfo::weak;
46 arg_ainfo.argdefault = g_EmptyText;
47 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
48 argsinfo.addarginfo (NULL, arg_ainfo);
49
50 arg_ainfo.shortname = "pxml";
51 arg_ainfo.longname = "phind XML mode";
52 arg_ainfo.multiplechar = false;
53 arg_ainfo.defaultstatus = cgiarginfo::weak;
54 arg_ainfo.argdefault = "0";
55 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
56 argsinfo.addarginfo (NULL, arg_ainfo);
57
58 arg_ainfo.shortname = "ppnum";
59 arg_ainfo.longname = "phind phrase number";
60 arg_ainfo.multiplechar = true;
61 arg_ainfo.defaultstatus = cgiarginfo::weak;
62 arg_ainfo.argdefault = "0";
63 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
64 argsinfo.addarginfo (NULL, arg_ainfo);
65
66 arg_ainfo.shortname = "pptext";
67 arg_ainfo.longname = "phind phrase text";
68 arg_ainfo.multiplechar = true;
69 arg_ainfo.defaultstatus = cgiarginfo::weak;
70 arg_ainfo.argdefault = g_EmptyText;
71 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
72 argsinfo.addarginfo (NULL, arg_ainfo);
73
74 arg_ainfo.shortname = "pfe";
75 arg_ainfo.longname = "phind first_e";
76 arg_ainfo.multiplechar = true;
77 arg_ainfo.defaultstatus = cgiarginfo::weak;
78 arg_ainfo.argdefault = "0";
79 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
80 argsinfo.addarginfo (NULL, arg_ainfo);
81
82 arg_ainfo.shortname = "ple";
83 arg_ainfo.longname = "phind last_e";
84 arg_ainfo.multiplechar = true;
85 arg_ainfo.defaultstatus = cgiarginfo::weak;
86 arg_ainfo.argdefault = "10";
87 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
88 argsinfo.addarginfo (NULL, arg_ainfo);
89
90 arg_ainfo.shortname = "pfl";
91 arg_ainfo.longname = "phind first_l";
92 arg_ainfo.multiplechar = true;
93 arg_ainfo.defaultstatus = cgiarginfo::weak;
94 arg_ainfo.argdefault = "0";
95 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
96 argsinfo.addarginfo (NULL, arg_ainfo);
97
98 arg_ainfo.shortname = "pll";
99 arg_ainfo.longname = "phind last_l";
100 arg_ainfo.multiplechar = true;
101 arg_ainfo.defaultstatus = cgiarginfo::weak;
102 arg_ainfo.argdefault = "10";
103 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
104 argsinfo.addarginfo (NULL, arg_ainfo);
105
106 arg_ainfo.shortname = "pfd";
107 arg_ainfo.longname = "phind first_d";
108 arg_ainfo.multiplechar = true;
109 arg_ainfo.defaultstatus = cgiarginfo::weak;
110 arg_ainfo.argdefault = "0";
111 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
112 argsinfo.addarginfo (NULL, arg_ainfo);
113
114 arg_ainfo.shortname = "pld";
115 arg_ainfo.longname = "phind last_d";
116 arg_ainfo.multiplechar = true;
117 arg_ainfo.defaultstatus = cgiarginfo::weak;
118 arg_ainfo.argdefault = "10";
119 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
120 argsinfo.addarginfo (NULL, arg_ainfo);
121}
122
123phindaction::~phindaction () {
124}
125
126void phindaction::get_cgihead_info (cgiargsclass &args, recptprotolistclass * /*protos*/,
127 response_t &response,text_t &response_data,
128 ostream &/*logout*/) {
129 response = content;
130 if (args["pxml"] == "1") {
131 response_data = "text/xml";
132 } else {
133 response_data = "text/html";
134 }
135}
136
137bool phindaction::do_action (cgiargsclass &args, recptprotolistclass *protos,
138 browsermapclass * /*browsers*/, displayclass &disp,
139 outconvertclass &outconvert, ostream &textout,
140 ostream &logout) {
141
142 unsigned long count_l, count_e, count_d;
143 unsigned long phrase = args["ppnum"].getulong();
144 text_t &word = args["pptext"];
145 unsigned long first_e = args["pfe"].getulong();
146 unsigned long last_e = args["ple"].getulong();
147 unsigned long first_l = args["pfl"].getulong();
148 unsigned long last_l = args["pll"].getulong();
149 unsigned long first_d = args["pfd"].getulong();
150 unsigned long last_d = args["pld"].getulong();
151 bool XMLmode = false;
152 if (args["pxml"] == "1") XMLmode = true;
153
154 // must have a valid collection server
155 recptproto *collectproto = protos->getrecptproto (args["c"], logout);
156 if (collectproto == NULL) {
157 output_error("phindaction: ERROR: collection not set", textout,
158 outconvert, disp, logout, XMLmode);
159 return true;
160 }
161
162 // the frequency and occurances of the phrase
163 unsigned long tf;
164 vector <unsigned long> el, linkdest, docNums, docfreq;
165 vector <UCArray> linktype;
166
167 // the number of occurances to display
168 unsigned long ef, lf, df;
169
170 text_t basepath = filename_cat(collecthome, args["c"],
171 "index", "phind" + args["pc"]);
172
173 // If we don't know the phrase number, look it up
174 if (phrase == 0) {
175
176 if (word.empty()) {
177 output_error("phindaction: ERROR: no phrase number or word", textout,
178 outconvert, disp, logout, XMLmode);
179 return true;
180 }
181
182 DocNumArray result;
183 /** In order to prevent browser crashing problems, any method which
184 * previously suffered a silent fatal error, now instead returns false
185 * to indicate a fatal error has occured. We can then dispatch an
186 * appropriate error tag to the Phind applet (rather than leave it
187 * whiling away the milliseconds until the end of existence - or at
188 * least your browser - in an infinite loop!)
189 * DLConsulting 12-07-2004
190 */
191 if(!find_phrase_number_from_word(basepath, word, result)) {
192 output_error("phindaction: Fatal Error! Couldn't load index information in find_phrase_number_from_word()",
193 textout, outconvert, disp, logout, XMLmode);
194 return true;
195 }
196
197 if (result.empty()) {
198 output_error("phindaction: The search term does not occur in the collection",
199 textout, outconvert, disp, logout, XMLmode);
200 return true;
201 } else {
202 phrase = result[0];
203 }
204 }
205
206 // Create a TextData object to read the phrase data (pdata)
207 TextData textdata;
208
209 text_t fullpath = filename_cat(basepath, "pdata");
210 char *fullpathc = fullpath.getcstr();
211#if defined __WIN32__
212 char *base = "";
213#else
214 char *base = "/";
215#endif
216
217 if (!textdata.LoadData (base, fullpathc)) {
218 // FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
219 //exit(0);
220 /** We must return something to the client, whether this error is fatal or
221 * no, otherwise we risk sending their browser into an infinite loop!
222 * DLConsulting 12-07-2004
223 */
224 output_error("phindaction: Fatal Error! Couldn't load text information for collection",
225 textout, outconvert, disp, logout, XMLmode);
226 return true;
227 }
228
229 delete []fullpathc;
230
231 /** Another previously silent method can now cry out.
232 * DLConsulting 12-07-2004
233 */
234 if(!get_phrase_all_data(textdata, phrase, word, tf, ef, lf, df, el,
235 linkdest, linktype, docNums, docfreq)) {
236 output_error(
237 "phindaction: Fatal Error! Couldn't parse phrase in get_phrase_all_data()",
238 textout, outconvert, disp, logout, XMLmode);
239 return true;
240 }
241
242 // Output the header
243 if (XMLmode) {
244 textout << "<phinddata id=\"" << phrase
245 << "\" text=\"" << word
246 << "\" tf=\"" << tf
247 << "\" ef=\"" << ef
248 << "\" df=\"" << df
249 << "\" lf=\"" << lf
250 << "\">\n";
251 } else {
252 textout << "<html><head><title>" << word << "</title></head>\n"
253 << "<body><center>\n"
254 << "<p><h1>" << word << "</h1>\n"
255 << "<p><b>"<< word << "</b> occurs "
256 << tf << " times in " << df << " documents\n";
257 }
258
259 // Output the thesaurus links
260 if ((lf > 0) && (first_l < last_l)) {
261
262 // figure out the number of phrases to output
263 if (last_l > lf) {
264 last_l = lf;
265 }
266 count_l = last_l - first_l;
267
268 if (XMLmode) {
269 textout << "<thesauruslist length=\"" << lf
270 << "\" start=\"" << first_l
271 << "\" end=\"" << last_l << "\">\n";
272 /** DLConsulting 12-07-2004 */
273 if(!print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
274 first_l, last_l, disp, outconvert, textout)) {
275 output_error(
276 "phindaction: Fatal Error! Couldn't get phrase in get_phrase_freq_data()",
277 textout, outconvert, disp, logout, XMLmode);
278 return true;
279 }
280 textout << "</thesauruslist>\n";
281 }
282
283 // output links as HTML
284 else {
285 if (count_l == lf) {
286 textout << "<p><b> " << count_l << " thesaurus links</b>\n";
287 } else {
288 textout << "<p><b>" << count_l << " of " << lf << " thesaurus links</b>\n";
289 }
290
291 textout << "<p><table border=1><tr><th>type</th><th>topic</th><th>freq</th><th>docs</th></tr>\n";
292 /** DLConsulting 12-07-2004 */
293 if(!print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
294 first_l, last_l, disp, outconvert, textout)) {
295 output_error(
296 "phindaction: Fatal Error! Couldn't get phrase in get_phrase_freq_data()",
297 textout, outconvert, disp, logout, XMLmode);
298 return true;
299 }
300 textout << "</table>\n";
301
302 if (last_l < lf) {
303 if ((last_l + 10) < lf) {
304 textout << outconvert << disp
305 << "<br><a href=\"_gwcgi_?"
306 << "c=" << args["c"]
307 << "&ppnum=" << phrase
308 << "&pfe=" << first_e
309 << "&ple=" << last_e
310 << "&pfd=" << first_d
311 << "&pld=" << last_d
312 << "&pfl=" << first_l
313 << "&pll=" << (last_l + 10)
314 << "\">Get more thesaurus links</a>\n";
315 }
316 textout << outconvert << disp
317 << "<br><a href=\"_gwcgi_?"
318 << "c=" << args["c"]
319 << "&ppnum=" << phrase
320 << "&pfe=" << first_e
321 << "&ple=" << last_e
322 << "&pfd=" << first_d
323 << "&pld=" << last_d
324 << "&pfl=" << first_l
325 << "&pll=" << lf
326 << "\">Get every thesaurus link</a>\n" ;
327 }
328 }
329 }
330
331 // Output the expansions
332 if ((ef > 0) && (first_e < last_e)) {
333
334 // figure out the number of phrases to output
335 if (last_e > el.size()) {
336 last_e = el.size();
337 }
338 count_e = last_e - first_e;
339
340 // output expansions as XML
341 if (XMLmode) {
342 textout << "<expansionlist length=\"" << ef
343 << "\" start=\"" << first_e
344 << "\" end=\"" << last_e << "\">" << endl;
345
346 print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
347 last_e, disp, outconvert, textout);
348
349 textout << "</expansionlist>\n";
350 }
351
352 // output expansions as HTML
353 else {
354 if (count_e == el.size()) {
355 textout << "<p><b> " << count_e << " expansions</b>\n";
356 } else {
357 textout << "<p><b>" << count_e << " of " << ef << " expansions</b>\n";
358 }
359
360 textout << "<p><table border=1><tr><th colspan=3>phrase</th><th>freq</th><th>docs</th></tr>\n";
361 print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
362 last_e, disp, outconvert, textout);
363 textout << "</table>\n";
364
365 if (last_e < ef) {
366 if ((last_e + 10) < ef) {
367 textout << outconvert << disp
368 << "<br><a href=\"_gwcgi_?"
369 << "c=" << args["c"]
370 << "&ppnum=" << phrase
371 << "&pfe=" << first_e
372 << "&ple=" << (last_e + 10)
373 << "&pfd=" << first_d
374 << "&pld=" << last_d
375 << "&pfl=" << first_l
376 << "&pll=" << last_l
377 << "\">Get more expansions</a>\n";
378 }
379 textout << outconvert << disp
380 << "<br><a href=\"_gwcgi_?"
381 << "c=" << args["c"]
382 << "&ppnum=" << phrase
383 << "&pfe=" << first_e
384 << "&ple=" << ef
385 << "&pfd=" << first_d
386 << "&pld=" << last_d
387 << "&pfl=" << first_l
388 << "&pll=" << last_l
389 << "\">Get every expansion</a>\n";
390 }
391 }
392 }
393
394 // Output the document occurances
395 if ((df > 0) && (first_d < last_d)) {
396
397 // figure out the phrases to output
398 if (last_d > docNums.size()) {
399 last_d = docNums.size();
400 }
401 count_d = last_d - first_d;
402
403 // output document list as XML
404 if (XMLmode) {
405 textout << "<documentlist length=\"" << df
406 << "\" start=\"" << first_d
407 << "\" end=\"" << last_d << "\">\n";
408
409 if(!print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
410 first_d, last_d, disp, outconvert, textout)) {
411 output_error(
412 "phindaction: Fatal Error! Couldn't load text information in print_documents() or get_document_all_data()",
413 textout, outconvert, disp, logout, XMLmode);
414 return true;
415 }
416
417 textout << "</documentlist>\n";
418 }
419
420 // output document list as HTML
421 else {
422
423 if (count_d == docNums.size()) {
424 textout << "<p><b> " << count_d << " documents</b>\n";
425 } else {
426 textout << "<p><b>" << count_d << " of " << df << " documents</b>\n";
427 }
428
429 textout << "<p><table border=1><tr><th align=left>document</th><th>freq</th></tr>\n";
430 if(!print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
431 first_d, last_d, disp, outconvert, textout)) {
432 output_error(
433 "phindaction: Fatal Error! Couldn't load text information in print_documents()",
434 textout, outconvert, disp, logout, XMLmode);
435 return true;
436 }
437 textout << "</table>\n";
438
439 if (last_d < df) {
440 if ((last_d + 10) < df) {
441 textout << outconvert << disp
442 << "<br><a href=\"_gwcgi_?"
443 << "c=" << args["c"]
444 << "&ppnum=" << phrase
445 << "&pfe=" << first_e
446 << "&ple=" << last_e
447 << "&pfd=" << first_d
448 << "&pld=" << (last_d + 10)
449 << "&pfl=" << first_l
450 << "&pll=" << last_l
451 << "\">Get more documents</a>\n";
452 }
453 textout << outconvert << disp
454 << "<br><a href=\"_gwcgi_?"
455 << "c=" << args["c"]
456 << "&ppnum=" << phrase
457 << "&pfe=" << first_e
458 << "&ple=" << last_e
459 << "&pfd=" << first_d
460 << "&pld=" << df
461 << "&pfl=" << first_l
462 << "&pll=" << last_l
463 << "\">Get every document</a>\n";
464 }
465 }
466 }
467
468 // Close the document
469 if (XMLmode) {
470 textout << "</phinddata>\n";
471 } else {
472 textout << "</center></body></html>\n";
473 }
474
475 textdata.UnloadData ();
476
477 return true;
478}
479
480// Find the phrase number of a word in the index file
481bool phindaction::find_phrase_number_from_word(const text_t &basepath,
482 const text_t &query,
483 DocNumArray &result) {
484
485 // Open the index file for searching
486 IndexData indexData;
487
488 text_t fullpath = filename_cat(basepath, "pword");
489 char *fullpathc = fullpath.getcstr();
490#if defined __WIN32__
491 char *base = "";
492#else
493 char *base = "/";
494#endif
495
496 if (!indexData.LoadData (base, fullpathc)) {
497 // FatalError (1, "Couldn't load index information for \"%s\"", fullpathc);
498 //exit(0);
499 /** Don't handle fatal errors here anymore.
500 * DLConsulting 12-07-2004
501 */
502 return false; // Indicates something very bad has happened
503 }
504
505 delete []fullpathc;
506
507 // set up the query object
508 QueryInfo queryInfo;
509 SetCStr (queryInfo.docLevel, "Document", 8);
510 queryInfo.maxDocs = 5;
511 queryInfo.sortByRank = true;
512 queryInfo.exactWeights = false;
513 queryInfo.needRankInfo = true;
514 queryInfo.needTermFreqs = true;
515
516 // mode 1 = casefolded, unstemmed search
517 UCArray ucquery;
518 toUCArray(query, ucquery);
519 QueryNode *queryTree = ParseQuery(ucquery, 1, 1, 4);
520
521 // perform the query
522 ExtQueryResult queryResult;
523 MGQuery (indexData, queryInfo, queryTree, queryResult);
524 // cout << "-- word lookup result -- " << endl << queryResult << endl ;
525
526 result.clear();
527 result = queryResult.docs;
528
529 // delete the query
530 if (queryTree != NULL) delete queryTree;
531
532 indexData.UnloadData();
533
534 /** This method now returns a boolean, so...
535 * DLConsulting 12-07-2004
536 */
537 return true; // Indicates that what happened is all good, baby.
538}
539
540// Get all the data about a phrase
541//
542// The phrase is stored in textData as record phrase.
543// We retrieve:
544// word - the text of the phrase
545// tf - the total frequency of the phrase
546// ef - the expansion frequency of the phrase
547// lf - the thesaurus link frequency of the phrase
548// df - the document frequency of the phrase
549// el - the list of phrases that are expansions of phrase
550// ll - the list of phrases that are thesaurus links
551// dl - the list of documents that contain phrase
552bool phindaction::get_phrase_all_data(TextData &textdata, unsigned long phrase,
553 text_t &word, unsigned long &tf, unsigned long &ef,
554 unsigned long &lf, unsigned long &df,
555 vector <unsigned long> &el,
556 vector <unsigned long> &linkdest,
557 vector <UCArray> &linktype,
558 vector <unsigned long> &docnum,
559 vector <unsigned long> &docfrq) {
560 UCArray text;
561 UCArray docLevel;
562 SetCStr(docLevel, "Document", 8);
563
564 // Look the word up in the textData
565 if (!GetDocText (textdata, docLevel, phrase, text)) {
566 // FatalError (1, "Error while trying to get phrase %u", phrase);
567 //exit(0);
568 return false; // Something very bad has happened.
569 }
570
571 // Ignore everything up to the first colon
572 UCArray::iterator next = text.begin();
573 while (*next++ != ':');
574
575 // ignore training carriage returns
576 while (text.back() == '\n') {
577 text.pop_back();
578 }
579
580 // Get the word
581 word.clear();
582 for (; *next != ':'; ++next) {
583 word.push_back(*next);
584 }
585
586 // Get total frequency
587 tf = 0;
588 for (++next; *next != ':'; ++next) {
589 tf *= 10;
590 tf += (*next - '0');
591 }
592
593 // Get expansion frequency
594 ef = 0;
595 for (++next; *next != ':'; ++next) {
596 ef *= 10;
597 ef += (*next - '0');
598 }
599
600 // Get document frequency
601 df = 0;
602 for (++next; *next != ':'; ++next) {
603 df *= 10;
604 df += (*next - '0');
605 }
606
607 // Get expansion list
608 el.clear();
609 unsigned long e = 0;
610 for (++next; *next != ':'; ++next) {
611 if (*next == ',') {
612 el.push_back(e);
613 e = 0;
614 } else {
615 e *= 10;
616 e += (*next - '0');
617 }
618 }
619
620 // Get document list & the document frequency list
621 docnum.clear();
622 docfrq.clear();
623 bool readnum = false;
624 unsigned long d = 0;
625 for (++next; *next != ':'; ++next) {
626 if (*next == ',') {
627 docnum.push_back(d);
628 readnum = true;
629 d = 0;
630 } else if (*next == ';') {
631 if (readnum) {
632 docfrq.push_back(d);
633 } else {
634 docnum.push_back(d);
635 docfrq.push_back(1);
636 }
637 readnum = false;
638 d = 0;
639 } else {
640 d *= 10;
641 d += (*next - '0');
642 }
643 }
644
645 // Get thesaurus link frequency & link list
646 text.push_back(':');
647 text.push_back(':');
648
649 // link frequency
650 lf = 0;
651 for (++next; *next != ':'; ++next) {
652 lf *= 10;
653 lf += (*next - '0');
654 }
655
656 // two lists of link data
657 linkdest.clear();
658 linktype.clear();
659
660 UCArray thistype;
661 thistype.clear();
662 bool typedone = false;
663 unsigned long l = 0;
664 for (++next; *next != ':'; ++next) {
665
666 if (!typedone) {
667 // first read the link type, a charactor string
668 if (*next == ',') {
669 typedone = true;
670 } else {
671 thistype.push_back(*next);
672 }
673 } else {
674 // having read the link type, read the list of link destinations
675 if (*next == ',') {
676 linkdest.push_back(l);
677 linktype.push_back(thistype);
678 l = 0;
679 } else if (*next == ';') {
680 linkdest.push_back(l);
681 linktype.push_back(thistype);
682 l = 0;
683 thistype.clear();
684 typedone = false;
685 } else {
686 l *= 10;
687 l += (*next - '0');
688 }
689 }
690 }
691
692 return true; // Indicates that what happened is all good, baby.
693}
694
695bool phindaction::print_thesaurus_links(const text_t &collection, bool XMLmode,
696 TextData &textdata, vector <unsigned long> &linkdest,
697 vector <UCArray> &linktype, unsigned long first,
698 unsigned long last, displayclass &disp,
699 outconvertclass &outconvert, ostream &textout) {
700
701 // information describing each link in the list
702 unsigned long phrase, tf, ef, df;
703 UCArray type, text;
704
705 for (unsigned long l = first; l < last; ++l) {
706
707 // get the phrase data
708 phrase = linkdest[l];
709 type = linktype[l];
710
711 /** DLConsulting 12-07-2004 */
712 if(!get_phrase_freq_data(textdata, phrase, text, tf, ef, df)) {
713 return false;
714 }
715
716 if (XMLmode) {
717 textout << "<thesaurus num=\"" << l
718 << "\" id=\"" << phrase
719 << "\" tf=\"" << tf
720 << "\" df=\"" << df
721 << "\" type=\"" << type
722 << "\" text=\"" << text
723 << "\"/>\n";
724 } else {
725 textout << "<tr valign=top><td>" << type << "</td><td>";
726 textout << outconvert << disp
727 << "<a href=\"_gwcgi_?c=" << collection;
728 textout << "&ppnum=" << phrase << "\">" << text << "</a>"
729 << "</td><td>" << tf << "</td><td>" << df << "</td></tr>\n";
730 }
731 }
732
733 /** DLConsulting 12-07-2004 */
734 return true;
735}
736
737// Get the frequency data about a phrase
738//
739// The phrase is stored in textData as record phrase.
740// We retrieve:
741// word - the text of the phrase
742// tf - the total frequency of the phrase
743// ef - the expansion frequency of the phrase
744// df - the document frequency of the phrase
745/**
746 * Returns:
747 * false if the method suffered a fatal error, true otherwise
748 */
749bool phindaction::get_phrase_freq_data(TextData &textdata, unsigned long phrase,
750 UCArray &word, unsigned long &tf,
751 unsigned long &ef, unsigned long &df) {
752
753 UCArray text;
754 UCArray docLevel;
755 SetCStr(docLevel, "Document", 8);
756
757 // Look the word up in the textData
758 if (!GetDocText (textdata, docLevel, phrase, text)) {
759 // FatalError (1, "Error while trying to get phrase %u", phrase);
760 //exit(0);
761 /** DLConsulting 12-07-2004 */
762 return false;
763 }
764
765 // Ignore everything up to the first colon
766 UCArray::iterator next = text.begin();
767 while (*next++ != ':');
768
769 // Get the word
770 word.clear();
771 for (; *next != ':'; ++next) {
772 word.push_back(*next);
773 }
774
775 // Get total frequency
776 tf = 0;
777 for (++next; *next != ':'; ++next) {
778 tf *= 10;
779 tf += (*next - '0');
780 }
781
782 // Get expansion frequency
783 ef = 0;
784 for (++next; *next != ':'; ++next) {
785 ef *= 10;
786 ef += (*next - '0');
787 }
788
789 // Get document frequency
790 df = 0;
791 for (++next; *next != ':'; ++next) {
792 df *= 10;
793 df += (*next - '0');
794 }
795
796 /** DLConsulting 12-07-2004 */
797 return true;
798}
799
800// Print a list of expansions
801//
802// Given the textData and a list of phrase numbers, print out each of the
803// expansions.
804void phindaction::print_expansions(const text_t &collection, bool XMLmode,
805 const text_t &body, TextData &textdata,
806 const vector <unsigned long> &elist,
807 unsigned long first, unsigned long last,
808 displayclass &disp, outconvertclass &outconvert,
809 ostream &textout) {
810
811 UCArray word;
812 unsigned long phrase, tf, df, ef;
813
814 UCArray suffix, prefix, ucbody;
815
816 toUCArray(body, ucbody);
817
818 for (unsigned long e = first; e < last; ++e) {
819
820 phrase = elist[e];
821 get_phrase_freq_data(textdata, phrase, word, tf, ef, df);
822
823 split_phrase(word, ucbody, prefix, suffix);
824
825 if (XMLmode) {
826 // body is always the same as the text of the phrase, so no need to send it
827 textout << "<expansion num=\"" << e
828 << "\" id=\"" << phrase
829 << "\" tf=\"" << tf
830 << "\" df=\"" << df;
831 if (!prefix.empty()) {
832 textout << "\" prefix=\"" << prefix;
833 }
834 if (!suffix.empty()) {
835 textout << "\" suffix=\"" << suffix;
836 }
837 textout << "\"/>\n";
838 } else {
839 textout << outconvert << disp
840 << "<tr valign=top><td align=right><a href=\"_gwcgi_?"
841 << "c=" << collection << "&ppnum=" << phrase << "\">";
842 textout << prefix << "</a></td>";
843 textout <<outconvert << disp
844 << "<td align=center><a href=\"_gwcgi_?"
845 << "c=" << collection << "&ppnum=" << phrase << "\">"
846 << body << "</a></td>"
847 << "<td align=left><a href=\"_gwcgi_?"
848 << "c=" << collection << "&ppnum=" << phrase << "\">";
849 textout << suffix << "</a></td>"
850 << "<td>" << tf << "</td><td>" << df << "</td></tr>\n";
851 }
852 }
853}
854
855// split an expansion into prefix and suffix
856void phindaction::split_phrase(const UCArray &word, const UCArray &body,
857 UCArray &prefix, UCArray &suffix) {
858
859 prefix.clear();
860 suffix.clear();
861
862 bool readingPrefix = true;
863 UCArray::const_iterator here = word.begin();
864 UCArray::const_iterator end = word.end();
865
866 while (here != end) {
867
868 // if we've not read all the prefix, add the next char to the prefix
869 if (readingPrefix) {
870 if (phrase_match(body, here, end)) {
871 readingPrefix = false;
872 // trim whitespace from end of prefix & start of suffix
873 if (!prefix.empty()) {
874 prefix.pop_back();
875 }
876 if ((here != end) && (*here == ' ')) {
877 ++here;
878 }
879 } else {
880 prefix.push_back(*here);
881 ++here;
882 }
883 }
884 // if we've finished with the prefix, update the suffix
885 else {
886 suffix.push_back(*here);
887 ++here;
888 }
889 }
890}
891
892// phrase_match
893//
894// compare two strings, one represented as an UCArray, the other as two
895// UCArray iterators.
896//
897// Return true if the UCArray is the same as the phrase the iterators point
898// to for the length of the UCArray.
899bool phindaction::phrase_match(const UCArray &text, UCArray::const_iterator &here,
900 UCArray::const_iterator end) {
901
902 UCArray::const_iterator one_here = text.begin();
903 UCArray::const_iterator one_end = text.end();
904 UCArray::const_iterator two_here = here;
905
906 // iterate over the length of the first string, comparing each element to
907 // the corresponding element in the second string.
908 while (one_here != one_end) {
909
910 if (two_here == end) {
911 return false;
912 } else if (*one_here != *two_here) {
913 return false;
914 }
915 ++one_here;
916 ++two_here;
917 }
918
919 here = two_here;
920 return true;
921}
922
923bool phindaction::print_documents(bool XMLmode, const text_t &basepath,
924 const text_t &collection,
925 const vector <unsigned long> &docNums,
926 const vector <unsigned long> &docFreq,
927 unsigned long first, unsigned long last,
928 displayclass &disp, outconvertclass &outconvert,
929 ostream &textout) {
930
931 // Create a TextData object to read the document data
932 TextData docdata;
933
934 text_t fullpath = filename_cat(basepath, "docs");
935 char *fullpathc = fullpath.getcstr();
936#if defined __WIN32__
937 char *base = "";
938#else
939 char *base = "/";
940#endif
941
942 if (!docdata.LoadData (base, fullpathc)) {
943 // FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
944 //exit(0);
945 /** DLConsulting 12-07-2004 */
946 return false;
947 }
948
949 delete []fullpathc;
950
951 UCArray title, hash;
952 unsigned long freq, doc;
953
954 for (unsigned long d = first; d < last; ++d) {
955 doc = docNums[d];
956 freq = docFreq[d];
957
958 /** DLConsulting 13-07-2004 */
959 if(!get_document_all_data(docdata, doc, title, hash)) {
960 return false;
961 }
962
963 if (XMLmode) {
964 textout << "<document num=\"" << d
965 << "\" hash=\"" << hash
966 << "\" freq=\"" << freq
967 << "\" title=\"" << title << "\"/>\n";
968 } else {
969 textout << outconvert << disp
970 << "<tr valign=top><td><a href=\"_gwcgi_?"
971 << "c=" << collection;
972 textout << "&a=d&d=" << hash << "\">" << title << "</a>"
973 << "</td><td>" << freq << "</td></tr>\n";
974 }
975 }
976
977 docdata.UnloadData();
978
979 /** DLConsulting 12-07-2004 */
980 return true;
981}
982
983// Get all the data about a docment
984//
985// The document's details are stored in docData as record docNum.
986// We retrieve:
987// title - the document's title
988// hash - the document's unique OID
989/** Returns:
990 * false if a fatal error occured, true otherwise
991 * DLConsulting 12-07-2004
992 */
993bool phindaction::get_document_all_data(TextData &docdata, unsigned long docNum,
994 UCArray &title, UCArray &hash) {
995
996 UCArray text;
997 UCArray docLevel;
998 SetCStr(docLevel, "Document", 8);
999
1000 // Look the word up in the textData
1001 if (!GetDocText (docdata, docLevel, docNum, text)) {
1002 // FatalError (1, "Error while trying to get document %u", docNum);
1003 //exit(0);
1004 /** DLConsulting 13-07-2004 */
1005 return false;
1006 }
1007
1008 // Ignore everything up to the first colon
1009 UCArray::iterator next = text.begin();
1010 while (*next++ != '\t');
1011
1012 // Get the document OID (hash)
1013 hash.clear();
1014 for (; *next != '\t'; ++next) {
1015 hash.push_back(*next);
1016 }
1017
1018 // Get the title
1019 text.push_back('\n');
1020 title.clear();
1021 for (++next; *next != '\n'; ++next) {
1022 title.push_back(*next);
1023 }
1024
1025 /** DLConsulting 13-07-2004 */
1026 return true;
1027}
1028
1029void phindaction::toUCArray(const text_t &in, UCArray &out) {
1030 out.clear();
1031 if (out.capacity() < in.size() + 1) {
1032 out.reserve(in.size() + 1);
1033 }
1034 text_t::const_iterator here = in.begin();
1035 text_t::const_iterator end = in.end();
1036 while (here != end) {
1037 out.push_back((unsigned char) *here);
1038 ++here;
1039 }
1040}
1041
1042void phindaction::output_error (const text_t &message, ostream &textout,
1043 outconvertclass &outconvert,
1044 displayclass & disp, ostream &logout,
1045 bool XMLmode) {
1046
1047 logout << outconvert << message << "\n";
1048 if (XMLmode) {
1049 textout << outconvert
1050 << "<phinddata>\n"
1051 << "<phinderror>" << message << "</phinderror>\n"
1052 << "</phinddata>\n";
1053 } else {
1054 textout << outconvert << disp
1055 << "_header_\n"
1056 << message
1057 << "_footer_\n";
1058 }
1059}
1060
1061#endif //GSDL_USE_PHIND_ACTION
1062
Note: See TracBrowser for help on using the repository browser.