source: main/trunk/greenstone2/runtime-src/src/recpt/phindaction.cpp@ 21973

Last change on this file since 21973 was 21973, checked in by kjdon, 11 years ago

Need to convert query between unicode and utf8 for mgpp

  • Property svn:keywords set to Author Date Id Revision
File size: 29.5 KB
Line 
1/**********************************************************************
2 *
3 * phindaction.cpp --
4 *
5 * Copyright 2001 Gordon W. Paynter
6 * Copyright 2001 The New Zealand Digital Library Project
7 *
8 * A component of the Greenstone digital library software
9 * from the New Zealand Digital Library Project at the
10 * University of Waikato, New Zealand.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *
26 *********************************************************************/
27
28#include "gsdl_modules_cfg.h"
29#ifdef GSDL_USE_PHIND_ACTION
30
31// Note that this action uses mgpp to retrieve phind info, calling MGQuery
32// etc. directly, not through the protocol. This breaks our receptionist -
33// collection server separation and should be fixed some day I guess.
34
35#include "phindaction.h"
36#include "fileutil.h"
37#include "gsdlunicode.h"
38
39phindaction::phindaction () {
40
41 cgiarginfo arg_ainfo;
42
43 arg_ainfo.shortname = "pc";
44 arg_ainfo.longname = "phind classifier";
45 arg_ainfo.multiplechar = true;
46 arg_ainfo.defaultstatus = cgiarginfo::weak;
47 arg_ainfo.argdefault = g_EmptyText;
48 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
49 argsinfo.addarginfo (NULL, arg_ainfo);
50
51 arg_ainfo.shortname = "pxml";
52 arg_ainfo.longname = "phind XML mode";
53 arg_ainfo.multiplechar = false;
54 arg_ainfo.defaultstatus = cgiarginfo::weak;
55 arg_ainfo.argdefault = "0";
56 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
57 argsinfo.addarginfo (NULL, arg_ainfo);
58
59 arg_ainfo.shortname = "ppnum";
60 arg_ainfo.longname = "phind phrase number";
61 arg_ainfo.multiplechar = true;
62 arg_ainfo.defaultstatus = cgiarginfo::weak;
63 arg_ainfo.argdefault = "0";
64 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
65 argsinfo.addarginfo (NULL, arg_ainfo);
66
67 arg_ainfo.shortname = "pptext";
68 arg_ainfo.longname = "phind phrase text";
69 arg_ainfo.multiplechar = true;
70 arg_ainfo.defaultstatus = cgiarginfo::weak;
71 arg_ainfo.argdefault = g_EmptyText;
72 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
73 argsinfo.addarginfo (NULL, arg_ainfo);
74
75 arg_ainfo.shortname = "pfe";
76 arg_ainfo.longname = "phind first_e";
77 arg_ainfo.multiplechar = true;
78 arg_ainfo.defaultstatus = cgiarginfo::weak;
79 arg_ainfo.argdefault = "0";
80 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
81 argsinfo.addarginfo (NULL, arg_ainfo);
82
83 arg_ainfo.shortname = "ple";
84 arg_ainfo.longname = "phind last_e";
85 arg_ainfo.multiplechar = true;
86 arg_ainfo.defaultstatus = cgiarginfo::weak;
87 arg_ainfo.argdefault = "10";
88 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
89 argsinfo.addarginfo (NULL, arg_ainfo);
90
91 arg_ainfo.shortname = "pfl";
92 arg_ainfo.longname = "phind first_l";
93 arg_ainfo.multiplechar = true;
94 arg_ainfo.defaultstatus = cgiarginfo::weak;
95 arg_ainfo.argdefault = "0";
96 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
97 argsinfo.addarginfo (NULL, arg_ainfo);
98
99 arg_ainfo.shortname = "pll";
100 arg_ainfo.longname = "phind last_l";
101 arg_ainfo.multiplechar = true;
102 arg_ainfo.defaultstatus = cgiarginfo::weak;
103 arg_ainfo.argdefault = "10";
104 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
105 argsinfo.addarginfo (NULL, arg_ainfo);
106
107 arg_ainfo.shortname = "pfd";
108 arg_ainfo.longname = "phind first_d";
109 arg_ainfo.multiplechar = true;
110 arg_ainfo.defaultstatus = cgiarginfo::weak;
111 arg_ainfo.argdefault = "0";
112 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
113 argsinfo.addarginfo (NULL, arg_ainfo);
114
115 arg_ainfo.shortname = "pld";
116 arg_ainfo.longname = "phind last_d";
117 arg_ainfo.multiplechar = true;
118 arg_ainfo.defaultstatus = cgiarginfo::weak;
119 arg_ainfo.argdefault = "10";
120 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
121 argsinfo.addarginfo (NULL, arg_ainfo);
122}
123
124phindaction::~phindaction () {
125}
126
127void phindaction::get_cgihead_info (cgiargsclass &args, recptprotolistclass * /*protos*/,
128 response_t &response,text_t &response_data,
129 ostream &/*logout*/) {
130 response = content;
131 if (args["pxml"] == "1") {
132 response_data = "text/xml";
133 } else {
134 response_data = "text/html";
135 }
136}
137
138bool phindaction::do_action (cgiargsclass &args, recptprotolistclass *protos,
139 browsermapclass * /*browsers*/, displayclass &disp,
140 outconvertclass &outconvert, ostream &textout,
141 ostream &logout) {
142
143 unsigned long count_l, count_e, count_d;
144 unsigned long phrase = args["ppnum"].getulong();
145 text_t &word = args["pptext"];
146 unsigned long first_e = args["pfe"].getulong();
147 unsigned long last_e = args["ple"].getulong();
148 unsigned long first_l = args["pfl"].getulong();
149 unsigned long last_l = args["pll"].getulong();
150 unsigned long first_d = args["pfd"].getulong();
151 unsigned long last_d = args["pld"].getulong();
152 bool XMLmode = false;
153 if (args["pxml"] == "1") XMLmode = true;
154
155 // must have a valid collection server
156 recptproto *collectproto = protos->getrecptproto (args["c"], logout);
157 if (collectproto == NULL) {
158 output_error("phindaction: ERROR: collection not set", textout,
159 outconvert, disp, logout, XMLmode);
160 return true;
161 }
162
163 // the frequency and occurances of the phrase
164 unsigned long tf;
165 vector <unsigned long> el, linkdest, docNums, docfreq;
166 vector <UCArray> linktype;
167
168 // the number of occurances to display
169 unsigned long ef, lf, df;
170
171 text_t basepath = filename_cat(collecthome, args["c"],
172 "index", "phind" + args["pc"]);
173
174 // If we don't know the phrase number, look it up
175 if (phrase == 0) {
176
177 if (word.empty()) {
178 output_error("phindaction: ERROR: no phrase number or word", textout,
179 outconvert, disp, logout, XMLmode);
180 return true;
181 }
182
183 DocNumArray result;
184 /** In order to prevent browser crashing problems, any method which
185 * previously suffered a silent fatal error, now instead returns false
186 * to indicate a fatal error has occured. We can then dispatch an
187 * appropriate error tag to the Phind applet (rather than leave it
188 * whiling away the milliseconds until the end of existence - or at
189 * least your browser - in an infinite loop!)
190 * DLConsulting 12-07-2004
191 */
192
193 if(!find_phrase_number_from_word(basepath, word, result)) {
194 output_error("phindaction: Fatal Error! Couldn't load index information in find_phrase_number_from_word()",
195 textout, outconvert, disp, logout, XMLmode);
196 return true;
197 }
198
199 if (result.empty()) {
200 output_error("phindaction: The search term ("+word+") does not occur in the collection",
201 textout, outconvert, disp, logout, XMLmode);
202 return true;
203 } else {
204 phrase = result[0];
205 }
206 }
207
208 // Create a TextData object to read the phrase data (pdata)
209 TextData textdata;
210
211 text_t fullpath = filename_cat(basepath, "pdata");
212 char *fullpathc = fullpath.getcstr();
213#if defined __WIN32__
214 char *base = "";
215#else
216 char *base = "/";
217#endif
218
219 if (!textdata.LoadData (base, fullpathc)) {
220 // FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
221 //exit(0);
222 /** We must return something to the client, whether this error is fatal or
223 * no, otherwise we risk sending their browser into an infinite loop!
224 * DLConsulting 12-07-2004
225 */
226 output_error("phindaction: Fatal Error! Couldn't load text information for collection",
227 textout, outconvert, disp, logout, XMLmode);
228 return true;
229 }
230
231 delete []fullpathc;
232
233 /** Another previously silent method can now cry out.
234 * DLConsulting 12-07-2004
235 */
236 if(!get_phrase_all_data(textdata, phrase, word, tf, ef, lf, df, el,
237 linkdest, linktype, docNums, docfreq)) {
238 output_error(
239 "phindaction: Fatal Error! Couldn't parse phrase in get_phrase_all_data()",
240 textout, outconvert, disp, logout, XMLmode);
241 return true;
242 }
243
244 // Output the header
245 if (XMLmode) {
246 textout << "<phinddata id=\"" << phrase
247 << "\" text=\"" << word
248 << "\" tf=\"" << tf
249 << "\" ef=\"" << ef
250 << "\" df=\"" << df
251 << "\" lf=\"" << lf
252 << "\">\n";
253 } else {
254 textout << "<html><head><title>" << word << "</title></head>\n"
255 << "<body><center>\n"
256 << "<p><h1>" << word << "</h1>\n"
257 << "<p><b>"<< word << "</b> occurs "
258 << tf << " times in " << df << " documents\n";
259 }
260
261 // Output the thesaurus links
262 if ((lf > 0) && (first_l < last_l)) {
263
264 // figure out the number of phrases to output
265 if (last_l > lf) {
266 last_l = lf;
267 }
268 count_l = last_l - first_l;
269
270 if (XMLmode) {
271 textout << "<thesauruslist length=\"" << lf
272 << "\" start=\"" << first_l
273 << "\" end=\"" << last_l << "\">\n";
274 /** DLConsulting 12-07-2004 */
275 if(!print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
276 first_l, last_l, disp, outconvert, textout)) {
277 output_error(
278 "phindaction: Fatal Error! Couldn't get phrase in get_phrase_freq_data()",
279 textout, outconvert, disp, logout, XMLmode);
280 return true;
281 }
282 textout << "</thesauruslist>\n";
283 }
284
285 // output links as HTML
286 else {
287 if (count_l == lf) {
288 textout << "<p><b> " << count_l << " thesaurus links</b>\n";
289 } else {
290 textout << "<p><b>" << count_l << " of " << lf << " thesaurus links</b>\n";
291 }
292
293 textout << "<p><table border=1><tr><th>type</th><th>topic</th><th>freq</th><th>docs</th></tr>\n";
294 /** DLConsulting 12-07-2004 */
295 if(!print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
296 first_l, last_l, disp, outconvert, textout)) {
297 output_error(
298 "phindaction: Fatal Error! Couldn't get phrase in get_phrase_freq_data()",
299 textout, outconvert, disp, logout, XMLmode);
300 return true;
301 }
302 textout << "</table>\n";
303
304 if (last_l < lf) {
305 if ((last_l + 10) < lf) {
306 textout << outconvert << disp
307 << "<br><a href=\"_gwcgi_?"
308 << "c=" << args["c"]
309 << "&ppnum=" << phrase
310 << "&pfe=" << first_e
311 << "&ple=" << last_e
312 << "&pfd=" << first_d
313 << "&pld=" << last_d
314 << "&pfl=" << first_l
315 << "&pll=" << (last_l + 10)
316 << "\">Get more thesaurus links</a>\n";
317 }
318 textout << outconvert << disp
319 << "<br><a href=\"_gwcgi_?"
320 << "c=" << args["c"]
321 << "&ppnum=" << phrase
322 << "&pfe=" << first_e
323 << "&ple=" << last_e
324 << "&pfd=" << first_d
325 << "&pld=" << last_d
326 << "&pfl=" << first_l
327 << "&pll=" << lf
328 << "\">Get every thesaurus link</a>\n" ;
329 }
330 }
331 }
332
333 // Output the expansions
334 if ((ef > 0) && (first_e < last_e)) {
335
336 // figure out the number of phrases to output
337 if (last_e > el.size()) {
338 last_e = el.size();
339 }
340 count_e = last_e - first_e;
341
342 // output expansions as XML
343 if (XMLmode) {
344 textout << "<expansionlist length=\"" << ef
345 << "\" start=\"" << first_e
346 << "\" end=\"" << last_e << "\">" << endl;
347
348 print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
349 last_e, disp, outconvert, textout);
350
351 textout << "</expansionlist>\n";
352 }
353
354 // output expansions as HTML
355 else {
356 if (count_e == el.size()) {
357 textout << "<p><b> " << count_e << " expansions</b>\n";
358 } else {
359 textout << "<p><b>" << count_e << " of " << ef << " expansions</b>\n";
360 }
361
362 textout << "<p><table border=1><tr><th colspan=3>phrase</th><th>freq</th><th>docs</th></tr>\n";
363 print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
364 last_e, disp, outconvert, textout);
365 textout << "</table>\n";
366
367 if (last_e < ef) {
368 if ((last_e + 10) < ef) {
369 textout << outconvert << disp
370 << "<br><a href=\"_gwcgi_?"
371 << "c=" << args["c"]
372 << "&ppnum=" << phrase
373 << "&pfe=" << first_e
374 << "&ple=" << (last_e + 10)
375 << "&pfd=" << first_d
376 << "&pld=" << last_d
377 << "&pfl=" << first_l
378 << "&pll=" << last_l
379 << "\">Get more expansions</a>\n";
380 }
381 textout << outconvert << disp
382 << "<br><a href=\"_gwcgi_?"
383 << "c=" << args["c"]
384 << "&ppnum=" << phrase
385 << "&pfe=" << first_e
386 << "&ple=" << ef
387 << "&pfd=" << first_d
388 << "&pld=" << last_d
389 << "&pfl=" << first_l
390 << "&pll=" << last_l
391 << "\">Get every expansion</a>\n";
392 }
393 }
394 }
395
396 // Output the document occurances
397 if ((df > 0) && (first_d < last_d)) {
398
399 // figure out the phrases to output
400 if (last_d > docNums.size()) {
401 last_d = docNums.size();
402 }
403 count_d = last_d - first_d;
404
405 // output document list as XML
406 if (XMLmode) {
407 textout << "<documentlist length=\"" << df
408 << "\" start=\"" << first_d
409 << "\" end=\"" << last_d << "\">\n";
410
411 if(!print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
412 first_d, last_d, disp, outconvert, textout)) {
413 output_error(
414 "phindaction: Fatal Error! Couldn't load text information in print_documents() or get_document_all_data()",
415 textout, outconvert, disp, logout, XMLmode);
416 return true;
417 }
418
419 textout << "</documentlist>\n";
420 }
421
422 // output document list as HTML
423 else {
424
425 if (count_d == docNums.size()) {
426 textout << "<p><b> " << count_d << " documents</b>\n";
427 } else {
428 textout << "<p><b>" << count_d << " of " << df << " documents</b>\n";
429 }
430
431 textout << "<p><table border=1><tr><th align=left>document</th><th>freq</th></tr>\n";
432 if(!print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
433 first_d, last_d, disp, outconvert, textout)) {
434 output_error(
435 "phindaction: Fatal Error! Couldn't load text information in print_documents()",
436 textout, outconvert, disp, logout, XMLmode);
437 return true;
438 }
439 textout << "</table>\n";
440
441 if (last_d < df) {
442 if ((last_d + 10) < df) {
443 textout << outconvert << disp
444 << "<br><a href=\"_gwcgi_?"
445 << "c=" << args["c"]
446 << "&ppnum=" << phrase
447 << "&pfe=" << first_e
448 << "&ple=" << last_e
449 << "&pfd=" << first_d
450 << "&pld=" << (last_d + 10)
451 << "&pfl=" << first_l
452 << "&pll=" << last_l
453 << "\">Get more documents</a>\n";
454 }
455 textout << outconvert << disp
456 << "<br><a href=\"_gwcgi_?"
457 << "c=" << args["c"]
458 << "&ppnum=" << phrase
459 << "&pfe=" << first_e
460 << "&ple=" << last_e
461 << "&pfd=" << first_d
462 << "&pld=" << df
463 << "&pfl=" << first_l
464 << "&pll=" << last_l
465 << "\">Get every document</a>\n";
466 }
467 }
468 }
469
470 // Close the document
471 if (XMLmode) {
472 textout << "</phinddata>\n";
473 } else {
474 textout << "</center></body></html>\n";
475 }
476
477 textdata.UnloadData ();
478
479 return true;
480}
481
482// Find the phrase number of a word in the index file
483bool phindaction::find_phrase_number_from_word(const text_t &basepath,
484 const text_t &query,
485 DocNumArray &result) {
486
487 // Open the index file for searching
488 IndexData indexData;
489
490 text_t fullpath = filename_cat(basepath, "pword");
491 char *fullpathc = fullpath.getcstr();
492#if defined __WIN32__
493 char *base = "";
494#else
495 char *base = "/";
496#endif
497
498 if (!indexData.LoadData (base, fullpathc)) {
499 // FatalError (1, "Couldn't load index information for \"%s\"", fullpathc);
500 //exit(0);
501 /** Don't handle fatal errors here anymore.
502 * DLConsulting 12-07-2004
503 */
504 return false; // Indicates something very bad has happened
505 }
506
507 delete []fullpathc;
508
509 // set up the query object
510 QueryInfo queryInfo;
511 SetCStr (queryInfo.docLevel, "Document", 8);
512 queryInfo.maxDocs = 5;
513 queryInfo.sortByRank = true;
514 queryInfo.exactWeights = false;
515 queryInfo.needRankInfo = true;
516 queryInfo.needTermFreqs = true;
517
518 // mode 1 = casefolded, unstemmed search
519 UCArray ucquery;
520 // greenstone gives us the query encoded in unicode. We want utf8.
521 char* utf8querystring=to_utf8(query).getcstr();
522 SetCStr(ucquery, utf8querystring);
523 delete []utf8querystring;
524
525 //toUCArray(query, ucquery);
526 QueryNode *queryTree = ParseQuery(ucquery, 1, 1, 4);
527
528 // perform the query
529 ExtQueryResult queryResult;
530 MGQuery (indexData, queryInfo, queryTree, queryResult);
531 // cout << "-- word lookup result -- " << endl << queryResult << endl ;
532
533 result.clear();
534 result = queryResult.docs;
535
536 // delete the query
537 if (queryTree != NULL) delete queryTree;
538
539 indexData.UnloadData();
540
541 /** This method now returns a boolean, so...
542 * DLConsulting 12-07-2004
543 */
544 return true; // Indicates that what happened is all good, baby.
545}
546
547// Get all the data about a phrase
548//
549// The phrase is stored in textData as record phrase.
550// We retrieve:
551// word - the text of the phrase
552// tf - the total frequency of the phrase
553// ef - the expansion frequency of the phrase
554// lf - the thesaurus link frequency of the phrase
555// df - the document frequency of the phrase
556// el - the list of phrases that are expansions of phrase
557// ll - the list of phrases that are thesaurus links
558// dl - the list of documents that contain phrase
559bool phindaction::get_phrase_all_data(TextData &textdata, unsigned long phrase,
560 text_t &word, unsigned long &tf, unsigned long &ef,
561 unsigned long &lf, unsigned long &df,
562 vector <unsigned long> &el,
563 vector <unsigned long> &linkdest,
564 vector <UCArray> &linktype,
565 vector <unsigned long> &docnum,
566 vector <unsigned long> &docfrq) {
567 UCArray text;
568 UCArray docLevel;
569 SetCStr(docLevel, "Document", 8);
570
571 // Look the word up in the textData
572 if (!GetDocText (textdata, docLevel, phrase, text)) {
573 // FatalError (1, "Error while trying to get phrase %u", phrase);
574 //exit(0);
575 return false; // Something very bad has happened.
576 }
577
578 // Ignore everything up to the first colon
579 UCArray::iterator next = text.begin();
580 while (*next++ != ':');
581
582 // ignore training carriage returns
583 while (text.back() == '\n') {
584 text.pop_back();
585 }
586
587 // Get the word
588 word.clear();
589 for (; *next != ':'; ++next) {
590 word.push_back(*next);
591 }
592
593 // Get total frequency
594 tf = 0;
595 for (++next; *next != ':'; ++next) {
596 tf *= 10;
597 tf += (*next - '0');
598 }
599
600 // Get expansion frequency
601 ef = 0;
602 for (++next; *next != ':'; ++next) {
603 ef *= 10;
604 ef += (*next - '0');
605 }
606
607 // Get document frequency
608 df = 0;
609 for (++next; *next != ':'; ++next) {
610 df *= 10;
611 df += (*next - '0');
612 }
613
614 // Get expansion list
615 el.clear();
616 unsigned long e = 0;
617 for (++next; *next != ':'; ++next) {
618 if (*next == ',') {
619 el.push_back(e);
620 e = 0;
621 } else {
622 e *= 10;
623 e += (*next - '0');
624 }
625 }
626
627 // Get document list & the document frequency list
628 docnum.clear();
629 docfrq.clear();
630 bool readnum = false;
631 unsigned long d = 0;
632 for (++next; *next != ':'; ++next) {
633 if (*next == ',') {
634 docnum.push_back(d);
635 readnum = true;
636 d = 0;
637 } else if (*next == ';') {
638 if (readnum) {
639 docfrq.push_back(d);
640 } else {
641 docnum.push_back(d);
642 docfrq.push_back(1);
643 }
644 readnum = false;
645 d = 0;
646 } else {
647 d *= 10;
648 d += (*next - '0');
649 }
650 }
651
652 // Get thesaurus link frequency & link list
653 text.push_back(':');
654 text.push_back(':');
655
656 // link frequency
657 lf = 0;
658 for (++next; *next != ':'; ++next) {
659 lf *= 10;
660 lf += (*next - '0');
661 }
662
663 // two lists of link data
664 linkdest.clear();
665 linktype.clear();
666
667 UCArray thistype;
668 thistype.clear();
669 bool typedone = false;
670 unsigned long l = 0;
671 for (++next; *next != ':'; ++next) {
672
673 if (!typedone) {
674 // first read the link type, a charactor string
675 if (*next == ',') {
676 typedone = true;
677 } else {
678 thistype.push_back(*next);
679 }
680 } else {
681 // having read the link type, read the list of link destinations
682 if (*next == ',') {
683 linkdest.push_back(l);
684 linktype.push_back(thistype);
685 l = 0;
686 } else if (*next == ';') {
687 linkdest.push_back(l);
688 linktype.push_back(thistype);
689 l = 0;
690 thistype.clear();
691 typedone = false;
692 } else {
693 l *= 10;
694 l += (*next - '0');
695 }
696 }
697 }
698
699 return true; // Indicates that what happened is all good, baby.
700}
701
702bool phindaction::print_thesaurus_links(const text_t &collection, bool XMLmode,
703 TextData &textdata, vector <unsigned long> &linkdest,
704 vector <UCArray> &linktype, unsigned long first,
705 unsigned long last, displayclass &disp,
706 outconvertclass &outconvert, ostream &textout) {
707
708 // information describing each link in the list
709 unsigned long phrase, tf, ef, df;
710 UCArray type, text;
711
712 for (unsigned long l = first; l < last; ++l) {
713
714 // get the phrase data
715 phrase = linkdest[l];
716 type = linktype[l];
717
718 /** DLConsulting 12-07-2004 */
719 if(!get_phrase_freq_data(textdata, phrase, text, tf, ef, df)) {
720 return false;
721 }
722
723 if (XMLmode) {
724 textout << "<thesaurus num=\"" << l
725 << "\" id=\"" << phrase
726 << "\" tf=\"" << tf
727 << "\" df=\"" << df
728 << "\" type=\"" << type
729 << "\" text=\"" << text
730 << "\"/>\n";
731 } else {
732 textout << "<tr valign=top><td>" << type << "</td><td>";
733 textout << outconvert << disp
734 << "<a href=\"_gwcgi_?c=" << collection;
735 textout << "&ppnum=" << phrase << "\">" << text << "</a>"
736 << "</td><td>" << tf << "</td><td>" << df << "</td></tr>\n";
737 }
738 }
739
740 /** DLConsulting 12-07-2004 */
741 return true;
742}
743
744// Get the frequency data about a phrase
745//
746// The phrase is stored in textData as record phrase.
747// We retrieve:
748// word - the text of the phrase
749// tf - the total frequency of the phrase
750// ef - the expansion frequency of the phrase
751// df - the document frequency of the phrase
752/**
753 * Returns:
754 * false if the method suffered a fatal error, true otherwise
755 */
756bool phindaction::get_phrase_freq_data(TextData &textdata, unsigned long phrase,
757 UCArray &word, unsigned long &tf,
758 unsigned long &ef, unsigned long &df) {
759
760 UCArray text;
761 UCArray docLevel;
762 SetCStr(docLevel, "Document", 8);
763
764 // Look the word up in the textData
765 if (!GetDocText (textdata, docLevel, phrase, text)) {
766 // FatalError (1, "Error while trying to get phrase %u", phrase);
767 //exit(0);
768 /** DLConsulting 12-07-2004 */
769 return false;
770 }
771
772 // Ignore everything up to the first colon
773 UCArray::iterator next = text.begin();
774 while (*next++ != ':');
775
776 // Get the word
777 word.clear();
778 for (; *next != ':'; ++next) {
779 word.push_back(*next);
780 }
781
782 // Get total frequency
783 tf = 0;
784 for (++next; *next != ':'; ++next) {
785 tf *= 10;
786 tf += (*next - '0');
787 }
788
789 // Get expansion frequency
790 ef = 0;
791 for (++next; *next != ':'; ++next) {
792 ef *= 10;
793 ef += (*next - '0');
794 }
795
796 // Get document frequency
797 df = 0;
798 for (++next; *next != ':'; ++next) {
799 df *= 10;
800 df += (*next - '0');
801 }
802
803 /** DLConsulting 12-07-2004 */
804 return true;
805}
806
807// Print a list of expansions
808//
809// Given the textData and a list of phrase numbers, print out each of the
810// expansions.
811void phindaction::print_expansions(const text_t &collection, bool XMLmode,
812 const text_t &body, TextData &textdata,
813 const vector <unsigned long> &elist,
814 unsigned long first, unsigned long last,
815 displayclass &disp, outconvertclass &outconvert,
816 ostream &textout) {
817
818 UCArray word;
819 unsigned long phrase, tf, df, ef;
820
821 UCArray suffix, prefix, ucbody;
822
823 toUCArray(body, ucbody);
824
825 for (unsigned long e = first; e < last; ++e) {
826
827 phrase = elist[e];
828 get_phrase_freq_data(textdata, phrase, word, tf, ef, df);
829
830 split_phrase(word, ucbody, prefix, suffix);
831
832 if (XMLmode) {
833 // body is always the same as the text of the phrase, so no need to send it
834 textout << "<expansion num=\"" << e
835 << "\" id=\"" << phrase
836 << "\" tf=\"" << tf
837 << "\" df=\"" << df;
838 if (!prefix.empty()) {
839 textout << "\" prefix=\"" << prefix;
840 }
841 if (!suffix.empty()) {
842 textout << "\" suffix=\"" << suffix;
843 }
844 textout << "\"/>\n";
845 } else {
846 textout << outconvert << disp
847 << "<tr valign=top><td align=right><a href=\"_gwcgi_?"
848 << "c=" << collection << "&ppnum=" << phrase << "\">";
849 textout << prefix << "</a></td>";
850 textout <<outconvert << disp
851 << "<td align=center><a href=\"_gwcgi_?"
852 << "c=" << collection << "&ppnum=" << phrase << "\">"
853 << body << "</a></td>"
854 << "<td align=left><a href=\"_gwcgi_?"
855 << "c=" << collection << "&ppnum=" << phrase << "\">";
856 textout << suffix << "</a></td>"
857 << "<td>" << tf << "</td><td>" << df << "</td></tr>\n";
858 }
859 }
860}
861
862// split an expansion into prefix and suffix
863void phindaction::split_phrase(const UCArray &word, const UCArray &body,
864 UCArray &prefix, UCArray &suffix) {
865
866 prefix.clear();
867 suffix.clear();
868
869 bool readingPrefix = true;
870 UCArray::const_iterator here = word.begin();
871 UCArray::const_iterator end = word.end();
872
873 while (here != end) {
874
875 // if we've not read all the prefix, add the next char to the prefix
876 if (readingPrefix) {
877 if (phrase_match(body, here, end)) {
878 readingPrefix = false;
879 // trim whitespace from end of prefix & start of suffix
880 if (!prefix.empty()) {
881 prefix.pop_back();
882 }
883 if ((here != end) && (*here == ' ')) {
884 ++here;
885 }
886 } else {
887 prefix.push_back(*here);
888 ++here;
889 }
890 }
891 // if we've finished with the prefix, update the suffix
892 else {
893 suffix.push_back(*here);
894 ++here;
895 }
896 }
897}
898
899// phrase_match
900//
901// compare two strings, one represented as an UCArray, the other as two
902// UCArray iterators.
903//
904// Return true if the UCArray is the same as the phrase the iterators point
905// to for the length of the UCArray.
906bool phindaction::phrase_match(const UCArray &text, UCArray::const_iterator &here,
907 UCArray::const_iterator end) {
908
909 UCArray::const_iterator one_here = text.begin();
910 UCArray::const_iterator one_end = text.end();
911 UCArray::const_iterator two_here = here;
912
913 // iterate over the length of the first string, comparing each element to
914 // the corresponding element in the second string.
915 while (one_here != one_end) {
916
917 if (two_here == end) {
918 return false;
919 } else if (*one_here != *two_here) {
920 return false;
921 }
922 ++one_here;
923 ++two_here;
924 }
925
926 here = two_here;
927 return true;
928}
929
930bool phindaction::print_documents(bool XMLmode, const text_t &basepath,
931 const text_t &collection,
932 const vector <unsigned long> &docNums,
933 const vector <unsigned long> &docFreq,
934 unsigned long first, unsigned long last,
935 displayclass &disp, outconvertclass &outconvert,
936 ostream &textout) {
937
938 // Create a TextData object to read the document data
939 TextData docdata;
940
941 text_t fullpath = filename_cat(basepath, "docs");
942 char *fullpathc = fullpath.getcstr();
943#if defined __WIN32__
944 char *base = "";
945#else
946 char *base = "/";
947#endif
948
949 if (!docdata.LoadData (base, fullpathc)) {
950 // FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
951 //exit(0);
952 /** DLConsulting 12-07-2004 */
953 return false;
954 }
955
956 delete []fullpathc;
957
958 UCArray title, hash;
959 unsigned long freq, doc;
960
961 for (unsigned long d = first; d < last; ++d) {
962 doc = docNums[d];
963 freq = docFreq[d];
964
965 /** DLConsulting 13-07-2004 */
966 if(!get_document_all_data(docdata, doc, title, hash)) {
967 return false;
968 }
969
970 if (XMLmode) {
971 textout << "<document num=\"" << d
972 << "\" hash=\"" << hash
973 << "\" freq=\"" << freq
974 << "\" title=\"" << title << "\"/>\n";
975 } else {
976 textout << outconvert << disp
977 << "<tr valign=top><td><a href=\"_gwcgi_?"
978 << "c=" << collection;
979 textout << "&a=d&d=" << hash << "\">" << title << "</a>"
980 << "</td><td>" << freq << "</td></tr>\n";
981 }
982 }
983
984 docdata.UnloadData();
985
986 /** DLConsulting 12-07-2004 */
987 return true;
988}
989
990// Get all the data about a docment
991//
992// The document's details are stored in docData as record docNum.
993// We retrieve:
994// title - the document's title
995// hash - the document's unique OID
996/** Returns:
997 * false if a fatal error occured, true otherwise
998 * DLConsulting 12-07-2004
999 */
1000bool phindaction::get_document_all_data(TextData &docdata, unsigned long docNum,
1001 UCArray &title, UCArray &hash) {
1002
1003 UCArray text;
1004 UCArray docLevel;
1005 SetCStr(docLevel, "Document", 8);
1006
1007 // Look the word up in the textData
1008 if (!GetDocText (docdata, docLevel, docNum, text)) {
1009 // FatalError (1, "Error while trying to get document %u", docNum);
1010 //exit(0);
1011 /** DLConsulting 13-07-2004 */
1012 return false;
1013 }
1014
1015 // Ignore everything up to the first colon
1016 UCArray::iterator next = text.begin();
1017 while (*next++ != '\t');
1018
1019 // Get the document OID (hash)
1020 hash.clear();
1021 for (; *next != '\t'; ++next) {
1022 hash.push_back(*next);
1023 }
1024
1025 // Get the title
1026 text.push_back('\n');
1027 title.clear();
1028 for (++next; *next != '\n'; ++next) {
1029 title.push_back(*next);
1030 }
1031
1032 /** DLConsulting 13-07-2004 */
1033 return true;
1034}
1035
1036void phindaction::toUCArray(const text_t &in, UCArray &out) {
1037 out.clear();
1038 if (out.capacity() < in.size() + 1) {
1039 out.reserve(in.size() + 1);
1040 }
1041 text_t::const_iterator here = in.begin();
1042 text_t::const_iterator end = in.end();
1043 while (here != end) {
1044 out.push_back((unsigned char) *here);
1045 ++here;
1046 }
1047}
1048
1049void phindaction::output_error (const text_t &message, ostream &textout,
1050 outconvertclass &outconvert,
1051 displayclass & disp, ostream &logout,
1052 bool XMLmode) {
1053
1054 logout << outconvert << message << "\n";
1055 if (XMLmode) {
1056 textout << outconvert
1057 << "<phinddata>\n"
1058 << "<phinderror>" << message << "</phinderror>\n"
1059 << "</phinddata>\n";
1060 } else {
1061 textout << outconvert << disp
1062 << "_header_\n"
1063 << message
1064 << "_footer_\n";
1065 }
1066}
1067
1068#endif //GSDL_USE_PHIND_ACTION
1069
Note: See TracBrowser for help on using the repository browser.