source: trunk/gsdl/src/recpt/phindaction.cpp@ 7415

Last change on this file since 7415 was 7415, checked in by mdewsnip, 20 years ago

(Human Info) Use the SetCStr-with-size-hint method.

  • Property svn:keywords set to Author Date Id Revision
File size: 26.1 KB
Line 
1/**********************************************************************
2 *
3 * phindaction.cpp --
4 *
5 * Copyright 2001 Gordon W. Paynter
6 * Copyright 2001 The New Zealand Digital Library Project
7 *
8 * A component of the Greenstone digital library software
9 * from the New Zealand Digital Library Project at the
10 * University of Waikato, New Zealand.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *
26 *********************************************************************/
27
28#include "gsdl_modules_cfg.h"
29#ifdef GSDL_USE_PHIND_ACTION
30
31// Note that this action uses mgpp to retrieve phind info, calling MGQuery
32// etc. directly, not through the protocol. This breaks our receptionist -
33// collection server separation and should be fixed some day I guess.
34
35#include "phindaction.h"
36#include "fileutil.h"
37
38phindaction::phindaction () {
39
40 cgiarginfo arg_ainfo;
41
42 arg_ainfo.shortname = "pc";
43 arg_ainfo.longname = "phind classifier";
44 arg_ainfo.multiplechar = true;
45 arg_ainfo.defaultstatus = cgiarginfo::weak;
46 arg_ainfo.argdefault = g_EmptyText;
47 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
48 argsinfo.addarginfo (NULL, arg_ainfo);
49
50 arg_ainfo.shortname = "pxml";
51 arg_ainfo.longname = "phind XML mode";
52 arg_ainfo.multiplechar = false;
53 arg_ainfo.defaultstatus = cgiarginfo::weak;
54 arg_ainfo.argdefault = "0";
55 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
56 argsinfo.addarginfo (NULL, arg_ainfo);
57
58 arg_ainfo.shortname = "ppnum";
59 arg_ainfo.longname = "phind phrase number";
60 arg_ainfo.multiplechar = true;
61 arg_ainfo.defaultstatus = cgiarginfo::weak;
62 arg_ainfo.argdefault = "0";
63 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
64 argsinfo.addarginfo (NULL, arg_ainfo);
65
66 arg_ainfo.shortname = "pptext";
67 arg_ainfo.longname = "phind phrase text";
68 arg_ainfo.multiplechar = true;
69 arg_ainfo.defaultstatus = cgiarginfo::weak;
70 arg_ainfo.argdefault = g_EmptyText;
71 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
72 argsinfo.addarginfo (NULL, arg_ainfo);
73
74 arg_ainfo.shortname = "pfe";
75 arg_ainfo.longname = "phind first_e";
76 arg_ainfo.multiplechar = true;
77 arg_ainfo.defaultstatus = cgiarginfo::weak;
78 arg_ainfo.argdefault = "0";
79 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
80 argsinfo.addarginfo (NULL, arg_ainfo);
81
82 arg_ainfo.shortname = "ple";
83 arg_ainfo.longname = "phind last_e";
84 arg_ainfo.multiplechar = true;
85 arg_ainfo.defaultstatus = cgiarginfo::weak;
86 arg_ainfo.argdefault = "10";
87 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
88 argsinfo.addarginfo (NULL, arg_ainfo);
89
90 arg_ainfo.shortname = "pfl";
91 arg_ainfo.longname = "phind first_l";
92 arg_ainfo.multiplechar = true;
93 arg_ainfo.defaultstatus = cgiarginfo::weak;
94 arg_ainfo.argdefault = "0";
95 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
96 argsinfo.addarginfo (NULL, arg_ainfo);
97
98 arg_ainfo.shortname = "pll";
99 arg_ainfo.longname = "phind last_l";
100 arg_ainfo.multiplechar = true;
101 arg_ainfo.defaultstatus = cgiarginfo::weak;
102 arg_ainfo.argdefault = "10";
103 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
104 argsinfo.addarginfo (NULL, arg_ainfo);
105
106 arg_ainfo.shortname = "pfd";
107 arg_ainfo.longname = "phind first_d";
108 arg_ainfo.multiplechar = true;
109 arg_ainfo.defaultstatus = cgiarginfo::weak;
110 arg_ainfo.argdefault = "0";
111 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
112 argsinfo.addarginfo (NULL, arg_ainfo);
113
114 arg_ainfo.shortname = "pld";
115 arg_ainfo.longname = "phind last_d";
116 arg_ainfo.multiplechar = true;
117 arg_ainfo.defaultstatus = cgiarginfo::weak;
118 arg_ainfo.argdefault = "10";
119 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
120 argsinfo.addarginfo (NULL, arg_ainfo);
121}
122
123phindaction::~phindaction () {
124}
125
126void phindaction::get_cgihead_info (cgiargsclass &args, recptprotolistclass * /*protos*/,
127 response_t &response,text_t &response_data,
128 ostream &/*logout*/) {
129 response = content;
130 if (args["pxml"] == "1") {
131 response_data = "text/xml";
132 } else {
133 response_data = "text/html";
134 }
135}
136
137bool phindaction::do_action (cgiargsclass &args, recptprotolistclass *protos,
138 browsermapclass * /*browsers*/, displayclass &disp,
139 outconvertclass &outconvert, ostream &textout,
140 ostream &logout) {
141
142 unsigned long count_l, count_e, count_d;
143 unsigned long phrase = args["ppnum"].getulong();
144 text_t &word = args["pptext"];
145 unsigned long first_e = args["pfe"].getulong();
146 unsigned long last_e = args["ple"].getulong();
147 unsigned long first_l = args["pfl"].getulong();
148 unsigned long last_l = args["pll"].getulong();
149 unsigned long first_d = args["pfd"].getulong();
150 unsigned long last_d = args["pld"].getulong();
151 bool XMLmode = false;
152 if (args["pxml"] == "1") XMLmode = true;
153
154 // must have a valid collection server
155 recptproto *collectproto = protos->getrecptproto (args["c"], logout);
156 if (collectproto == NULL) {
157 output_error("phindaction: ERROR: collection not set", textout,
158 outconvert, disp, logout, XMLmode);
159 return true;
160 }
161
162 // the frequency and occurances of the phrase
163 unsigned long tf;
164 vector <unsigned long> el, linkdest, docNums, docfreq;
165 vector <UCArray> linktype;
166
167 // the number of occurances to display
168 unsigned long ef, lf, df;
169
170 text_t basepath = filename_cat(gsdlhome, "collect", args["c"],
171 "index", "phind" + args["pc"]);
172
173 // If we don't know the phrase number, look it up
174 if (phrase == 0) {
175
176 if (word.empty()) {
177 output_error("phindaction: ERROR: no phrase number or word", textout,
178 outconvert, disp, logout, XMLmode);
179 return true;
180 }
181
182 DocNumArray result;
183 find_phrase_number_from_word(basepath, word, result);
184
185 if (result.empty()) {
186 output_error("phindaction: The search term does not occur in the collection",
187 textout, outconvert, disp, logout, XMLmode);
188 return true;
189 } else {
190 phrase = result[0];
191 }
192 }
193
194 // Create a TextData object to read the phrase data (pdata)
195 TextData textdata;
196
197 text_t fullpath = filename_cat(basepath, "pdata");
198 char *fullpathc = fullpath.getcstr();
199#if defined __WIN32__
200 char *base = "";
201#else
202 char *base = "/";
203#endif
204
205 if (!textdata.LoadData (base, fullpathc)) {
206 // FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
207 exit (0);
208 }
209
210 delete []fullpathc;
211
212 get_phrase_all_data(textdata, phrase, word, tf, ef, lf, df, el,
213 linkdest, linktype, docNums, docfreq);
214
215 // Output the header
216 if (XMLmode) {
217 textout << "<phinddata id=\"" << phrase
218 << "\" text=\"" << word
219 << "\" tf=\"" << tf
220 << "\" ef=\"" << ef
221 << "\" df=\"" << df
222 << "\" lf=\"" << lf
223 << "\">\n";
224 } else {
225 textout << "<html><head><title>" << word << "</title></head>\n"
226 << "<body><center>\n"
227 << "<p><h1>" << word << "</h1>\n"
228 << "<p><b>"<< word << "</b> occurs "
229 << tf << " times in " << df << " documents\n";
230 }
231
232 // Output the thesaurus links
233 if ((lf > 0) && (first_l < last_l)) {
234
235 // figure out the number of phrases to output
236 if (last_l > lf) {
237 last_l = lf;
238 }
239 count_l = last_l - first_l;
240
241 if (XMLmode) {
242 textout << "<thesauruslist length=\"" << lf
243 << "\" start=\"" << first_l
244 << "\" end=\"" << last_l << "\">\n";
245 print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
246 first_l, last_l, disp, outconvert, textout);
247 textout << "</thesauruslist>\n";
248 }
249
250 // output links as HTML
251 else {
252 if (count_l == lf) {
253 textout << "<p><b> " << count_l << " thesaurus links</b>\n";
254 } else {
255 textout << "<p><b>" << count_l << " of " << lf << " thesaurus links</b>\n";
256 }
257
258 textout << "<p><table border=1><tr><th>type</th><th>topic</th><th>freq</th><th>docs</th></tr>\n";
259 print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
260 first_l, last_l, disp, outconvert, textout);
261 textout << "</table>\n";
262
263 if (last_l < lf) {
264 if ((last_l + 10) < lf) {
265 textout << outconvert << disp
266 << "<br><a href=\"_gwcgi_?"
267 << "c=" << args["c"]
268 << "&ppnum=" << phrase
269 << "&pfe=" << first_e
270 << "&ple=" << last_e
271 << "&pfd=" << first_d
272 << "&pld=" << last_d
273 << "&pfl=" << first_l
274 << "&pll=" << (last_l + 10)
275 << "\">Get more thesaurus links</a>\n";
276 }
277 textout << outconvert << disp
278 << "<br><a href=\"_gwcgi_?"
279 << "c=" << args["c"]
280 << "&ppnum=" << phrase
281 << "&pfe=" << first_e
282 << "&ple=" << last_e
283 << "&pfd=" << first_d
284 << "&pld=" << last_d
285 << "&pfl=" << first_l
286 << "&pll=" << lf
287 << "\">Get every thesaurus link</a>\n" ;
288 }
289 }
290 }
291
292 // Output the expansions
293 if ((ef > 0) && (first_e < last_e)) {
294
295 // figure out the number of phrases to output
296 if (last_e > el.size()) {
297 last_e = el.size();
298 }
299 count_e = last_e - first_e;
300
301 // output expansions as XML
302 if (XMLmode) {
303 textout << "<expansionlist length=\"" << ef
304 << "\" start=\"" << first_e
305 << "\" end=\"" << last_e << "\">" << endl;
306
307 print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
308 last_e, disp, outconvert, textout);
309
310 textout << "</expansionlist>\n";
311 }
312
313 // output expansions as HTML
314 else {
315 if (count_e == el.size()) {
316 textout << "<p><b> " << count_e << " expansions</b>\n";
317 } else {
318 textout << "<p><b>" << count_e << " of " << ef << " expansions</b>\n";
319 }
320
321 textout << "<p><table border=1><tr><th colspan=3>phrase</th><th>freq</th><th>docs</th></tr>\n";
322 print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
323 last_e, disp, outconvert, textout);
324 textout << "</table>\n";
325
326 if (last_e < ef) {
327 if ((last_e + 10) < ef) {
328 textout << outconvert << disp
329 << "<br><a href=\"_gwcgi_?"
330 << "c=" << args["c"]
331 << "&ppnum=" << phrase
332 << "&pfe=" << first_e
333 << "&ple=" << (last_e + 10)
334 << "&pfd=" << first_d
335 << "&pld=" << last_d
336 << "&pfl=" << first_l
337 << "&pll=" << last_l
338 << "\">Get more expansions</a>\n";
339 }
340 textout << outconvert << disp
341 << "<br><a href=\"_gwcgi_?"
342 << "c=" << args["c"]
343 << "&ppnum=" << phrase
344 << "&pfe=" << first_e
345 << "&ple=" << ef
346 << "&pfd=" << first_d
347 << "&pld=" << last_d
348 << "&pfl=" << first_l
349 << "&pll=" << last_l
350 << "\">Get every expansion</a>\n";
351 }
352 }
353 }
354
355 // Output the document occurances
356 if ((df > 0) && (first_d < last_d)) {
357
358 // figure out the phrases to output
359 if (last_d > docNums.size()) {
360 last_d = docNums.size();
361 }
362 count_d = last_d - first_d;
363
364 // output document list as XML
365 if (XMLmode) {
366 textout << "<documentlist length=\"" << df
367 << "\" start=\"" << first_d
368 << "\" end=\"" << last_d << "\">\n";
369
370 print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
371 first_d, last_d, disp, outconvert, textout);
372
373 textout << "</documentlist>\n";
374 }
375
376 // output document list as HTML
377 else {
378
379 if (count_d == docNums.size()) {
380 textout << "<p><b> " << count_d << " documents</b>\n";
381 } else {
382 textout << "<p><b>" << count_d << " of " << df << " documents</b>\n";
383 }
384
385 textout << "<p><table border=1><tr><th align=left>document</th><th>freq</th></tr>\n";
386 print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
387 first_d, last_d, disp, outconvert, textout);
388 textout << "</table>\n";
389
390 if (last_d < df) {
391 if ((last_d + 10) < df) {
392 textout << outconvert << disp
393 << "<br><a href=\"_gwcgi_?"
394 << "c=" << args["c"]
395 << "&ppnum=" << phrase
396 << "&pfe=" << first_e
397 << "&ple=" << last_e
398 << "&pfd=" << first_d
399 << "&pld=" << (last_d + 10)
400 << "&pfl=" << first_l
401 << "&pll=" << last_l
402 << "\">Get more documents</a>\n";
403 }
404 textout << outconvert << disp
405 << "<br><a href=\"_gwcgi_?"
406 << "c=" << args["c"]
407 << "&ppnum=" << phrase
408 << "&pfe=" << first_e
409 << "&ple=" << last_e
410 << "&pfd=" << first_d
411 << "&pld=" << df
412 << "&pfl=" << first_l
413 << "&pll=" << last_l
414 << "\">Get every document</a>\n";
415 }
416 }
417 }
418
419 // Close the document
420 if (XMLmode) {
421 textout << "</phinddata>\n";
422 } else {
423 textout << "</center></body></html>\n";
424 }
425
426 textdata.UnloadData ();
427
428 return true;
429}
430
431// Find the phrase number of a word in the index file
432void phindaction::find_phrase_number_from_word(const text_t &basepath,
433 const text_t &query,
434 DocNumArray &result) {
435
436 // Open the index file for searching
437 IndexData indexData;
438
439 text_t fullpath = filename_cat(basepath, "pword");
440 char *fullpathc = fullpath.getcstr();
441#if defined __WIN32__
442 char *base = "";
443#else
444 char *base = "/";
445#endif
446
447 if (!indexData.LoadData (base, fullpathc)) {
448 // FatalError (1, "Couldn't load index information for \"%s\"", fullpathc);
449 exit (0);
450 }
451
452 delete []fullpathc;
453
454 // set up the query object
455 QueryInfo queryInfo;
456 SetCStr (queryInfo.docLevel, "Document", 8);
457 queryInfo.maxDocs = 5;
458 queryInfo.sortByRank = true;
459 queryInfo.exactWeights = false;
460 queryInfo.needRankInfo = true;
461 queryInfo.needTermFreqs = true;
462
463 // mode 1 = casefolded, unstemmed search
464 UCArray ucquery;
465 toUCArray(query, ucquery);
466 QueryNode *queryTree = ParseQuery(ucquery, 1, 1);
467
468 // perform the query
469 ExtQueryResult queryResult;
470 MGQuery (indexData, queryInfo, queryTree, queryResult);
471 // cout << "-- word lookup result -- " << endl << queryResult << endl ;
472
473 result.clear();
474 result = queryResult.docs;
475
476 // delete the query
477 if (queryTree != NULL) delete queryTree;
478
479 indexData.UnloadData();
480}
481
482// Get all the data about a phrase
483//
484// The phrase is stored in textData as record phrase.
485// We retrieve:
486// word - the text of the phrase
487// tf - the total frequency of the phrase
488// ef - the expansion frequency of the phrase
489// lf - the thesaurus link frequency of the phrase
490// df - the document frequency of the phrase
491// el - the list of phrases that are expansions of phrase
492// ll - the list of phrases that are thesaurus links
493// dl - the list of documents that contain phrase
494void phindaction::get_phrase_all_data(TextData &textdata, unsigned long phrase,
495 text_t &word, unsigned long &tf, unsigned long &ef,
496 unsigned long &lf, unsigned long &df,
497 vector <unsigned long> &el,
498 vector <unsigned long> &linkdest,
499 vector <UCArray> &linktype,
500 vector <unsigned long> &docnum,
501 vector <unsigned long> &docfrq) {
502 UCArray text;
503 UCArray docLevel;
504 SetCStr(docLevel, "Document", 8);
505
506 // Look the word up in the textData
507 if (!GetDocText (textdata, docLevel, phrase, text)) {
508 // FatalError (1, "Error while trying to get phrase %u", phrase);
509 exit (0);
510 }
511
512 // Ignore everything up to the first colon
513 UCArray::iterator next = text.begin();
514 while (*next++ != ':');
515
516 // ignore training carriage returns
517 while (text.back() == '\n') {
518 text.pop_back();
519 }
520
521 // Get the word
522 word.clear();
523 for (; *next != ':'; next++) {
524 word.push_back(*next);
525 }
526
527 // Get total frequency
528 tf = 0;
529 for (next++; *next != ':'; next++) {
530 tf *= 10;
531 tf += (*next - '0');
532 }
533
534 // Get expansion frequency
535 ef = 0;
536 for (next++; *next != ':'; next++) {
537 ef *= 10;
538 ef += (*next - '0');
539 }
540
541 // Get document frequency
542 df = 0;
543 for (next++; *next != ':'; next++) {
544 df *= 10;
545 df += (*next - '0');
546 }
547
548 // Get expansion list
549 el.clear();
550 unsigned long e = 0;
551 for (next++; *next != ':'; next++) {
552 if (*next == ',') {
553 el.push_back(e);
554 e = 0;
555 } else {
556 e *= 10;
557 e += (*next - '0');
558 }
559 }
560
561 // Get document list & the document frequency list
562 docnum.clear();
563 docfrq.clear();
564 bool readnum = false;
565 unsigned long d = 0;
566 for (next++; *next != ':'; next++) {
567 if (*next == ',') {
568 docnum.push_back(d);
569 readnum = true;
570 d = 0;
571 } else if (*next == ';') {
572 if (readnum) {
573 docfrq.push_back(d);
574 } else {
575 docnum.push_back(d);
576 docfrq.push_back(1);
577 }
578 readnum = false;
579 d = 0;
580 } else {
581 d *= 10;
582 d += (*next - '0');
583 }
584 }
585
586 // Get thesaurus link frequency & link list
587 text.push_back(':');
588 text.push_back(':');
589
590 // link frequency
591 lf = 0;
592 for (next++; *next != ':'; next++) {
593 lf *= 10;
594 lf += (*next - '0');
595 }
596
597 // two lists of link data
598 linkdest.clear();
599 linktype.clear();
600
601 UCArray thistype;
602 thistype.clear();
603 bool typedone = false;
604 unsigned long l = 0;
605 for (next++; *next != ':'; next++) {
606
607 if (!typedone) {
608 // first read the link type, a charactor string
609 if (*next == ',') {
610 typedone = true;
611 } else {
612 thistype.push_back(*next);
613 }
614 } else {
615 // having read the link type, read the list of link destinations
616 if (*next == ',') {
617 linkdest.push_back(l);
618 linktype.push_back(thistype);
619 l = 0;
620 } else if (*next == ';') {
621 linkdest.push_back(l);
622 linktype.push_back(thistype);
623 l = 0;
624 thistype.clear();
625 typedone = false;
626 } else {
627 l *= 10;
628 l += (*next - '0');
629 }
630 }
631 }
632}
633
634void phindaction::print_thesaurus_links(const text_t &collection, bool XMLmode,
635 TextData &textdata, vector <unsigned long> &linkdest,
636 vector <UCArray> &linktype, unsigned long first,
637 unsigned long last, displayclass &disp,
638 outconvertclass &outconvert, ostream &textout) {
639
640 // information describing each link in the list
641 unsigned long phrase, tf, ef, df;
642 UCArray type, text;
643
644 for (unsigned long l = first; l < last; l++) {
645
646 // get the phrase data
647 phrase = linkdest[l];
648 type = linktype[l];
649 get_phrase_freq_data(textdata, phrase, text, tf, ef, df);
650
651 if (XMLmode) {
652 textout << "<thesaurus num=\"" << l
653 << "\" id=\"" << phrase
654 << "\" tf=\"" << tf
655 << "\" df=\"" << df
656 << "\" type=\"" << type
657 << "\" text=\"" << text
658 << "\"/>\n";
659 } else {
660 textout << "<tr valign=top><td>" << type << "</td><td>";
661 textout << outconvert << disp
662 << "<a href=\"_gwcgi_?c=" << collection;
663 textout << "&ppnum=" << phrase << "\">" << text << "</a>"
664 << "</td><td>" << tf << "</td><td>" << df << "</td></tr>\n";
665 }
666 }
667}
668
669// Get the frequency data about a phrase
670//
671// The phrase is stored in textData as record phrase.
672// We retrieve:
673// word - the text of the phrase
674// tf - the total frequency of the phrase
675// ef - the expansion frequency of the phrase
676// df - the document frequency of the phrase
677void phindaction::get_phrase_freq_data(TextData &textdata, unsigned long phrase,
678 UCArray &word, unsigned long &tf,
679 unsigned long &ef, unsigned long &df) {
680
681 UCArray text;
682 UCArray docLevel;
683 SetCStr(docLevel, "Document", 8);
684
685 // Look the word up in the textData
686 if (!GetDocText (textdata, docLevel, phrase, text)) {
687 // FatalError (1, "Error while trying to get phrase %u", phrase);
688 exit (0);
689 }
690
691 // Ignore everything up to the first colon
692 UCArray::iterator next = text.begin();
693 while (*next++ != ':');
694
695 // Get the word
696 word.clear();
697 for (; *next != ':'; next++) {
698 word.push_back(*next);
699 }
700
701 // Get total frequency
702 tf = 0;
703 for (next++; *next != ':'; next++) {
704 tf *= 10;
705 tf += (*next - '0');
706 }
707
708 // Get expansion frequency
709 ef = 0;
710 for (next++; *next != ':'; next++) {
711 ef *= 10;
712 ef += (*next - '0');
713 }
714
715 // Get document frequency
716 df = 0;
717 for (next++; *next != ':'; next++) {
718 df *= 10;
719 df += (*next - '0');
720 }
721}
722
723// Print a list of expansions
724//
725// Given the textData and a list of phrase numbers, print out each of the
726// expansions.
727void phindaction::print_expansions(const text_t &collection, bool XMLmode,
728 const text_t &body, TextData &textdata,
729 const vector <unsigned long> &elist,
730 unsigned long first, unsigned long last,
731 displayclass &disp, outconvertclass &outconvert,
732 ostream &textout) {
733
734 UCArray word;
735 unsigned long phrase, tf, df, ef;
736
737 UCArray suffix, prefix, ucbody;
738
739 toUCArray(body, ucbody);
740
741 for (unsigned long e = first; e < last; e++) {
742
743 phrase = elist[e];
744 get_phrase_freq_data(textdata, phrase, word, tf, ef, df);
745
746 split_phrase(word, ucbody, prefix, suffix);
747
748 if (XMLmode) {
749 // body is always the same as the text of the phrase, so no need to send it
750 textout << "<expansion num=\"" << e
751 << "\" id=\"" << phrase
752 << "\" tf=\"" << tf
753 << "\" df=\"" << df;
754 if (!prefix.empty()) {
755 textout << "\" prefix=\"" << prefix;
756 }
757 if (!suffix.empty()) {
758 textout << "\" suffix=\"" << suffix;
759 }
760 textout << "\"/>\n";
761 } else {
762 textout << outconvert << disp
763 << "<tr valign=top><td align=right><a href=\"_gwcgi_?"
764 << "c=" << collection << "&ppnum=" << phrase << "\">";
765 textout << prefix << "</a></td>";
766 textout <<outconvert << disp
767 << "<td align=center><a href=\"_gwcgi_?"
768 << "c=" << collection << "&ppnum=" << phrase << "\">"
769 << body << "</a></td>"
770 << "<td align=left><a href=\"_gwcgi_?"
771 << "c=" << collection << "&ppnum=" << phrase << "\">";
772 textout << suffix << "</a></td>"
773 << "<td>" << tf << "</td><td>" << df << "</td></tr>\n";
774 }
775 }
776}
777
778// split an expansion into prefix and suffix
779void phindaction::split_phrase(const UCArray &word, const UCArray &body,
780 UCArray &prefix, UCArray &suffix) {
781
782 prefix.clear();
783 suffix.clear();
784
785 bool readingPrefix = true;
786 UCArray::const_iterator here = word.begin();
787 UCArray::const_iterator end = word.end();
788
789 while (here != end) {
790
791 // if we've not read all the prefix, add the next char to the prefix
792 if (readingPrefix) {
793 if (phrase_match(body, here, end)) {
794 readingPrefix = false;
795 // trim whitespace from end of prefix & start of suffix
796 if (!prefix.empty()) {
797 prefix.pop_back();
798 }
799 if ((here != end) && (*here == ' ')) {
800 here++;
801 }
802 } else {
803 prefix.push_back(*here);
804 here++;
805 }
806 }
807 // if we've finished with the prefix, update the suffix
808 else {
809 suffix.push_back(*here);
810 here++;
811 }
812 }
813}
814
815// phrase_match
816//
817// compare two strings, one represented as an UCArray, the other as two
818// UCArray iterators.
819//
820// Return true if the UCArray is the same as the phrase the iterators point
821// to for the length of the UCArray.
822bool phindaction::phrase_match(const UCArray &text, UCArray::const_iterator &here,
823 UCArray::const_iterator end) {
824
825 UCArray::const_iterator one_here = text.begin();
826 UCArray::const_iterator one_end = text.end();
827 UCArray::const_iterator two_here = here;
828
829 // iterate over the length of the first string, comparing each element to
830 // the corresponding element in the second string.
831 while (one_here != one_end) {
832
833 if (two_here == end) {
834 return false;
835 } else if (*one_here != *two_here) {
836 return false;
837 }
838 one_here++;
839 two_here++;
840 }
841
842 here = two_here;
843 return true;
844}
845
846void phindaction::print_documents(bool XMLmode, const text_t &basepath,
847 const text_t &collection,
848 const vector <unsigned long> &docNums,
849 const vector <unsigned long> &docFreq,
850 unsigned long first, unsigned long last,
851 displayclass &disp, outconvertclass &outconvert,
852 ostream &textout) {
853
854 // Create a TextData object to read the document data
855 TextData docdata;
856
857 text_t fullpath = filename_cat(basepath, "docs");
858 char *fullpathc = fullpath.getcstr();
859#if defined __WIN32__
860 char *base = "";
861#else
862 char *base = "/";
863#endif
864
865 if (!docdata.LoadData (base, fullpathc)) {
866 // FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
867 exit (0);
868 }
869
870 delete []fullpathc;
871
872 UCArray title, hash;
873 unsigned long freq, doc;
874
875 for (unsigned long d = first; d < last; d++) {
876 doc = docNums[d];
877 freq = docFreq[d];
878
879 get_document_all_data(docdata, doc, title, hash);
880
881 if (XMLmode) {
882 textout << "<document num=\"" << d
883 << "\" hash=\"" << hash
884 << "\" freq=\"" << freq
885 << "\" title=\"" << title << "\"/>\n";
886 } else {
887 textout << outconvert << disp
888 << "<tr valign=top><td><a href=\"_gwcgi_?"
889 << "c=" << collection;
890 textout << "&a=d&d=" << hash << "\">" << title << "</a>"
891 << "</td><td>" << freq << "</td></tr>\n";
892 }
893 }
894
895 docdata.UnloadData();
896}
897
898// Get all the data about a docment
899//
900// The document's details are stored in docData as record docNum.
901// We retrieve:
902// title - the document's title
903// hash - the document's unique OID
904void phindaction::get_document_all_data(TextData &docdata, unsigned long docNum,
905 UCArray &title, UCArray &hash) {
906
907 UCArray text;
908 UCArray docLevel;
909 SetCStr(docLevel, "Document", 8);
910
911 // Look the word up in the textData
912 if (!GetDocText (docdata, docLevel, docNum, text)) {
913 // FatalError (1, "Error while trying to get document %u", docNum);
914 exit (0);
915 }
916
917 // Ignore everything up to the first colon
918 UCArray::iterator next = text.begin();
919 while (*next++ != '\t');
920
921 // Get the document OID (hash)
922 hash.clear();
923 for (; *next != '\t'; next++) {
924 hash.push_back(*next);
925 }
926
927 // Get the title
928 text.push_back('\n');
929 title.clear();
930 for (next++; *next != '\n'; next++) {
931 title.push_back(*next);
932 }
933}
934
935void phindaction::toUCArray(const text_t &in, UCArray &out) {
936 out.clear();
937 text_t::const_iterator here = in.begin();
938 text_t::const_iterator end = in.end();
939 while (here != end) {
940 out.push_back((unsigned char) *here);
941 here++;
942 }
943}
944
945void phindaction::output_error (const text_t &message, ostream &textout,
946 outconvertclass &outconvert,
947 displayclass & disp, ostream &logout,
948 bool XMLmode) {
949
950 logout << outconvert << message << "\n";
951 if (XMLmode) {
952 textout << outconvert
953 << "<phinddata>\n"
954 << "<phinderror>" << message << "</phinderror>\n"
955 << "</phinddata>\n";
956 } else {
957 textout << outconvert << disp
958 << "_header_\n"
959 << message
960 << "_footer_\n";
961 }
962}
963
964#endif //GSDL_USE_PHIND_ACTION
Note: See TracBrowser for help on using the repository browser.