source: trunk/gsdl/src/recpt/phindaction.cpp@ 6787

Last change on this file since 6787 was 6787, checked in by jrm21, 20 years ago

Tidied slightly so that we return the content type in get_cgihead_info,
otherwise the default gets printed and then we printed out another one.

It wasn't screwing up Phind's parsing or anything, but it's tidier now.

  • Property svn:keywords set to Author Date Id Revision
File size: 26.0 KB
Line 
1/**********************************************************************
2 *
3 * phindaction.cpp --
4 *
5 * Copyright 2001 Gordon W. Paynter
6 * Copyright 2001 The New Zealand Digital Library Project
7 *
8 * A component of the Greenstone digital library software
9 * from the New Zealand Digital Library Project at the
10 * University of Waikato, New Zealand.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *
26 *********************************************************************/
27
28// Note that this action uses mgpp to retrieve phind info, calling MGQuery
29// etc. directly, not through the protocol. This breaks our receptionist -
30// collection server separation and should be fixed some day I guess.
31
32#include "phindaction.h"
33#include "fileutil.h"
34
35phindaction::phindaction () {
36
37 cgiarginfo arg_ainfo;
38
39 arg_ainfo.shortname = "pc";
40 arg_ainfo.longname = "phind classifier";
41 arg_ainfo.multiplechar = true;
42 arg_ainfo.defaultstatus = cgiarginfo::weak;
43 arg_ainfo.argdefault = "";
44 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
45 argsinfo.addarginfo (NULL, arg_ainfo);
46
47 arg_ainfo.shortname = "pxml";
48 arg_ainfo.longname = "phind XML mode";
49 arg_ainfo.multiplechar = false;
50 arg_ainfo.defaultstatus = cgiarginfo::weak;
51 arg_ainfo.argdefault = "0";
52 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
53 argsinfo.addarginfo (NULL, arg_ainfo);
54
55 arg_ainfo.shortname = "ppnum";
56 arg_ainfo.longname = "phind phrase number";
57 arg_ainfo.multiplechar = true;
58 arg_ainfo.defaultstatus = cgiarginfo::weak;
59 arg_ainfo.argdefault = "0";
60 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
61 argsinfo.addarginfo (NULL, arg_ainfo);
62
63 arg_ainfo.shortname = "pptext";
64 arg_ainfo.longname = "phind phrase text";
65 arg_ainfo.multiplechar = true;
66 arg_ainfo.defaultstatus = cgiarginfo::weak;
67 arg_ainfo.argdefault = "";
68 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
69 argsinfo.addarginfo (NULL, arg_ainfo);
70
71 arg_ainfo.shortname = "pfe";
72 arg_ainfo.longname = "phind first_e";
73 arg_ainfo.multiplechar = true;
74 arg_ainfo.defaultstatus = cgiarginfo::weak;
75 arg_ainfo.argdefault = "0";
76 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
77 argsinfo.addarginfo (NULL, arg_ainfo);
78
79 arg_ainfo.shortname = "ple";
80 arg_ainfo.longname = "phind last_e";
81 arg_ainfo.multiplechar = true;
82 arg_ainfo.defaultstatus = cgiarginfo::weak;
83 arg_ainfo.argdefault = "10";
84 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
85 argsinfo.addarginfo (NULL, arg_ainfo);
86
87 arg_ainfo.shortname = "pfl";
88 arg_ainfo.longname = "phind first_l";
89 arg_ainfo.multiplechar = true;
90 arg_ainfo.defaultstatus = cgiarginfo::weak;
91 arg_ainfo.argdefault = "0";
92 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
93 argsinfo.addarginfo (NULL, arg_ainfo);
94
95 arg_ainfo.shortname = "pll";
96 arg_ainfo.longname = "phind last_l";
97 arg_ainfo.multiplechar = true;
98 arg_ainfo.defaultstatus = cgiarginfo::weak;
99 arg_ainfo.argdefault = "10";
100 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
101 argsinfo.addarginfo (NULL, arg_ainfo);
102
103 arg_ainfo.shortname = "pfd";
104 arg_ainfo.longname = "phind first_d";
105 arg_ainfo.multiplechar = true;
106 arg_ainfo.defaultstatus = cgiarginfo::weak;
107 arg_ainfo.argdefault = "0";
108 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
109 argsinfo.addarginfo (NULL, arg_ainfo);
110
111 arg_ainfo.shortname = "pld";
112 arg_ainfo.longname = "phind last_d";
113 arg_ainfo.multiplechar = true;
114 arg_ainfo.defaultstatus = cgiarginfo::weak;
115 arg_ainfo.argdefault = "10";
116 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
117 argsinfo.addarginfo (NULL, arg_ainfo);
118}
119
120phindaction::~phindaction () {
121}
122
123void phindaction::get_cgihead_info (cgiargsclass &args, recptprotolistclass * /*protos*/,
124 response_t &response,text_t &response_data,
125 ostream &/*logout*/) {
126 response = content;
127 if (args["pxml"] == "1") {
128 response_data = "text/xml";
129 } else {
130 response_data = "text/html";
131 }
132}
133
134bool phindaction::do_action (cgiargsclass &args, recptprotolistclass *protos,
135 browsermapclass * /*browsers*/, displayclass &disp,
136 outconvertclass &outconvert, ostream &textout,
137 ostream &logout) {
138
139 unsigned long count_l, count_e, count_d;
140 unsigned long phrase = args["ppnum"].getulong();
141 text_t &word = args["pptext"];
142 unsigned long first_e = args["pfe"].getulong();
143 unsigned long last_e = args["ple"].getulong();
144 unsigned long first_l = args["pfl"].getulong();
145 unsigned long last_l = args["pll"].getulong();
146 unsigned long first_d = args["pfd"].getulong();
147 unsigned long last_d = args["pld"].getulong();
148 bool XMLmode = false;
149 if (args["pxml"] == "1") XMLmode = true;
150
151 // must have a valid collection server
152 recptproto *collectproto = protos->getrecptproto (args["c"], logout);
153 if (collectproto == NULL) {
154 output_error("phindaction: ERROR: collection not set", textout,
155 outconvert, disp, logout, XMLmode);
156 return true;
157 }
158
159 // the frequency and occurances of the phrase
160 unsigned long tf;
161 vector <unsigned long> el, linkdest, docNums, docfreq;
162 vector <UCArray> linktype;
163
164 // the number of occurances to display
165 unsigned long ef, lf, df;
166
167 text_t basepath = filename_cat(gsdlhome, "collect", args["c"],
168 "index", "phind" + args["pc"]);
169
170 // If we don't know the phrase number, look it up
171 if (phrase == 0) {
172
173 if (word.empty()) {
174 output_error("phindaction: ERROR: no phrase number or word", textout,
175 outconvert, disp, logout, XMLmode);
176 return true;
177 }
178
179 DocNumArray result;
180 find_phrase_number_from_word(basepath, word, result);
181
182 if (result.empty()) {
183 output_error("phindaction: The search term does not occur in the collection",
184 textout, outconvert, disp, logout, XMLmode);
185 return true;
186 } else {
187 phrase = result[0];
188 }
189 }
190
191 // Create a TextData object to read the phrase data (pdata)
192 TextData textdata;
193
194 text_t fullpath = filename_cat(basepath, "pdata");
195 char *fullpathc = fullpath.getcstr();
196#if defined __WIN32__
197 char *base = "";
198#else
199 char *base = "/";
200#endif
201
202 if (!textdata.LoadData (base, fullpathc)) {
203 // FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
204 exit (0);
205 }
206
207 delete fullpathc;
208
209 get_phrase_all_data(textdata, phrase, word, tf, ef, lf, df, el,
210 linkdest, linktype, docNums, docfreq);
211
212 // Output the header
213 if (XMLmode) {
214 textout << "<phinddata id=\"" << phrase
215 << "\" text=\"" << word
216 << "\" tf=\"" << tf
217 << "\" ef=\"" << ef
218 << "\" df=\"" << df
219 << "\" lf=\"" << lf
220 << "\">\n";
221 } else {
222 textout << "<html><head><title>" << word << "</title></head>\n"
223 << "<body><center>\n"
224 << "<p><h1>" << word << "</h1>\n"
225 << "<p><b>"<< word << "</b> occurs "
226 << tf << " times in " << df << " documents\n";
227 }
228
229 // Output the thesaurus links
230 if ((lf > 0) && (first_l < last_l)) {
231
232 // figure out the number of phrases to output
233 if (last_l > lf) {
234 last_l = lf;
235 }
236 count_l = last_l - first_l;
237
238 if (XMLmode) {
239 textout << "<thesauruslist length=\"" << lf
240 << "\" start=\"" << first_l
241 << "\" end=\"" << last_l << "\">\n";
242 print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
243 first_l, last_l, disp, outconvert, textout);
244 textout << "</thesauruslist>\n";
245 }
246
247 // output links as HTML
248 else {
249 if (count_l == lf) {
250 textout << "<p><b> " << count_l << " thesaurus links</b>\n";
251 } else {
252 textout << "<p><b>" << count_l << " of " << lf << " thesaurus links</b>\n";
253 }
254
255 textout << "<p><table border=1><tr><th>type</th><th>topic</th><th>freq</th><th>docs</th></tr>\n";
256 print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
257 first_l, last_l, disp, outconvert, textout);
258 textout << "</table>\n";
259
260 if (last_l < lf) {
261 if ((last_l + 10) < lf) {
262 textout << outconvert << disp
263 << "<br><a href=\"_gwcgi_?"
264 << "c=" << args["c"]
265 << "&ppnum=" << phrase
266 << "&pfe=" << first_e
267 << "&ple=" << last_e
268 << "&pfd=" << first_d
269 << "&pld=" << last_d
270 << "&pfl=" << first_l
271 << "&pll=" << (last_l + 10)
272 << "\">Get more thesaurus links</a>\n";
273 }
274 textout << outconvert << disp
275 << "<br><a href=\"_gwcgi_?"
276 << "c=" << args["c"]
277 << "&ppnum=" << phrase
278 << "&pfe=" << first_e
279 << "&ple=" << last_e
280 << "&pfd=" << first_d
281 << "&pld=" << last_d
282 << "&pfl=" << first_l
283 << "&pll=" << lf
284 << "\">Get every thesaurus link</a>\n" ;
285 }
286 }
287 }
288
289 // Output the expansions
290 if ((ef > 0) && (first_e < last_e)) {
291
292 // figure out the number of phrases to output
293 if (last_e > el.size()) {
294 last_e = el.size();
295 }
296 count_e = last_e - first_e;
297
298 // output expansions as XML
299 if (XMLmode) {
300 textout << "<expansionlist length=\"" << ef
301 << "\" start=\"" << first_e
302 << "\" end=\"" << last_e << "\">" << endl;
303
304 print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
305 last_e, disp, outconvert, textout);
306
307 textout << "</expansionlist>\n";
308 }
309
310 // output expansions as HTML
311 else {
312 if (count_e == el.size()) {
313 textout << "<p><b> " << count_e << " expansions</b>\n";
314 } else {
315 textout << "<p><b>" << count_e << " of " << ef << " expansions</b>\n";
316 }
317
318 textout << "<p><table border=1><tr><th colspan=3>phrase</th><th>freq</th><th>docs</th></tr>\n";
319 print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
320 last_e, disp, outconvert, textout);
321 textout << "</table>\n";
322
323 if (last_e < ef) {
324 if ((last_e + 10) < ef) {
325 textout << outconvert << disp
326 << "<br><a href=\"_gwcgi_?"
327 << "c=" << args["c"]
328 << "&ppnum=" << phrase
329 << "&pfe=" << first_e
330 << "&ple=" << (last_e + 10)
331 << "&pfd=" << first_d
332 << "&pld=" << last_d
333 << "&pfl=" << first_l
334 << "&pll=" << last_l
335 << "\">Get more expansions</a>\n";
336 }
337 textout << outconvert << disp
338 << "<br><a href=\"_gwcgi_?"
339 << "c=" << args["c"]
340 << "&ppnum=" << phrase
341 << "&pfe=" << first_e
342 << "&ple=" << ef
343 << "&pfd=" << first_d
344 << "&pld=" << last_d
345 << "&pfl=" << first_l
346 << "&pll=" << last_l
347 << "\">Get every expansion</a>\n";
348 }
349 }
350 }
351
352 // Output the document occurances
353 if ((df > 0) && (first_d < last_d)) {
354
355 // figure out the phrases to output
356 if (last_d > docNums.size()) {
357 last_d = docNums.size();
358 }
359 count_d = last_d - first_d;
360
361 // output document list as XML
362 if (XMLmode) {
363 textout << "<documentlist length=\"" << df
364 << "\" start=\"" << first_d
365 << "\" end=\"" << last_d << "\">\n";
366
367 print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
368 first_d, last_d, disp, outconvert, textout);
369
370 textout << "</documentlist>\n";
371 }
372
373 // output document list as HTML
374 else {
375
376 if (count_d == docNums.size()) {
377 textout << "<p><b> " << count_d << " documents</b>\n";
378 } else {
379 textout << "<p><b>" << count_d << " of " << df << " documents</b>\n";
380 }
381
382 textout << "<p><table border=1><tr><th align=left>document</th><th>freq</th></tr>\n";
383 print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
384 first_d, last_d, disp, outconvert, textout);
385 textout << "</table>\n";
386
387 if (last_d < df) {
388 if ((last_d + 10) < df) {
389 textout << outconvert << disp
390 << "<br><a href=\"_gwcgi_?"
391 << "c=" << args["c"]
392 << "&ppnum=" << phrase
393 << "&pfe=" << first_e
394 << "&ple=" << last_e
395 << "&pfd=" << first_d
396 << "&pld=" << (last_d + 10)
397 << "&pfl=" << first_l
398 << "&pll=" << last_l
399 << "\">Get more documents</a>\n";
400 }
401 textout << outconvert << disp
402 << "<br><a href=\"_gwcgi_?"
403 << "c=" << args["c"]
404 << "&ppnum=" << phrase
405 << "&pfe=" << first_e
406 << "&ple=" << last_e
407 << "&pfd=" << first_d
408 << "&pld=" << df
409 << "&pfl=" << first_l
410 << "&pll=" << last_l
411 << "\">Get every document</a>\n";
412 }
413 }
414 }
415
416 // Close the document
417 if (XMLmode) {
418 textout << "</phinddata>\n";
419 } else {
420 textout << "</center></body></html>\n";
421 }
422
423 textdata.UnloadData ();
424
425 return true;
426}
427
428// Find the phrase number of a word in the index file
429void phindaction::find_phrase_number_from_word(const text_t &basepath,
430 const text_t &query,
431 DocNumArray &result) {
432
433 // Open the index file for searching
434 IndexData indexData;
435
436 text_t fullpath = filename_cat(basepath, "pword");
437 char *fullpathc = fullpath.getcstr();
438#if defined __WIN32__
439 char *base = "";
440#else
441 char *base = "/";
442#endif
443
444 if (!indexData.LoadData (base, fullpathc)) {
445 // FatalError (1, "Couldn't load index information for \"%s\"", fullpathc);
446 exit (0);
447 }
448
449 delete fullpathc;
450
451 // set up the query object
452 QueryInfo queryInfo;
453 SetCStr (queryInfo.docLevel, "Document");
454 queryInfo.maxDocs = 5;
455 queryInfo.sortByRank = true;
456 queryInfo.exactWeights = false;
457 queryInfo.needRankInfo = true;
458 queryInfo.needTermFreqs = true;
459
460 // mode 1 = casefolded, unstemmed search
461 UCArray ucquery;
462 toUCArray(query, ucquery);
463 QueryNode *queryTree = ParseQuery(ucquery, 1, 1);
464
465 // perform the query
466 ExtQueryResult queryResult;
467 MGQuery (indexData, queryInfo, queryTree, queryResult);
468 // cout << "-- word lookup result -- " << endl << queryResult << endl ;
469
470 result.clear();
471 result = queryResult.docs;
472
473 // delete the query
474 if (queryTree != NULL) delete queryTree;
475
476 indexData.UnloadData();
477}
478
479// Get all the data about a phrase
480//
481// The phrase is stored in textData as record phrase.
482// We retrieve:
483// word - the text of the phrase
484// tf - the total frequency of the phrase
485// ef - the expansion frequency of the phrase
486// lf - the thesaurus link frequency of the phrase
487// df - the document frequency of the phrase
488// el - the list of phrases that are expansions of phrase
489// ll - the list of phrases that are thesaurus links
490// dl - the list of documents that contain phrase
491void phindaction::get_phrase_all_data(TextData &textdata, unsigned long phrase,
492 text_t &word, unsigned long &tf, unsigned long &ef,
493 unsigned long &lf, unsigned long &df,
494 vector <unsigned long> &el,
495 vector <unsigned long> &linkdest,
496 vector <UCArray> &linktype,
497 vector <unsigned long> &docnum,
498 vector <unsigned long> &docfrq) {
499 UCArray text;
500 UCArray docLevel;
501 SetCStr(docLevel, "Document");
502
503 // Look the word up in the textData
504 if (!GetDocText (textdata, docLevel, phrase, text)) {
505 // FatalError (1, "Error while trying to get phrase %u", phrase);
506 exit (0);
507 }
508
509 // Ignore everything up to the first colon
510 UCArray::iterator next = text.begin();
511 while (*next++ != ':');
512
513 // ignore training carriage returns
514 while (text.back() == '\n') {
515 text.pop_back();
516 }
517
518 // Get the word
519 word.clear();
520 for (; *next != ':'; next++) {
521 word.push_back(*next);
522 }
523
524 // Get total frequency
525 tf = 0;
526 for (next++; *next != ':'; next++) {
527 tf *= 10;
528 tf += (*next - '0');
529 }
530
531 // Get expansion frequency
532 ef = 0;
533 for (next++; *next != ':'; next++) {
534 ef *= 10;
535 ef += (*next - '0');
536 }
537
538 // Get document frequency
539 df = 0;
540 for (next++; *next != ':'; next++) {
541 df *= 10;
542 df += (*next - '0');
543 }
544
545 // Get expansion list
546 el.clear();
547 unsigned long e = 0;
548 for (next++; *next != ':'; next++) {
549 if (*next == ',') {
550 el.push_back(e);
551 e = 0;
552 } else {
553 e *= 10;
554 e += (*next - '0');
555 }
556 }
557
558 // Get document list & the document frequency list
559 docnum.clear();
560 docfrq.clear();
561 bool readnum = false;
562 unsigned long d = 0;
563 for (next++; *next != ':'; next++) {
564 if (*next == ',') {
565 docnum.push_back(d);
566 readnum = true;
567 d = 0;
568 } else if (*next == ';') {
569 if (readnum) {
570 docfrq.push_back(d);
571 } else {
572 docnum.push_back(d);
573 docfrq.push_back(1);
574 }
575 readnum = false;
576 d = 0;
577 } else {
578 d *= 10;
579 d += (*next - '0');
580 }
581 }
582
583 // Get thesaurus link frequency & link list
584 text.push_back(':');
585 text.push_back(':');
586
587 // link frequency
588 lf = 0;
589 for (next++; *next != ':'; next++) {
590 lf *= 10;
591 lf += (*next - '0');
592 }
593
594 // two lists of link data
595 linkdest.clear();
596 linktype.clear();
597
598 UCArray thistype;
599 thistype.clear();
600 bool typedone = false;
601 unsigned long l = 0;
602 for (next++; *next != ':'; next++) {
603
604 if (!typedone) {
605 // first read the link type, a charactor string
606 if (*next == ',') {
607 typedone = true;
608 } else {
609 thistype.push_back(*next);
610 }
611 } else {
612 // having read the link type, read the list of link destinations
613 if (*next == ',') {
614 linkdest.push_back(l);
615 linktype.push_back(thistype);
616 l = 0;
617 } else if (*next == ';') {
618 linkdest.push_back(l);
619 linktype.push_back(thistype);
620 l = 0;
621 thistype.clear();
622 typedone = false;
623 } else {
624 l *= 10;
625 l += (*next - '0');
626 }
627 }
628 }
629}
630
631void phindaction::print_thesaurus_links(const text_t &collection, bool XMLmode,
632 TextData &textdata, vector <unsigned long> &linkdest,
633 vector <UCArray> &linktype, unsigned long first,
634 unsigned long last, displayclass &disp,
635 outconvertclass &outconvert, ostream &textout) {
636
637 // information describing each link in the list
638 unsigned long phrase, tf, ef, df;
639 UCArray type, text;
640
641 for (unsigned long l = first; l < last; l++) {
642
643 // get the phrase data
644 phrase = linkdest[l];
645 type = linktype[l];
646 get_phrase_freq_data(textdata, phrase, text, tf, ef, df);
647
648 if (XMLmode) {
649 textout << "<thesaurus num=\"" << l
650 << "\" id=\"" << phrase
651 << "\" tf=\"" << tf
652 << "\" df=\"" << df
653 << "\" type=\"" << type
654 << "\" text=\"" << text
655 << "\"/>\n";
656 } else {
657 textout << "<tr valign=top><td>" << type << "</td><td>";
658 textout << outconvert << disp
659 << "<a href=\"_gwcgi_?c=" << collection;
660 textout << "&ppnum=" << phrase << "\">" << text << "</a>"
661 << "</td><td>" << tf << "</td><td>" << df << "</td></tr>\n";
662 }
663 }
664}
665
666// Get the frequency data about a phrase
667//
668// The phrase is stored in textData as record phrase.
669// We retrieve:
670// word - the text of the phrase
671// tf - the total frequency of the phrase
672// ef - the expansion frequency of the phrase
673// df - the document frequency of the phrase
674void phindaction::get_phrase_freq_data(TextData &textdata, unsigned long phrase,
675 UCArray &word, unsigned long &tf,
676 unsigned long &ef, unsigned long &df) {
677
678 UCArray text;
679 UCArray docLevel;
680 SetCStr(docLevel, "Document");
681
682 // Look the word up in the textData
683 if (!GetDocText (textdata, docLevel, phrase, text)) {
684 // FatalError (1, "Error while trying to get phrase %u", phrase);
685 exit (0);
686 }
687
688 // Ignore everything up to the first colon
689 UCArray::iterator next = text.begin();
690 while (*next++ != ':');
691
692 // Get the word
693 word.clear();
694 for (; *next != ':'; next++) {
695 word.push_back(*next);
696 }
697
698 // Get total frequency
699 tf = 0;
700 for (next++; *next != ':'; next++) {
701 tf *= 10;
702 tf += (*next - '0');
703 }
704
705 // Get expansion frequency
706 ef = 0;
707 for (next++; *next != ':'; next++) {
708 ef *= 10;
709 ef += (*next - '0');
710 }
711
712 // Get document frequency
713 df = 0;
714 for (next++; *next != ':'; next++) {
715 df *= 10;
716 df += (*next - '0');
717 }
718}
719
720// Print a list of expansions
721//
722// Given the textData and a list of phrase numbers, print out each of the
723// expansions.
724void phindaction::print_expansions(const text_t &collection, bool XMLmode,
725 const text_t &body, TextData &textdata,
726 const vector <unsigned long> &elist,
727 unsigned long first, unsigned long last,
728 displayclass &disp, outconvertclass &outconvert,
729 ostream &textout) {
730
731 UCArray word;
732 unsigned long phrase, tf, df, ef;
733
734 UCArray suffix, prefix, ucbody;
735
736 toUCArray(body, ucbody);
737
738 for (unsigned long e = first; e < last; e++) {
739
740 phrase = elist[e];
741 get_phrase_freq_data(textdata, phrase, word, tf, ef, df);
742
743 split_phrase(word, ucbody, prefix, suffix);
744
745 if (XMLmode) {
746 // body is always the same as the text of the phrase, so no need to send it
747 textout << "<expansion num=\"" << e
748 << "\" id=\"" << phrase
749 << "\" tf=\"" << tf
750 << "\" df=\"" << df;
751 if (!prefix.empty()) {
752 textout << "\" prefix=\"" << prefix;
753 }
754 if (!suffix.empty()) {
755 textout << "\" suffix=\"" << suffix;
756 }
757 textout << "\"/>\n";
758 } else {
759 textout << outconvert << disp
760 << "<tr valign=top><td align=right><a href=\"_gwcgi_?"
761 << "c=" << collection << "&ppnum=" << phrase << "\">";
762 textout << prefix << "</a></td>";
763 textout <<outconvert << disp
764 << "<td align=center><a href=\"_gwcgi_?"
765 << "c=" << collection << "&ppnum=" << phrase << "\">"
766 << body << "</a></td>"
767 << "<td align=left><a href=\"_gwcgi_?"
768 << "c=" << collection << "&ppnum=" << phrase << "\">";
769 textout << suffix << "</a></td>"
770 << "<td>" << tf << "</td><td>" << df << "</td></tr>\n";
771 }
772 }
773}
774
775// split an expansion into prefix and suffix
776void phindaction::split_phrase(const UCArray &word, const UCArray &body,
777 UCArray &prefix, UCArray &suffix) {
778
779 prefix.clear();
780 suffix.clear();
781
782 bool readingPrefix = true;
783 UCArray::const_iterator here = word.begin();
784 UCArray::const_iterator end = word.end();
785
786 while (here != end) {
787
788 // if we've not read all the prefix, add the next char to the prefix
789 if (readingPrefix) {
790 if (phrase_match(body, here, end)) {
791 readingPrefix = false;
792 // trim whitespace from end of prefix & start of suffix
793 if (!prefix.empty()) {
794 prefix.pop_back();
795 }
796 if ((here != end) && (*here == ' ')) {
797 here++;
798 }
799 } else {
800 prefix.push_back(*here);
801 here++;
802 }
803 }
804 // if we've finished with the prefix, update the suffix
805 else {
806 suffix.push_back(*here);
807 here++;
808 }
809 }
810}
811
812// phrase_match
813//
814// compare two strings, one represented as an UCArray, the other as two
815// UCArray iterators.
816//
817// Return true if the UCArray is the same as the phrase the iterators point
818// to for the length of the UCArray.
819bool phindaction::phrase_match(const UCArray &text, UCArray::const_iterator &here,
820 UCArray::const_iterator end) {
821
822 UCArray::const_iterator one_here = text.begin();
823 UCArray::const_iterator one_end = text.end();
824 UCArray::const_iterator two_here = here;
825
826 // iterate over the length of the first string, comparing each element to
827 // the corresponding element in the second string.
828 while (one_here != one_end) {
829
830 if (two_here == end) {
831 return false;
832 } else if (*one_here != *two_here) {
833 return false;
834 }
835 one_here++;
836 two_here++;
837 }
838
839 here = two_here;
840 return true;
841}
842
843void phindaction::print_documents(bool XMLmode, const text_t &basepath,
844 const text_t &collection,
845 const vector <unsigned long> &docNums,
846 const vector <unsigned long> &docFreq,
847 unsigned long first, unsigned long last,
848 displayclass &disp, outconvertclass &outconvert,
849 ostream &textout) {
850
851 // Create a TextData object to read the document data
852 TextData docdata;
853
854 text_t fullpath = filename_cat(basepath, "docs");
855 char *fullpathc = fullpath.getcstr();
856#if defined __WIN32__
857 char *base = "";
858#else
859 char *base = "/";
860#endif
861
862 if (!docdata.LoadData (base, fullpathc)) {
863 // FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
864 exit (0);
865 }
866
867 delete fullpathc;
868
869 UCArray title, hash;
870 unsigned long freq, doc;
871
872 for (unsigned long d = first; d < last; d++) {
873 doc = docNums[d];
874 freq = docFreq[d];
875
876 get_document_all_data(docdata, doc, title, hash);
877
878 if (XMLmode) {
879 textout << "<document num=\"" << d
880 << "\" hash=\"" << hash
881 << "\" freq=\"" << freq
882 << "\" title=\"" << title << "\"/>\n";
883 } else {
884 textout << outconvert << disp
885 << "<tr valign=top><td><a href=\"_gwcgi_?"
886 << "c=" << collection;
887 textout << "&a=d&d=" << hash << "\">" << title << "</a>"
888 << "</td><td>" << freq << "</td></tr>\n";
889 }
890 }
891
892 docdata.UnloadData();
893}
894
895// Get all the data about a docment
896//
897// The document's details are stored in docData as record docNum.
898// We retrieve:
899// title - the document's title
900// hash - the document's unique OID
901void phindaction::get_document_all_data(TextData &docdata, unsigned long docNum,
902 UCArray &title, UCArray &hash) {
903
904 UCArray text;
905 UCArray docLevel;
906 SetCStr(docLevel, "Document");
907
908 // Look the word up in the textData
909 if (!GetDocText (docdata, docLevel, docNum, text)) {
910 // FatalError (1, "Error while trying to get document %u", docNum);
911 exit (0);
912 }
913
914 // Ignore everything up to the first colon
915 UCArray::iterator next = text.begin();
916 while (*next++ != '\t');
917
918 // Get the document OID (hash)
919 hash.clear();
920 for (; *next != '\t'; next++) {
921 hash.push_back(*next);
922 }
923
924 // Get the title
925 text.push_back('\n');
926 title.clear();
927 for (next++; *next != '\n'; next++) {
928 title.push_back(*next);
929 }
930}
931
932void phindaction::toUCArray(const text_t &in, UCArray &out) {
933 out.clear();
934 text_t::const_iterator here = in.begin();
935 text_t::const_iterator end = in.end();
936 while (here != end) {
937 out.push_back((unsigned char) *here);
938 here++;
939 }
940}
941
942void phindaction::output_error (const text_t &message, ostream &textout,
943 outconvertclass &outconvert,
944 displayclass & disp, ostream &logout,
945 bool XMLmode) {
946
947 logout << outconvert << message << "\n";
948 if (XMLmode) {
949 textout << outconvert
950 << "<phinddata>\n"
951 << "<phinderror>" << message << "</phinderror>\n"
952 << "</phinddata>\n";
953 } else {
954 textout << outconvert << disp
955 << "_header_\n"
956 << message
957 << "_footer_\n";
958 }
959}
Note: See TracBrowser for help on using the repository browser.