source: trunk/gsdl/src/recpt/phindaction.cpp@ 3036

Last change on this file since 3036 was 2542, checked in by sjboddie, 23 years ago

* empty log message *

  • Property svn:keywords set to Author Date Id Revision
File size: 26.0 KB
Line 
1/**********************************************************************
2 *
3 * phindaction.cpp --
4 *
5 * Copyright 2001 Gordon W. Paynter
6 * Copyright 2001 The New Zealand Digital Library Project
7 *
8 * A component of the Greenstone digital library software
9 * from the New Zealand Digital Library Project at the
10 * University of Waikato, New Zealand.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *
26 *********************************************************************/
27
28// Note that this action uses mgpp to retrieve phind info, calling MGQuery
29// etc. directly, not through the protocol. This breaks our receptionist -
30// collection server separation and should be fixed some day I guess.
31
32#include "phindaction.h"
33#include "fileutil.h"
34
35phindaction::phindaction () {
36
37 cgiarginfo arg_ainfo;
38
39 arg_ainfo.shortname = "pc";
40 arg_ainfo.longname = "phind classifier";
41 arg_ainfo.multiplechar = true;
42 arg_ainfo.defaultstatus = cgiarginfo::weak;
43 arg_ainfo.argdefault = "";
44 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
45 argsinfo.addarginfo (NULL, arg_ainfo);
46
47 arg_ainfo.shortname = "pxml";
48 arg_ainfo.longname = "phind XML mode";
49 arg_ainfo.multiplechar = false;
50 arg_ainfo.defaultstatus = cgiarginfo::weak;
51 arg_ainfo.argdefault = "0";
52 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
53 argsinfo.addarginfo (NULL, arg_ainfo);
54
55 arg_ainfo.shortname = "ppnum";
56 arg_ainfo.longname = "phind phrase number";
57 arg_ainfo.multiplechar = true;
58 arg_ainfo.defaultstatus = cgiarginfo::weak;
59 arg_ainfo.argdefault = "0";
60 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
61 argsinfo.addarginfo (NULL, arg_ainfo);
62
63 arg_ainfo.shortname = "pptext";
64 arg_ainfo.longname = "phind phrase text";
65 arg_ainfo.multiplechar = true;
66 arg_ainfo.defaultstatus = cgiarginfo::weak;
67 arg_ainfo.argdefault = "";
68 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
69 argsinfo.addarginfo (NULL, arg_ainfo);
70
71 arg_ainfo.shortname = "pfe";
72 arg_ainfo.longname = "phind first_e";
73 arg_ainfo.multiplechar = true;
74 arg_ainfo.defaultstatus = cgiarginfo::weak;
75 arg_ainfo.argdefault = "0";
76 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
77 argsinfo.addarginfo (NULL, arg_ainfo);
78
79 arg_ainfo.shortname = "ple";
80 arg_ainfo.longname = "phind last_e";
81 arg_ainfo.multiplechar = true;
82 arg_ainfo.defaultstatus = cgiarginfo::weak;
83 arg_ainfo.argdefault = "10";
84 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
85 argsinfo.addarginfo (NULL, arg_ainfo);
86
87 arg_ainfo.shortname = "pfl";
88 arg_ainfo.longname = "phind first_l";
89 arg_ainfo.multiplechar = true;
90 arg_ainfo.defaultstatus = cgiarginfo::weak;
91 arg_ainfo.argdefault = "0";
92 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
93 argsinfo.addarginfo (NULL, arg_ainfo);
94
95 arg_ainfo.shortname = "pll";
96 arg_ainfo.longname = "phind last_l";
97 arg_ainfo.multiplechar = true;
98 arg_ainfo.defaultstatus = cgiarginfo::weak;
99 arg_ainfo.argdefault = "10";
100 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
101 argsinfo.addarginfo (NULL, arg_ainfo);
102
103 arg_ainfo.shortname = "pfd";
104 arg_ainfo.longname = "phind first_d";
105 arg_ainfo.multiplechar = true;
106 arg_ainfo.defaultstatus = cgiarginfo::weak;
107 arg_ainfo.argdefault = "0";
108 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
109 argsinfo.addarginfo (NULL, arg_ainfo);
110
111 arg_ainfo.shortname = "pld";
112 arg_ainfo.longname = "phind last_d";
113 arg_ainfo.multiplechar = true;
114 arg_ainfo.defaultstatus = cgiarginfo::weak;
115 arg_ainfo.argdefault = "10";
116 arg_ainfo.savedarginfo = cgiarginfo::mustnot;
117 argsinfo.addarginfo (NULL, arg_ainfo);
118}
119
120phindaction::~phindaction () {
121}
122
123void phindaction::get_cgihead_info (cgiargsclass &/*args*/, recptprotolistclass * /*protos*/,
124 response_t &response,text_t &response_data,
125 ostream &/*logout*/) {
126 response = content;
127 response_data = "text/html";
128}
129
130bool phindaction::do_action (cgiargsclass &args, recptprotolistclass *protos,
131 browsermapclass * /*browsers*/, displayclass &disp,
132 outconvertclass &outconvert, ostream &textout,
133 ostream &logout) {
134
135 unsigned long count_l, count_e, count_d;
136 unsigned long phrase = args["ppnum"].getulong();
137 text_t &word = args["pptext"];
138 unsigned long first_e = args["pfe"].getulong();
139 unsigned long last_e = args["ple"].getulong();
140 unsigned long first_l = args["pfl"].getulong();
141 unsigned long last_l = args["pll"].getulong();
142 unsigned long first_d = args["pfd"].getulong();
143 unsigned long last_d = args["pld"].getulong();
144 bool XMLmode = false;
145 if (args["pxml"] == "1") XMLmode = true;
146
147 // must have a valid collection server
148 recptproto *collectproto = protos->getrecptproto (args["c"], logout);
149 if (collectproto == NULL) {
150 output_error("phindaction: ERROR: collection not set", textout,
151 outconvert, disp, logout, XMLmode);
152 return true;
153 }
154
155 // the frequency and occurances of the phrase
156 unsigned long tf;
157 vector <unsigned long> el, linkdest, docNums, docfreq;
158 vector <UCArray> linktype;
159
160 // the number of occurances to display
161 unsigned long ef, lf, df;
162
163 text_t basepath = filename_cat(gsdlhome, "collect", args["c"],
164 "index", "phind" + args["pc"]);
165
166 // If we don't know the phrase number, look it up
167 if (phrase == 0) {
168
169 if (word.empty()) {
170 output_error("phindaction: ERROR: no phrase number or word", textout,
171 outconvert, disp, logout, XMLmode);
172 return true;
173 }
174
175 DocNumArray result;
176 find_phrase_number_from_word(basepath, word, result);
177
178 if (result.empty()) {
179 output_error("phindaction: The search term does not occur in the collection",
180 textout, outconvert, disp, logout, XMLmode);
181 return true;
182 } else {
183 phrase = result[0];
184 }
185 }
186
187 // Create a TextData object to read the phrase data (pdata)
188 TextData textdata;
189
190 text_t fullpath = filename_cat(basepath, "pdata");
191 char *fullpathc = fullpath.getcstr();
192#if defined __WIN32__
193 char *base = "";
194#else
195 char *base = "/";
196#endif
197
198 if (!textdata.LoadData (base, fullpathc)) {
199 // FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
200 exit (0);
201 }
202
203 delete fullpathc;
204
205 get_phrase_all_data(textdata, phrase, word, tf, ef, lf, df, el,
206 linkdest, linktype, docNums, docfreq);
207
208 // Output the header
209 if (XMLmode) {
210 textout << "Content-type: text/plain\n\n"
211 << "<phinddata id=\"" << phrase
212 << "\" text=\"" << word
213 << "\" tf=\"" << tf
214 << "\" ef=\"" << ef
215 << "\" df=\"" << df
216 << "\" lf=\"" << lf
217 << "\">\n";
218 } else {
219 textout << "Content-type: text/html\n\n"
220 << "<html><head><title>" << word << "</title></head>\n"
221 << "<body><center>\n"
222 << "<p><h1>" << word << "</h1>\n"
223 << "<p><b>"<< word << "</b> occurs "
224 << tf << " times in " << df << " documents\n";
225 }
226
227 // Output the thesaurus links
228 if ((lf > 0) && (first_l < last_l)) {
229
230 // figure out the number of phrases to output
231 if (last_l > lf) {
232 last_l = lf;
233 }
234 count_l = last_l - first_l;
235
236 if (XMLmode) {
237 textout << "<thesauruslist length=\"" << lf
238 << "\" start=\"" << first_l
239 << "\" end=\"" << last_l << "\">\n";
240 print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
241 first_l, last_l, disp, outconvert, textout);
242 textout << "</thesauruslist>\n";
243 }
244
245 // output links as HTML
246 else {
247 if (count_l == lf) {
248 textout << "<p><b> " << count_l << " thesaurus links</b>\n";
249 } else {
250 textout << "<p><b>" << count_l << " of " << lf << " thesaurus links</b>\n";
251 }
252
253 textout << "<p><table border=1><tr><th>type</th><th>topic</th><th>freq</th><th>docs</th></tr>\n";
254 print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
255 first_l, last_l, disp, outconvert, textout);
256 textout << "</table>\n";
257
258 if (last_l < lf) {
259 if ((last_l + 10) < lf) {
260 textout << outconvert << disp
261 << "<br><a href=\"_gwcgi_?"
262 << "c=" << args["c"]
263 << "&ppnum=" << phrase
264 << "&pfe=" << first_e
265 << "&ple=" << last_e
266 << "&pfd=" << first_d
267 << "&pld=" << last_d
268 << "&pfl=" << first_l
269 << "&pll=" << (last_l + 10)
270 << "\">Get more thesaurus links</a>\n";
271 }
272 textout << outconvert << disp
273 << "<br><a href=\"_gwcgi_?"
274 << "c=" << args["c"]
275 << "&ppnum=" << phrase
276 << "&pfe=" << first_e
277 << "&ple=" << last_e
278 << "&pfd=" << first_d
279 << "&pld=" << last_d
280 << "&pfl=" << first_l
281 << "&pll=" << lf
282 << "\">Get every thesaurus link</a>\n" ;
283 }
284 }
285 }
286
287 // Output the expansions
288 if ((ef > 0) && (first_e < last_e)) {
289
290 // figure out the number of phrases to output
291 if (last_e > el.size()) {
292 last_e = el.size();
293 }
294 count_e = last_e - first_e;
295
296 // output expansions as XML
297 if (XMLmode) {
298 textout << "<expansionlist length=\"" << ef
299 << "\" start=\"" << first_e
300 << "\" end=\"" << last_e << "\">" << endl;
301
302 print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
303 last_e, disp, outconvert, textout);
304
305 textout << "</expansionlist>\n";
306 }
307
308 // output expansions as HTML
309 else {
310 if (count_e == el.size()) {
311 textout << "<p><b> " << count_e << " expansions</b>\n";
312 } else {
313 textout << "<p><b>" << count_e << " of " << ef << " expansions</b>\n";
314 }
315
316 textout << "<p><table border=1><tr><th colspan=3>phrase</th><th>freq</th><th>docs</th></tr>\n";
317 print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
318 last_e, disp, outconvert, textout);
319 textout << "</table>\n";
320
321 if (last_e < ef) {
322 if ((last_e + 10) < ef) {
323 textout << outconvert << disp
324 << "<br><a href=\"_gwcgi_?"
325 << "c=" << args["c"]
326 << "&ppnum=" << phrase
327 << "&pfe=" << first_e
328 << "&ple=" << (last_e + 10)
329 << "&pfd=" << first_d
330 << "&pld=" << last_d
331 << "&pfl=" << first_l
332 << "&pll=" << last_l
333 << "\">Get more expansions</a>\n";
334 }
335 textout << outconvert << disp
336 << "<br><a href=\"_gwcgi_?"
337 << "c=" << args["c"]
338 << "&ppnum=" << phrase
339 << "&pfe=" << first_e
340 << "&ple=" << ef
341 << "&pfd=" << first_d
342 << "&pld=" << last_d
343 << "&pfl=" << first_l
344 << "&pll=" << last_l
345 << "\">Get every expansion</a>\n";
346 }
347 }
348 }
349
350 // Output the document occurances
351 if ((df > 0) && (first_d < last_d)) {
352
353 // figure out the phrases to output
354 if (last_d > docNums.size()) {
355 last_d = docNums.size();
356 }
357 count_d = last_d - first_d;
358
359 // output document list as XML
360 if (XMLmode) {
361 textout << "<documentlist length=\"" << df
362 << "\" start=\"" << first_d
363 << "\" end=\"" << last_d << "\">\n";
364
365 print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
366 first_d, last_d, disp, outconvert, textout);
367
368 textout << "</documentlist>\n";
369 }
370
371 // output document list as HTML
372 else {
373
374 if (count_d == docNums.size()) {
375 textout << "<p><b> " << count_d << " documents</b>\n";
376 } else {
377 textout << "<p><b>" << count_d << " of " << df << " documents</b>\n";
378 }
379
380 textout << "<p><table border=1><tr><th align=left>document</th><th>freq</th></tr>\n";
381 print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
382 first_d, last_d, disp, outconvert, textout);
383 textout << "</table>\n";
384
385 if (last_d < df) {
386 if ((last_d + 10) < df) {
387 textout << outconvert << disp
388 << "<br><a href=\"_gwcgi_?"
389 << "c=" << args["c"]
390 << "&ppnum=" << phrase
391 << "&pfe=" << first_e
392 << "&ple=" << last_e
393 << "&pfd=" << first_d
394 << "&pld=" << (last_d + 10)
395 << "&pfl=" << first_l
396 << "&pll=" << last_l
397 << "\">Get more documents</a>\n";
398 }
399 textout << outconvert << disp
400 << "<br><a href=\"_gwcgi_?"
401 << "c=" << args["c"]
402 << "&ppnum=" << phrase
403 << "&pfe=" << first_e
404 << "&ple=" << last_e
405 << "&pfd=" << first_d
406 << "&pld=" << df
407 << "&pfl=" << first_l
408 << "&pll=" << last_l
409 << "\">Get every document</a>\n";
410 }
411 }
412 }
413
414 // Close the document
415 if (XMLmode) {
416 textout << "</phinddata>\n";
417 } else {
418 textout << "</center></body></html>\n";
419 }
420
421 textdata.UnloadData ();
422
423 return true;
424}
425
426// Find the phrase number of a word in the index file
427void phindaction::find_phrase_number_from_word(const text_t &basepath,
428 const text_t &query,
429 DocNumArray &result) {
430
431 // Open the index file for searching
432 IndexData indexData;
433
434 text_t fullpath = filename_cat(basepath, "pword");
435 char *fullpathc = fullpath.getcstr();
436#if defined __WIN32__
437 char *base = "";
438#else
439 char *base = "/";
440#endif
441
442 if (!indexData.LoadData (base, fullpathc)) {
443 // FatalError (1, "Couldn't load index information for \"%s\"", fullpathc);
444 exit (0);
445 }
446
447 delete fullpathc;
448
449 // set up the query object
450 QueryInfo queryInfo;
451 SetCStr (queryInfo.docLevel, "Document");
452 queryInfo.maxDocs = 5;
453 queryInfo.sortByRank = true;
454 queryInfo.exactWeights = false;
455 queryInfo.needRankInfo = true;
456 queryInfo.needTermFreqs = true;
457
458 // mode 1 = casefolded, unstemmed search
459 UCArray ucquery;
460 toUCArray(query, ucquery);
461 QueryNode *queryTree = ParseQuery(ucquery, 1, 1);
462
463 // perform the query
464 ExtQueryResult queryResult;
465 MGQuery (indexData, queryInfo, queryTree, queryResult);
466 // cout << "-- word lookup result -- " << endl << queryResult << endl ;
467
468 result.clear();
469 result = queryResult.docs;
470
471 // delete the query
472 if (queryTree != NULL) delete queryTree;
473
474 indexData.UnloadData();
475}
476
477// Get all the data about a phrase
478//
479// The phrase is stored in textData as record phrase.
480// We retrieve:
481// word - the text of the phrase
482// tf - the total frequency of the phrase
483// ef - the expansion frequency of the phrase
484// lf - the thesaurus link frequency of the phrase
485// df - the document frequency of the phrase
486// el - the list of phrases that are expansions of phrase
487// ll - the list of phrases that are thesaurus links
488// dl - the list of documents that contain phrase
489void phindaction::get_phrase_all_data(TextData &textdata, unsigned long phrase,
490 text_t &word, unsigned long &tf, unsigned long &ef,
491 unsigned long &lf, unsigned long &df,
492 vector <unsigned long> &el,
493 vector <unsigned long> &linkdest,
494 vector <UCArray> &linktype,
495 vector <unsigned long> &docnum,
496 vector <unsigned long> &docfrq) {
497 UCArray text;
498 UCArray docLevel;
499 SetCStr(docLevel, "Document");
500
501 // Look the word up in the textData
502 if (!GetDocText (textdata, docLevel, phrase, text)) {
503 // FatalError (1, "Error while trying to get phrase %u", phrase);
504 exit (0);
505 }
506
507 // Ignore everything up to the first colon
508 UCArray::iterator next = text.begin();
509 while (*next++ != ':');
510
511 // ignore training carriage returns
512 while (text.back() == '\n') {
513 text.pop_back();
514 }
515
516 // Get the word
517 word.clear();
518 for (; *next != ':'; next++) {
519 word.push_back(*next);
520 }
521
522 // Get total frequency
523 tf = 0;
524 for (next++; *next != ':'; next++) {
525 tf *= 10;
526 tf += (*next - '0');
527 }
528
529 // Get expansion frequency
530 ef = 0;
531 for (next++; *next != ':'; next++) {
532 ef *= 10;
533 ef += (*next - '0');
534 }
535
536 // Get document frequency
537 df = 0;
538 for (next++; *next != ':'; next++) {
539 df *= 10;
540 df += (*next - '0');
541 }
542
543 // Get expansion list
544 el.clear();
545 unsigned long e = 0;
546 for (next++; *next != ':'; next++) {
547 if (*next == ',') {
548 el.push_back(e);
549 e = 0;
550 } else {
551 e *= 10;
552 e += (*next - '0');
553 }
554 }
555
556 // Get document list & the document frequency list
557 docnum.clear();
558 docfrq.clear();
559 bool readnum = false;
560 unsigned long d = 0;
561 for (next++; *next != ':'; next++) {
562 if (*next == ',') {
563 docnum.push_back(d);
564 readnum = true;
565 d = 0;
566 } else if (*next == ';') {
567 if (readnum) {
568 docfrq.push_back(d);
569 } else {
570 docnum.push_back(d);
571 docfrq.push_back(1);
572 }
573 readnum = false;
574 d = 0;
575 } else {
576 d *= 10;
577 d += (*next - '0');
578 }
579 }
580
581 // Get thesaurus link frequency & link list
582 text.push_back(':');
583 text.push_back(':');
584
585 // link frequency
586 lf = 0;
587 for (next++; *next != ':'; next++) {
588 lf *= 10;
589 lf += (*next - '0');
590 }
591
592 // two lists of link data
593 linkdest.clear();
594 linktype.clear();
595
596 UCArray thistype;
597 thistype.clear();
598 bool typedone = false;
599 unsigned long l = 0;
600 for (next++; *next != ':'; next++) {
601
602 if (!typedone) {
603 // first read the link type, a charactor string
604 if (*next == ',') {
605 typedone = true;
606 } else {
607 thistype.push_back(*next);
608 }
609 } else {
610 // having read the link type, read the list of link destinations
611 if (*next == ',') {
612 linkdest.push_back(l);
613 linktype.push_back(thistype);
614 l = 0;
615 } else if (*next == ';') {
616 linkdest.push_back(l);
617 linktype.push_back(thistype);
618 l = 0;
619 thistype.clear();
620 typedone = false;
621 } else {
622 l *= 10;
623 l += (*next - '0');
624 }
625 }
626 }
627}
628
629void phindaction::print_thesaurus_links(const text_t &collection, bool XMLmode,
630 TextData &textdata, vector <unsigned long> &linkdest,
631 vector <UCArray> &linktype, unsigned long first,
632 unsigned long last, displayclass &disp,
633 outconvertclass &outconvert, ostream &textout) {
634
635 // information describing each link in the list
636 unsigned long phrase, tf, ef, df;
637 UCArray type, text;
638
639 for (unsigned long l = first; l < last; l++) {
640
641 // get the phrase data
642 phrase = linkdest[l];
643 type = linktype[l];
644 get_phrase_freq_data(textdata, phrase, text, tf, ef, df);
645
646 if (XMLmode) {
647 textout << "<thesaurus num=\"" << l
648 << "\" id=\"" << phrase
649 << "\" tf=\"" << tf
650 << "\" df=\"" << df
651 << "\" type=\"" << type
652 << "\" text=\"" << text
653 << "\"/>\n";
654 } else {
655 textout << "<tr valign=top><td>" << type << "</td><td>";
656 textout << outconvert << disp
657 << "<a href=\"_gwcgi_?c=" << collection;
658 textout << "&ppnum=" << phrase << "\">" << text << "</a>"
659 << "</td><td>" << tf << "</td><td>" << df << "</td></tr>\n";
660 }
661 }
662}
663
664// Get the frequency data about a phrase
665//
666// The phrase is stored in textData as record phrase.
667// We retrieve:
668// word - the text of the phrase
669// tf - the total frequency of the phrase
670// ef - the expansion frequency of the phrase
671// df - the document frequency of the phrase
672void phindaction::get_phrase_freq_data(TextData &textdata, unsigned long phrase,
673 UCArray &word, unsigned long &tf,
674 unsigned long &ef, unsigned long &df) {
675
676 UCArray text;
677 UCArray docLevel;
678 SetCStr(docLevel, "Document");
679
680 // Look the word up in the textData
681 if (!GetDocText (textdata, docLevel, phrase, text)) {
682 // FatalError (1, "Error while trying to get phrase %u", phrase);
683 exit (0);
684 }
685
686 // Ignore everything up to the first colon
687 UCArray::iterator next = text.begin();
688 while (*next++ != ':');
689
690 // Get the word
691 word.clear();
692 for (; *next != ':'; next++) {
693 word.push_back(*next);
694 }
695
696 // Get total frequency
697 tf = 0;
698 for (next++; *next != ':'; next++) {
699 tf *= 10;
700 tf += (*next - '0');
701 }
702
703 // Get expansion frequency
704 ef = 0;
705 for (next++; *next != ':'; next++) {
706 ef *= 10;
707 ef += (*next - '0');
708 }
709
710 // Get document frequency
711 df = 0;
712 for (next++; *next != ':'; next++) {
713 df *= 10;
714 df += (*next - '0');
715 }
716}
717
718// Print a list of expansions
719//
720// Given the textData and a list of phrase numbers, print out each of the
721// expansions.
722void phindaction::print_expansions(const text_t &collection, bool XMLmode,
723 const text_t &body, TextData &textdata,
724 const vector <unsigned long> &elist,
725 unsigned long first, unsigned long last,
726 displayclass &disp, outconvertclass &outconvert,
727 ostream &textout) {
728
729 UCArray word;
730 unsigned long phrase, tf, df, ef;
731
732 UCArray suffix, prefix, ucbody;
733
734 toUCArray(body, ucbody);
735
736 for (unsigned long e = first; e < last; e++) {
737
738 phrase = elist[e];
739 get_phrase_freq_data(textdata, phrase, word, tf, ef, df);
740
741 split_phrase(word, ucbody, prefix, suffix);
742
743 if (XMLmode) {
744 // body is always the same as the text of the phrase, so no need to send it
745 textout << "<expansion num=\"" << e
746 << "\" id=\"" << phrase
747 << "\" tf=\"" << tf
748 << "\" df=\"" << df;
749 if (!prefix.empty()) {
750 textout << "\" prefix=\"" << prefix;
751 }
752 if (!suffix.empty()) {
753 textout << "\" suffix=\"" << suffix;
754 }
755 textout << "\"/>\n";
756 } else {
757 textout << outconvert << disp
758 << "<tr valign=top><td align=right><a href=\"_gwcgi_?"
759 << "c=" << collection << "&ppnum=" << phrase << "\">";
760 textout << prefix << "</a></td>";
761 textout <<outconvert << disp
762 << "<td align=center><a href=\"_gwcgi_?"
763 << "c=" << collection << "&ppnum=" << phrase << "\">"
764 << body << "</a></td>"
765 << "<td align=left><a href=\"_gwcgi_?"
766 << "c=" << collection << "&ppnum=" << phrase << "\">";
767 textout << suffix << "</a></td>"
768 << "<td>" << tf << "</td><td>" << df << "</td></tr>\n";
769 }
770 }
771}
772
773// split an expansion into prefix and suffix
774void phindaction::split_phrase(const UCArray &word, const UCArray &body,
775 UCArray &prefix, UCArray &suffix) {
776
777 prefix.clear();
778 suffix.clear();
779
780 bool readingPrefix = true;
781 UCArray::const_iterator here = word.begin();
782 UCArray::const_iterator end = word.end();
783
784 while (here != end) {
785
786 // if we've not read all the prefix, add the next char to the prefix
787 if (readingPrefix) {
788 if (phrase_match(body, here, end)) {
789 readingPrefix = false;
790 // trim whitespace from end of prefix & start of suffix
791 if (!prefix.empty()) {
792 prefix.pop_back();
793 }
794 if ((here != end) && (*here == ' ')) {
795 here++;
796 }
797 } else {
798 prefix.push_back(*here);
799 here++;
800 }
801 }
802 // if we've finished with the prefix, update the suffix
803 else {
804 suffix.push_back(*here);
805 here++;
806 }
807 }
808}
809
810// phrase_match
811//
812// compare two strings, one represented as an UCArray, the other as two
813// UCArray iterators.
814//
815// Return true if the UCArray is the same as the phrase the iterators point
816// to for the length of the UCArray.
817bool phindaction::phrase_match(const UCArray &text, UCArray::const_iterator &here,
818 UCArray::const_iterator end) {
819
820 UCArray::const_iterator one_here = text.begin();
821 UCArray::const_iterator one_end = text.end();
822 UCArray::const_iterator two_here = here;
823
824 // iterate over the length of the first string, comparing each element to
825 // the corresponding element in the second string.
826 while (one_here != one_end) {
827
828 if (two_here == end) {
829 return false;
830 } else if (*one_here != *two_here) {
831 return false;
832 }
833 one_here++;
834 two_here++;
835 }
836
837 here = two_here;
838 return true;
839}
840
841void phindaction::print_documents(bool XMLmode, const text_t &basepath,
842 const text_t &collection,
843 const vector <unsigned long> &docNums,
844 const vector <unsigned long> &docFreq,
845 unsigned long first, unsigned long last,
846 displayclass &disp, outconvertclass &outconvert,
847 ostream &textout) {
848
849 // Create a TextData object to read the document data
850 TextData docdata;
851
852 text_t fullpath = filename_cat(basepath, "docs");
853 char *fullpathc = fullpath.getcstr();
854#if defined __WIN32__
855 char *base = "";
856#else
857 char *base = "/";
858#endif
859
860 if (!docdata.LoadData (base, fullpathc)) {
861 // FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
862 exit (0);
863 }
864
865 delete fullpathc;
866
867 UCArray title, hash;
868 unsigned long freq, doc;
869
870 for (unsigned long d = first; d < last; d++) {
871 doc = docNums[d];
872 freq = docFreq[d];
873
874 get_document_all_data(docdata, doc, title, hash);
875
876 if (XMLmode) {
877 textout << "<document num=\"" << d
878 << "\" hash=\"" << hash
879 << "\" freq=\"" << freq
880 << "\" title=\"" << title << "\"/>\n";
881 } else {
882 textout << outconvert << disp
883 << "<tr valign=top><td><a href=\"_gwcgi_?"
884 << "c=" << collection;
885 textout << "&a=d&d=" << hash << "\">" << title << "</a>"
886 << "</td><td>" << freq << "</td></tr>\n";
887 }
888 }
889
890 docdata.UnloadData();
891}
892
893// Get all the data about a docment
894//
895// The document's details are stored in docData as record docNum.
896// We retrieve:
897// title - the document's title
898// hash - the document's unique OID
899void phindaction::get_document_all_data(TextData &docdata, unsigned long docNum,
900 UCArray &title, UCArray &hash) {
901
902 UCArray text;
903 UCArray docLevel;
904 SetCStr(docLevel, "Document");
905
906 // Look the word up in the textData
907 if (!GetDocText (docdata, docLevel, docNum, text)) {
908 // FatalError (1, "Error while trying to get document %u", docNum);
909 exit (0);
910 }
911
912 // Ignore everything up to the first colon
913 UCArray::iterator next = text.begin();
914 while (*next++ != '\t');
915
916 // Get the document OID (hash)
917 hash.clear();
918 for (; *next != '\t'; next++) {
919 hash.push_back(*next);
920 }
921
922 // Get the title
923 text.push_back('\n');
924 title.clear();
925 for (next++; *next != '\n'; next++) {
926 title.push_back(*next);
927 }
928}
929
930void phindaction::toUCArray(const text_t &in, UCArray &out) {
931 out.clear();
932 text_t::const_iterator here = in.begin();
933 text_t::const_iterator end = in.end();
934 while (here != end) {
935 out.push_back((unsigned char) *here);
936 here++;
937 }
938}
939
940void phindaction::output_error (const text_t &message, ostream &textout,
941 outconvertclass &outconvert,
942 displayclass & disp, ostream &logout,
943 bool XMLmode) {
944
945 logout << outconvert << message << "\n";
946 if (XMLmode) {
947 textout << outconvert
948 << "<phinddata>\n"
949 << "<phinderror>" << message << "</phinderror>\n"
950 << "</phinddata>\n";
951 } else {
952 textout << outconvert << disp
953 << "_header_\n"
954 << message
955 << "_footer_\n";
956 }
957}
Note: See TracBrowser for help on using the repository browser.