source: trunk/gsdl/src/phind/host/phindcgi.cpp@ 2487

Last change on this file since 2487 was 2487, checked in by sjboddie, 23 years ago

Changes to get phind working under windows

  • Property svn:keywords set to Author Date Id Revision
File size: 29.7 KB
Line 
1/**********************************************************************
2 *
3 * phindcgi.cpp -- cgi program to serve phind phrase hierarchies
4 *
5 * Copyright 2000 Gordon W. Paynter
6 * Copyright 2000 The New Zealand Digital Library Project
7 *
8 *
9 * A component of the Greenstone digital library software
10 * from the New Zealand Digital Library Project at the
11 * University of Waikato, New Zealand.
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26 *
27 *********************************************************************/
28
29/*
30 * phindcgi.cpp
31 *
32 * The program itself reads a request for a phrase's data from the
33 * QUERY_STRING variable, looks up the phrase (if necessary) in the MGPP
34 * pword database, then looks up the phrase's characteristics in the MGPP
35 * pdata database, and reports output to STDOUT as crude HTML or XML.
36 *
37 */
38
39#if defined(GSDL_USE_IOS_H)
40# include <fstream.h>
41# include <iostream.h>
42#else
43# include <fstream>
44# include <iostream>
45#endif
46
47#include <stdlib.h>
48#include <stdio.h>
49#include <assert.h>
50
51#if defined(GSDL_USE_STL_H)
52# if defined(GSDL_USE_ALGO_H)
53# include <algo.h>
54# else
55# include <algorithm.h>
56# endif
57# include <vector.h>
58#else
59# include <algorithm>
60# include <vector>
61#endif
62
63// Include MGPP functionality.
64#include <TextGet.h>
65#include <MGQuery.h>
66#include <Terms.h>
67#include <messages.h>
68#include <GSDLQueryParser.h>
69
70// Include GSDL's text_t object, which makes parsing cgi arguments easier.
71#include "text_t.h"
72#include "fileutil.h"
73// Note that GSDL stores strings as text_t objects (vectors of 16-bit short int),
74// while MGPP stores strings as UCArray objects (vectors of 8-bit unsigned char).
75
76
77
78void get_gsdlsite_parameters(char *&gsdlhome);
79
80void get_cgi_parameters(char *&collection, char *&classifier,
81 unsigned long &phrasenumber, UCArray &phrasetext,
82 unsigned long &first_e, unsigned long &last_e,
83 unsigned long &first_l, unsigned long &last_l,
84 unsigned long &first_d, unsigned long &last_d,
85 bool &XMLmode);
86
87void decode_cgi_arg (text_t &argstr);
88
89void print_expansions(char *cgi_script, char *collection, bool XMLmode, UCArray body,
90 TextData &textdata, vector <unsigned long> elist,
91 unsigned long first, unsigned long last);
92
93void print_thesaurus_links(char *cgi_script, char *collection,
94 bool XMLmode, UCArray body, TextData &textdata,
95 vector <unsigned long> &linkdest,
96 vector <UCArray> &linktype,
97 unsigned long first, unsigned long last);
98
99void print_documents(bool XMLmode, char *basepath, char *cgi_script,
100 char *collection,
101 vector <unsigned long> docNums,
102 vector <unsigned long> docFreq,
103 unsigned long first, unsigned long last);
104
105void find_phrase_number_from_word(char *basepath, UCArray &query, DocNumArray &result);
106
107void get_phrase_freq_data(TextData &textdata, unsigned long phrase,
108 UCArray &word, unsigned long &tf,
109 unsigned long &ef, unsigned long &df);
110
111void get_phrase_all_data(TextData &textdata, unsigned long phrase,
112 UCArray &word,
113 unsigned long &tf, unsigned long &ef,
114 unsigned long &lf, unsigned long &df,
115 vector <unsigned long> &el,
116 vector <unsigned long> &linkdest,
117 vector <UCArray> &linktype,
118 vector <unsigned long> &docnum,
119 vector <unsigned long> &docfrq);
120
121void split_phrase(UCArray word, UCArray body, UCArray &prefix, UCArray &suffix);
122bool phrase_match(UCArray text, UCArray::iterator &here, UCArray::iterator end);
123
124void get_document_all_data(TextData &docdata, unsigned long docNum,
125 UCArray &title, UCArray &hash);
126
127void cgi_error(bool XMLmode, char *message);
128
129void toUCArray(text_t &in, UCArray &out);
130unsigned long toLongInt(text_t &value);
131
132
133
134int main (int argc, char * argv[]) {
135
136
137 // the phrase to expand
138 unsigned long phrase = 0;
139 UCArray word;
140
141 // the frequency and occurances of the phrase
142 unsigned long tf;
143 vector <unsigned long> el, linkdest, docNums, docfreq;
144 vector <UCArray> linktype;
145
146 // the number of occurances to display
147 unsigned long ef, first_e, last_e, count_e,
148 lf, first_l, last_l, count_l,
149 df, first_d, last_d, count_d;
150
151 // are we in XML mode (as opposed to HTML mode)
152 bool XMLmode = false;
153
154 // Read the gsdlsite.cfg file
155 char *gsdlhome = NULL;
156 get_gsdlsite_parameters(gsdlhome);
157
158 if (gsdlhome == NULL) {
159 cgi_error(XMLmode, "GSDLHOME not set in gsdlsite.cfg file.");
160 }
161
162 // Get command-line parameters
163 char *collection = NULL;
164 char *classifier = NULL;
165 text_tmap param;
166 get_cgi_parameters(collection, classifier, phrase, word,
167 first_e, last_e, first_l, last_l, first_d, last_d, XMLmode);
168
169 if (collection == NULL) {
170 cgi_error(XMLmode, "No collection");
171 }
172
173 text_t basepathstr = filename_cat(gsdlhome, "collect", collection,
174 "index", text_t("phind") + classifier);
175
176 char *basepath = basepathstr.getcstr();
177
178 // If we don't know the phrase number, look itup
179 if (phrase == 0) {
180
181 if (word.empty()) {
182 cgi_error(XMLmode, "No phrase number or word.");
183 }
184
185 DocNumArray result;
186 find_phrase_number_from_word(basepath, word, result);
187
188 if (result.empty()) {
189 cgi_error(XMLmode, "The search term does not occur in the collection.");
190 exit(0);
191 } else {
192 phrase = result[0];
193 }
194 }
195
196 // Create a TextData object to read the phrase data (pdata)
197 TextData textdata;
198
199 text_t fullpath = filename_cat(basepath, "pdata");
200 char *fullpathc = fullpath.getcstr();
201#if defined __WIN32__
202 char *base = "";
203#else
204 char *base = "/";
205#endif
206
207 if (!textdata.LoadData (base, fullpathc)) {
208 FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
209 }
210
211 delete fullpathc;
212
213 get_phrase_all_data(textdata, phrase, word, tf, ef, lf, df, el,
214 linkdest, linktype, docNums, docfreq);
215
216
217 // Output the header
218 if (XMLmode) {
219 cout << "Content-type: text/plain" << endl << endl
220 << "<phinddata id=\"" << phrase
221 << "\" text=\"" << word
222 << "\" tf=\"" << tf
223 << "\" ef=\"" << ef
224 << "\" df=\"" << df
225 << "\" lf=\"" << lf
226 << "\">" << endl;
227 } else {
228 cout << "Content-type: text/html" << endl << endl
229 << "<html><head><title>" << word << "</title></head>" << endl
230 << "<body><center>" << endl
231 << "<p><h1>" << word << "</h1>" << endl
232 << "<p><b>"<< word << "</b> occurs "
233 << tf << " times in " << df << " documents" << endl;
234 }
235
236
237 // Output the thesaurus links
238 if ((lf > 0) && (first_l < last_l)) {
239
240 // figure out the number of phrases to output
241 if (last_l > lf) {
242 last_l = lf;
243 }
244 count_l = last_l - first_l;
245
246 if (XMLmode) {
247 cout << "<thesauruslist length=\"" << lf
248 << "\" start=\"" << first_l
249 << "\" end=\"" << last_l << "\">" << endl;
250 print_thesaurus_links(argv[0], collection, XMLmode, word, textdata,
251 linkdest, linktype, first_l, last_l);
252 cout << "</thesauruslist>" << endl;
253 }
254
255 // output links as HTML
256 else {
257 if (count_l == lf) {
258 cout << "<p><b> " << count_l << " thesaurus links</b>" << endl;
259 } else {
260 cout << "<p><b>" << count_l << " of " << lf << " thesaurus links</b>" << endl;
261 }
262
263 cout << "<p><table border=1><tr><th>type</th><th>topic</th><th>freq</th><th>docs</th></tr>" << endl;
264 print_thesaurus_links(argv[0], collection, XMLmode, word, textdata,
265 linkdest, linktype, first_l, last_l);
266
267 cout << "</table>" << endl;
268
269 if (last_l < lf) {
270 if ((last_l + 10) < lf) {
271 cout << "<br><a href='" << argv[0]
272 << "?c=" << collection
273 << "&n=" << phrase
274 << "&e=" << first_e
275 << "&f=" << last_e
276 << "&h=" << first_d
277 << "&i=" << last_d
278 << "&k=" << first_l
279 << "&l=" << (last_l + 10)
280 << "'>Get more thesaurus links</a>"
281 << endl;
282 }
283 cout << "<br><a href='" << argv[0]
284 << "?c=" << collection
285 << "&n=" << phrase
286 << "&e=" << first_e
287 << "&f=" << last_e
288 << "&h=" << first_d
289 << "&i=" << last_d
290 << "&k=" << first_l
291 << "&l=" << lf
292 << "'>Get every thesaurus link</a>"
293 << endl;
294 }
295 }
296
297 }
298
299 // Output the expansions
300 if ((ef > 0) && (first_e < last_e)) {
301
302 // figure out the number of phrases to output
303 if (last_e > el.size()) {
304 last_e = el.size();
305 }
306 count_e = last_e - first_e;
307
308 // output expansions as XML
309 if (XMLmode) {
310 cout << "<expansionlist length=\"" << ef
311 << "\" start=\"" << first_e
312 << "\" end=\"" << last_e << "\">" << endl;
313
314 print_expansions(argv[0], collection, XMLmode, word, textdata, el, first_e, last_e);
315
316 cout << "</expansionlist>" << endl;
317 }
318
319 // output expansions as HTML
320 else {
321 if (count_e == el.size()) {
322 cout << "<p><b> " << count_e << " expansions</b>" << endl;
323 } else {
324 cout << "<p><b>" << count_e << " of " << ef << " expansions</b>" << endl;
325 }
326
327 cout << "<p><table border=1><tr><th colspan=3>phrase</th><th>freq</th><th>docs</th></tr>" << endl;
328 print_expansions(argv[0], collection, XMLmode, word, textdata, el, first_e, last_e);
329 cout << "</table>" << endl;
330
331 if (last_e < ef) {
332 if ((last_e + 10) < ef) {
333 cout << "<br><a href='" << argv[0]
334 << "?c=" << collection
335 << "&n=" << phrase
336 << "&e=" << first_e
337 << "&f=" << (last_e + 10)
338 << "&h=" << first_d
339 << "&i=" << last_d
340 << "&k=" << first_l
341 << "&l=" << last_l
342 << "'>Get more expansions</a>"
343 << endl;
344 }
345 cout << "<br><a href='" << argv[0]
346 << "?c=" << collection
347 << "&n=" << phrase
348 << "&e=" << first_e
349 << "&f=" << ef
350 << "&h=" << first_d
351 << "&i=" << last_d
352 << "&k=" << first_l
353 << "&l=" << last_l
354 << "'>Get every expansion</a>"
355 << endl;
356 }
357 }
358 }
359
360 // Output the document occurances
361 if ((df > 0) && (first_d < last_d)) {
362
363 // figure out the phrases to output
364 if (last_d > docNums.size()) {
365 last_d = docNums.size();
366 }
367 count_d = last_d - first_d;
368
369 // output document list as XML
370 if (XMLmode) {
371 cout << "<documentlist length=\"" << df
372 << "\" start=\"" << first_d
373 << "\" end=\"" << last_d << "\">" << endl;
374
375 print_documents(XMLmode, basepath, "library", collection,
376 docNums, docfreq, first_d, last_d);
377
378 cout << "</documentlist>" << endl;
379 }
380
381 // output document list as HTML
382 else {
383
384 if (count_d == docNums.size()) {
385 cout << "<p><b> " << count_d << " documents</b>" << endl;
386 } else {
387 cout << "<p><b>" << count_d << " of " << df << " documents</b>" << endl;
388 }
389
390 cout << "<p><table border=1><tr><th align=left>document</th><th>freq</th></tr>" << endl;
391 print_documents(XMLmode, basepath, "library", collection,
392 docNums, docfreq, first_d, last_d);
393 cout << "</table>" << endl;
394
395 if (last_d < df) {
396 if ((last_d + 10) < df) {
397 cout << "<br><a href='" << argv[0]
398 << "?c=" << collection
399 << "&n=" << phrase
400 << "&e=" << first_e
401 << "&f=" << last_e
402 << "&h=" << first_d
403 << "&i=" << (last_d + 10)
404 << "&k=" << first_l
405 << "&l=" << last_l
406 << "'>Get more documents</a>" << endl;
407 }
408 cout << "<br><a href='" << argv[0]
409 << "?c=" << collection
410 << "&n=" << phrase
411 << "&e=" << first_e
412 << "&f=" << last_e
413 << "&h=" << first_d
414 << "&i=" << df
415 << "&k=" << first_l
416 << "&l=" << last_l
417 << "'>Get every document</a>" << endl;
418 }
419 }
420 }
421
422 // Close the document
423 if (XMLmode) {
424 cout << "</phinddata>" << endl;
425 } else {
426 cout << "</center></body></html>" << endl;
427 }
428
429 textdata.UnloadData ();
430
431 delete basepath;
432
433 return 0;
434}
435
436
437// Print a list of expansions
438//
439// Given the textData and a list of phrase numbers, print out each of the
440// expansions.
441
442void print_expansions(char *cgi_script, char *collection, bool XMLmode, UCArray body,
443 TextData &textdata, vector <unsigned long> elist,
444 unsigned long first, unsigned long last) {
445
446 UCArray word;
447 unsigned long phrase, tf, df, ef;
448
449 UCArray suffix, prefix;
450
451 for (unsigned long e = first; e < last; e++) {
452
453 phrase = elist[e];
454 get_phrase_freq_data(textdata, phrase, word, tf, ef, df);
455
456 split_phrase(word, body, prefix, suffix);
457
458 if (XMLmode) {
459 // body is always the same as the text of the phrase, so no need to send it
460 cout << "<expansion num=\"" << e
461 << "\" id=\"" << phrase
462 << "\" tf=\"" << tf
463 << "\" df=\"" << df;
464 if (!prefix.empty()) {
465 cout << "\" prefix=\"" << prefix;
466 }
467 if (!suffix.empty()) {
468 cout << "\" suffix=\"" << suffix;
469 }
470 cout << "\"/>" << endl;
471 } else {
472 cout << "<tr valign=top><td align=right><a href='" << cgi_script
473 << "?c=" << collection << "&n=" << phrase << "'>" << prefix << "</a></td>"
474 << "<td align=center><a href='" << cgi_script
475 << "?c=" << collection << "&n=" << phrase << "'>" << body << "</a></td>"
476 << "<td align=left><a href='" << cgi_script
477 << "?c=" << collection << "&n=" << phrase << "'>" << suffix << "</a></td>"
478 << "<td>" << tf << "</td><td>" << df << "</td></tr>" << endl;
479 }
480 }
481}
482
483void print_thesaurus_links(char *cgi_script, char *collection,
484 bool XMLmode, UCArray body, TextData &textdata,
485 vector <unsigned long> &linkdest,
486 vector <UCArray> &linktype,
487 unsigned long first, unsigned long last) {
488
489 // information describing each link in the list
490 unsigned long phrase, tf, ef, df;
491 UCArray type, text, newbody, suffix, prefix;
492
493 for (unsigned long l = first; l < last; l++) {
494
495 // get the phrase data
496 phrase = linkdest[l];
497 type = linktype[l];
498 get_phrase_freq_data(textdata, phrase, text, tf, ef, df);
499 // split_phrase(text, newbody, prefix, suffix);
500
501 if (XMLmode) {
502 cout << "<thesaurus num=\"" << l
503 << "\" id=\"" << phrase
504 << "\" tf=\"" << tf
505 << "\" df=\"" << df
506 << "\" type=\"" << type
507 << "\" text=\"" << text
508 << "\"/>" << endl;
509 } else {
510 cout << "<tr valign=top><td>" << type << "</td><td>"
511 << "<a href='" << cgi_script << "?c=" << collection
512 << "&n=" << phrase << "'>" << text << "</a>"
513 << "</td><td>" << tf << "</td><td>" << df << "</td></tr>" << endl;
514 }
515 }
516}
517
518
519void print_documents(bool XMLmode, char *basepath, char *cgi_script, char *collection,
520 vector <unsigned long> docNums, vector <unsigned long> docFreq,
521 unsigned long first, unsigned long last) {
522
523 // _chdir(basepath);
524
525 // Create a TextData object to read the document data
526 TextData docdata;
527
528 text_t fullpath = filename_cat(basepath, "docs");
529 char *fullpathc = fullpath.getcstr();
530#if defined __WIN32__
531 char *base = "";
532#else
533 char *base = "/";
534#endif
535
536 if (!docdata.LoadData (base, fullpathc)) {
537 FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
538 }
539
540 delete fullpathc;
541
542 UCArray title, hash;
543 unsigned long freq, doc;
544
545 for (unsigned long d = first; d < last; d++) {
546 doc = docNums[d];
547 freq = docFreq[d];
548
549 get_document_all_data(docdata, doc, title, hash);
550
551 if (XMLmode) {
552 cout << "<document num=\"" << d
553 << "\" hash=\"" << hash
554 << "\" freq=\"" << freq
555 << "\" title=\"" << title << "\"/>" << endl;
556 } else {
557 cout << "<tr valign=top><td><a href='" << cgi_script << "?c=" << collection
558 << "&a=d&d=" << hash << "'>" << title << "</a>"
559 << "</td><td>" << freq << "</td></tr>"
560 << endl;
561 }
562 }
563}
564
565
566
567// Get the frequency data about a phrase
568//
569// The phrase is stored in textData as record phrase.
570// We retrieve:
571// word - the text of the phrase
572// tf - the total frequency of the phrase
573// ef - the expansion frequency of the phrase
574// df - the document frequency of the phrase
575
576void get_phrase_freq_data(TextData &textdata, unsigned long phrase,
577 UCArray &word, unsigned long &tf,
578 unsigned long &ef, unsigned long &df) {
579
580 UCArray text;
581 UCArray docLevel;
582 SetCStr(docLevel, "Document");
583
584 // Look the word up in the textData
585 if (!GetDocText (textdata, docLevel, phrase, text)) {
586 FatalError (1, "Error while trying to get phrase %u", phrase);
587 }
588
589 // Ignore everything up to the first colon
590 UCArray::iterator next = text.begin();
591 while (*next++ != ':');
592
593 // Get the word
594 word.clear();
595 for (; *next != ':'; next++) {
596 word.push_back(*next);
597 }
598
599 // Get total frequency
600 tf = 0;
601 for (next++; *next != ':'; next++) {
602 tf *= 10;
603 tf += (*next - '0');
604 }
605
606 // Get expansion frequency
607 ef = 0;
608 for (next++; *next != ':'; next++) {
609 ef *= 10;
610 ef += (*next - '0');
611 }
612
613 // Get document frequency
614 df = 0;
615 for (next++; *next != ':'; next++) {
616 df *= 10;
617 df += (*next - '0');
618 }
619}
620
621// Get all the data about a phrase
622//
623// The phrase is stored in textData as record phrase.
624// We retrieve:
625// word - the text of the phrase
626// tf - the total frequency of the phrase
627// ef - the expansion frequency of the phrase
628// lf - the thesaurus link frequency of the phrase
629// df - the document frequency of the phrase
630// el - the list of phrases that are expansions of phrase
631// ll - the list of phrases that are thesaurus links
632// dl - the list of documents that contain phrase
633
634void get_phrase_all_data(TextData &textdata, unsigned long phrase,
635 UCArray &word,
636 unsigned long &tf, unsigned long &ef,
637 unsigned long &lf, unsigned long &df,
638 vector <unsigned long> &el,
639 vector <unsigned long> &linkdest,
640 vector <UCArray> &linktype,
641 vector <unsigned long> &docnum,
642 vector <unsigned long> &docfrq) {
643 UCArray text;
644 UCArray docLevel;
645 SetCStr(docLevel, "Document");
646
647 // Look thwe word up in the textData
648 if (!GetDocText (textdata, docLevel, phrase, text)) {
649 FatalError (1, "Error while trying to get phrase %u", phrase);
650 }
651
652 // Ignore everything up to the first colon
653 UCArray::iterator next = text.begin();
654 while (*next++ != ':');
655
656 // ignore training cariage returns
657 while (text.back() == '\n') {
658 text.pop_back();
659 }
660
661 // Get the word
662 word.clear();
663 for (; *next != ':'; next++) {
664 word.push_back(*next);
665 }
666
667 // Get total frequency
668 tf = 0;
669 for (next++; *next != ':'; next++) {
670 tf *= 10;
671 tf += (*next - '0');
672 }
673
674 // Get expansion frequency
675 ef = 0;
676 for (next++; *next != ':'; next++) {
677 ef *= 10;
678 ef += (*next - '0');
679 }
680
681 // Get document frequency
682 df = 0;
683 for (next++; *next != ':'; next++) {
684 df *= 10;
685 df += (*next - '0');
686 }
687
688 // Get expansion list
689 el.clear();
690 unsigned long e = 0;
691 for (next++; *next != ':'; next++) {
692 if (*next == ',') {
693 el.push_back(e);
694 e = 0;
695 } else {
696 e *= 10;
697 e += (*next - '0');
698 }
699 }
700
701 // Get document list & the document frequency list
702 docnum.clear();
703 docfrq.clear();
704 bool readnum = false;
705 unsigned long d = 0;
706 for (next++; *next != ':'; next++) {
707 if (*next == ',') {
708 docnum.push_back(d);
709 readnum = true;
710 d = 0;
711 } else if (*next == ';') {
712 if (readnum) {
713 docfrq.push_back(d);
714 } else {
715 docnum.push_back(d);
716 docfrq.push_back(1);
717 }
718 readnum = false;
719 d = 0;
720 } else {
721 d *= 10;
722 d += (*next - '0');
723 }
724 }
725
726 // Get thesaurus link frequency & link list
727 text.push_back(':');
728 text.push_back(':');
729
730 // link frequency
731 lf = 0;
732 for (next++; *next != ':'; next++) {
733 lf *= 10;
734 lf += (*next - '0');
735 }
736
737 // two lists of link data
738 linkdest.clear();
739 linktype.clear();
740
741 UCArray thistype;
742 thistype.clear();
743 bool typedone = false;
744 unsigned long l = 0;
745 for (next++; *next != ':'; next++) {
746
747 if (!typedone) {
748 // first read the link type, a charactor string
749 if (*next == ',') {
750 typedone = true;
751 } else {
752 thistype.push_back(*next);
753 }
754 } else {
755 // having read the link type, read the list of link destinations
756 if (*next == ',') {
757 linkdest.push_back(l);
758 linktype.push_back(thistype);
759 l = 0;
760 } else if (*next == ';') {
761 linkdest.push_back(l);
762 linktype.push_back(thistype);
763 l = 0;
764 thistype.clear();
765 typedone = false;
766 } else {
767 l *= 10;
768 l += (*next - '0');
769 }
770 }
771 }
772}
773
774// Get all the data about a docment
775//
776// The document's detailes are stored in docData as record docNum.
777// We retrieve:
778// title - the document's title
779// hash - the documnt's unique OID
780
781void get_document_all_data(TextData &docdata, unsigned long docNum,
782 UCArray &title, UCArray &hash) {
783
784 UCArray text;
785 UCArray docLevel;
786 SetCStr(docLevel, "Document");
787
788 // Look the word up in the textData
789 if (!GetDocText (docdata, docLevel, docNum, text)) {
790 FatalError (1, "Error while trying to get document %u", docNum);
791 }
792
793 // Ignore everything up to the first colon
794 UCArray::iterator next = text.begin();
795 while (*next++ != '\t');
796
797 // Get the document OID (hash)
798 hash.clear();
799 for (; *next != '\t'; next++) {
800 hash.push_back(*next);
801 }
802
803 // Get the title
804 text.push_back('\n');
805 title.clear();
806 for (next++; *next != '\n'; next++) {
807 title.push_back(*next);
808 }
809}
810
811
// Read the gsdlhome setting from gsdlsite.cfg in the current directory.
//
// Each line that starts with "gsdlhome" supplies a value: the rest of the
// line after any spaces or tabs.  If the setting appears more than once
// the last one wins.  On success gsdlhome points to a NUL-terminated
// new[] copy of the value; if no such line exists gsdlhome is left
// untouched.  Exits with status 1 if the file cannot be opened.

void get_gsdlsite_parameters(char *&gsdlhome) {

  // open the file
  ifstream gsdl("gsdlsite.cfg", ios::in);
  if (!gsdl) {
    cerr << "File gsdlsite.cfg could not be opened\n";
    exit(1);
  }

  // Read each line of the file.  Testing the result of getline (rather
  // than eof()) ends the loop when a line overflows the buffer instead of
  // spinning forever on a failed stream.
  char buffer[2000];
  char *found = NULL;
  while (gsdl.getline(buffer, sizeof(buffer), '\n')) {

    // only the gsdlhome variable is of interest
    if (strncmp(buffer, "gsdlhome", 8) != 0) {
      continue;
    }

    // find the start of the gsdlhome string
    size_t len = strlen(buffer);
    size_t i = 8;
    while (i < len && (buffer[i] == ' ' || buffer[i] == '\t')) {
      i++;
    }

    // store the gsdlhome string, with room for the terminating NUL
    delete [] found;
    found = new char[len - i + 1];
    strncpy(found, &(buffer[i]), len - i);
    found[len - i] = '\0';
  }

  if (found != NULL) {
    gsdlhome = found;
  }
}
841
842void get_cgi_parameters(char *&collection, char *&classifier,
843 unsigned long &phrasenumber, UCArray &phrasetext,
844 unsigned long &first_e, unsigned long &last_e,
845 unsigned long &first_l, unsigned long &last_l,
846 unsigned long &first_d, unsigned long &last_d,
847 bool &XMLmode) {
848
849
850 // set the default parameters
851 phrasenumber = 0;
852 phrasetext.clear();
853 first_e = 0;
854 last_e = 10;
855 first_l = 0;
856 last_l = 10;
857 first_d = 0;
858 last_d = 10;
859
860 // get the query string
861 char *request_method_str = getenv("REQUEST_METHOD");
862 char *query_string = getenv("QUERY_STRING");
863 text_t query;
864
865 if (request_method_str != NULL
866 && (strcmp(request_method_str, "GET") == 0)
867 && query_string != NULL) {
868 // GET cgi args from querystring
869 query = query_string;
870
871 } else {
872 // debugging from command line
873 cout << "? " << endl;
874 char query_input[1024];
875 cin.get(query_input, 1024, '\n');
876 query = query_input;
877 }
878
879 // extract out the key=value pairs
880 text_t::iterator here = query.begin();
881 text_t::iterator end = query.end();
882 text_t key, value;
883
884 while (here != end) {
885 // get the next key and value pair
886 here = getdelimitstr (here, end, '=', key);
887 here = getdelimitstr (here, end, '&', value);
888
889 // store this key=value pair
890 if (!key.empty() && !value.empty()) {
891
892 // c: the collection name
893 if (key[0] == 'c') {
894 UCArray tmp;
895 toUCArray(value, tmp);
896 collection = GetCStr(tmp);
897 }
898
899 // d: the classifier number as string
900 if (key[0] == 'd') {
901 UCArray tmp;
902 toUCArray(value, tmp);
903 classifier = GetCStr(tmp);
904 }
905
906 // e: the first expansion number
907 else if (key[0] == 'e') {
908 first_e = toLongInt(value);
909 }
910
911 // f: the last expansion number
912 else if (key[0] == 'f') {
913 last_e = toLongInt(value);
914 }
915
916 // h: the first document number
917 else if (key[0] == 'h') {
918 first_d = toLongInt(value);
919 }
920
921 // i: the last document number
922 else if (key[0] == 'i') {
923 last_d = toLongInt(value);
924 }
925
926 // k: the first thesaurus list number
927 else if (key[0] == 'k') {
928 first_l = toLongInt(value);
929 }
930
931 // l: the last thesaurus list number
932 else if (key[0] == 'l') {
933 last_l = toLongInt(value);
934 }
935
936 // n: the phrase number
937 else if (key[0] == 'n') {
938 phrasenumber = toLongInt(value);
939 }
940
941 // p: the phrase text
942 else if (key[0] == 'p') {
943 decode_cgi_arg(value);
944 toUCArray(value, phrasetext);
945 }
946
947 // x: XML mode
948 else if (key[0] == 'x') {
949 XMLmode = true;
950 }
951
952 }
953 }
954
955 // if no classifier number is supplied, default to 1.
956 if (classifier == NULL) {
957 classifier = new char[2];
958 strcpy(classifier, "1");
959 }
960}
961
962
963// Convert %xx and + to their appropriate equivalents
964//
965// This function was copied from %GSDLHOME/src/recpt/cgiutils.cpp
966// because it was much easier to copy it than to link against it.
967
// Return the value (0-15) of the hexadecimal digit c, accepting either
// letter case.  Any other character is returned unchanged.
static unsigned short hexdigit (unsigned short c) {
  if ('0' <= c && c <= '9') {
    return c - '0';
  }

  // Setting bit 0x20 maps 'A'..'F' onto 'a'..'f' and leaves 'a'..'f'
  // alone; no other value lands in that range.
  const unsigned short folded = c | 0x20;
  if ('a' <= folded && folded <= 'f') {
    return folded - 'a' + 10;
  }

  return c;
}
974
975void decode_cgi_arg (text_t &argstr) {
976 text_t::iterator in = argstr.begin();
977 text_t::iterator out = in;
978 text_t::iterator end = argstr.end();
979
980 while (in != end) {
981 if (*in == '+') *out = ' ';
982
983 else if (*in == '%') {
984 unsigned short c = '%';
985 in++;
986 if (in != end) {
987 c = hexdigit (*in);
988 in++;
989 }
990 if (in != end && c < 16) { // sanity check on the previous character
991 c = c*16 + hexdigit (*in);
992 }
993
994 *out = c;
995 } else *out = *in;
996
997 if (in != end) in++;
998 out++;
999 }
1000
1001 // remove the excess characters
1002 argstr.erase (out, end);
1003}
1004
1005
1006// Find the phrase number of a word in the index file
1007
1008void find_phrase_number_from_word(char *basepath, UCArray &query, DocNumArray &result) {
1009
1010 // Open the index file for searching
1011 IndexData indexData;
1012
1013 text_t fullpath = filename_cat(basepath, "pword");
1014 char *fullpathc = fullpath.getcstr();
1015#if defined __WIN32__
1016 char *base = "";
1017#else
1018 char *base = "/";
1019#endif
1020
1021 if (!indexData.LoadData (base, fullpathc)) {
1022 FatalError (1, "Couldn't load index information for \"%s\"", fullpathc);
1023 }
1024
1025 delete fullpathc;
1026
1027 // set up the query object
1028 QueryInfo queryInfo;
1029 SetCStr (queryInfo.docLevel, "Document");
1030 queryInfo.maxDocs = 5;
1031 queryInfo.sortByRank = true;
1032 queryInfo.exactWeights = false;
1033 queryInfo.needRankInfo = true;
1034 queryInfo.needTermFreqs = true;
1035
1036 // mode 1 = casefolded, unstemmed search
1037 QueryNode *queryTree = ParseQuery(query, 1, 1);
1038
1039 // cout << "-- query --" << endl;
1040 // PrintNode (cout, queryTree);
1041
1042 // perform the query
1043 ExtQueryResult queryResult;
1044 MGQuery (indexData, queryInfo, queryTree, queryResult);
1045 // cout << "-- word lookup result -- " << endl << queryResult << endl ;
1046
1047 result.clear();
1048 result = queryResult.docs;
1049
1050 // delete the query
1051 if (queryTree != NULL) delete queryTree;
1052}
1053
1054
1055
1056
// cgi_error
//
// Called when the request cannot be processed: writes a minimal error
// document containing message (XML or HTML, according to XMLmode) to
// standard output and ends the program with exit(0).  Does not return.

void cgi_error(bool XMLmode, char *message) {

  if (!XMLmode) {
    cout << "Content-type: text/html" << endl << endl
         << "<html><head><title>phind error</title></head>" << endl
         << "<body>" << endl
         << "<p><h1>phind error</h1>"
         << "<p> An error occured processing your request: <p><b>"
         << message
         << "</b></body></html>" << endl;
    exit(0);
  }

  cout << "Content-type: text/plain" << endl << endl
       << "<phinddata>" << endl
       << "<phinderror>" << message << "</phinderror>" << endl
       << "</phinddata>" << endl;
  exit(0);
}
1080
1081
1082// split an expansion into prefix and suffix
1083
1084void split_phrase(UCArray word, UCArray body, UCArray &prefix, UCArray &suffix) {
1085
1086 prefix.clear();
1087 suffix.clear();
1088
1089 bool readingPrefix = true;
1090 UCArray::iterator here = word.begin();
1091 UCArray::iterator end = word.end();
1092
1093 while (here != end) {
1094
1095 // if we've not read all the prefix, add the next char to the prefix
1096 if (readingPrefix) {
1097 if (phrase_match(body, here, end)) {
1098 readingPrefix = false;
1099 // trim whitespace from end of prefix & start of suffix
1100 if (!prefix.empty()) {
1101 prefix.pop_back();
1102 }
1103 if ((here != end) && (*here == ' ')) {
1104 here++;
1105 }
1106 } else {
1107 prefix.push_back(*here);
1108 here++;
1109 }
1110 }
1111 // if we've finished with the prefix, update the suffix
1112 else {
1113 suffix.push_back(*here);
1114 here++;
1115 }
1116 }
1117}
1118
1119// phrase_match
1120//
1121// compare two strings, one represented as an UCArray, the other as two
1122// UCArray iterators.
1123//
1124// Return true if the UCArray is the same as the phrase the iterators point
1125// to for the length of the UCArray.
1126
1127bool phrase_match(UCArray text, UCArray::iterator &here, UCArray::iterator end) {
1128
1129 UCArray::iterator one_here = text.begin();
1130 UCArray::iterator one_end = text.end();
1131 UCArray::iterator two_here = here;
1132
1133 // iterate over the length of the first string, comparing each element to
1134 // the corresponding element in the second string.
1135 while (one_here != one_end) {
1136
1137 if (two_here == end) {
1138 return false;
1139 } else if (*one_here != *two_here) {
1140 return false;
1141 }
1142 one_here++;
1143 two_here++;
1144 }
1145
1146 here = two_here;
1147 return true;
1148}
1149
1150
1151// Convert from text_t format
1152//
1153// Conversions from text_t to other types
1154
1155unsigned long toLongInt(text_t &value) {
1156
1157 unsigned long result = 0;
1158
1159 text_t::iterator here = value.begin();
1160 text_t::iterator end = value.end();
1161 while (here != end) {
1162 result *= 10;
1163 result += *here - '0';
1164 here++;
1165 }
1166
1167 return result;
1168}
1169
1170void toUCArray(text_t &in, UCArray &out) {
1171 out.clear();
1172 text_t::iterator here = in.begin();
1173 text_t::iterator end = in.end();
1174 while (here != end) {
1175 out.push_back((unsigned char) *here);
1176 here++;
1177 }
1178}
1179
Note: See TracBrowser for help on using the repository browser.