source: trunk/gsdl/src/phind/host/phindcgi.cpp@ 2062

Last change on this file since 2062 was 2062, checked in by paynter, 23 years ago

Added a function from the receptionist that decodes phrase CGI arguments
from URL-safe format (e.g. %43%41%54 means CAT) so we can do searches on
UTF8 text.

  • Property svn:keywords set to Author Date Id Revision
File size: 29.0 KB
Line 
1/**********************************************************************
2 *
3 * phindcgi.cpp -- cgi program to serve phind phrase hierarchies
4 *
5 * Copyright 2000 Gordon W. Paynter
6 * Copyright 2000 The New Zealand Digital Library Project
7 *
8 *
9 * A component of the Greenstone digital library software
10 * from the New Zealand Digital Library Project at the
11 * University of Waikato, New Zealand.
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26 *
27 *********************************************************************/
28
29/*
30 * phindcgi.cpp
31 *
32 * The program itself reads a request for a phrase's data from the
33 * QUERY_STRING variable, looks up the phrase (if necessary) in the MGPP
34 * pword database, then looks up the phrase's characteristics in the MGPP
35 * pdata database, and reports output to STDOUT as crude HTML or XML.
36 *
37 */
38
39
40#include <iostream.h>
41#include <fstream.h>
42#include <stdlib.h>
43#include <stdio.h>
44#include <assert.h>
45
46#include <vector.h>
47#include <algo.h>
48
49// Include MGPP functionality.
50#include <TextGet.h>
51#include <MGQuery.h>
52#include <Terms.h>
53#include <messages.h>
54#include <GSDLQueryParser.h>
55
56// Include GSDL's text_t object, which makes parsing cgi arguments easier.
57#include <text_t.h>
58// Note that GSDL stores strings as text_t objects (vectors of 16-bit short int),
59// while MGPP stores strings as UCArray objects (vectors of 8-bit unsigned char).
60
61
62
63void get_gsdlsite_parameters(char *&gsdlhome);
64
65void get_cgi_parameters(char *&collection, char *&classifier,
66 unsigned long &phrasenumber, UCArray &phrasetext,
67 unsigned long &first_e, unsigned long &last_e,
68 unsigned long &first_l, unsigned long &last_l,
69 unsigned long &first_d, unsigned long &last_d,
70 bool &XMLmode);
71
72void decode_cgi_arg (text_t &argstr);
73
74void print_expansions(char *cgi_script, char *collection, bool XMLmode, UCArray body,
75 TextData &textdata, vector <unsigned long> elist,
76 unsigned long first, unsigned long last);
77
78void print_thesaurus_links(char *cgi_script, char *collection,
79 bool XMLmode, UCArray body, TextData &textdata,
80 vector <unsigned long> &linkdest,
81 vector <UCArray> &linktype,
82 unsigned long first, unsigned long last);
83
84void print_documents(bool XMLmode, char *basepath, char *cgi_script,
85 char *collection,
86 vector <unsigned long> docNums,
87 vector <unsigned long> docFreq,
88 unsigned long first, unsigned long last);
89
90void find_phrase_number_from_word(char *basepath, UCArray &query, DocNumArray &result);
91
92void get_phrase_freq_data(TextData &textdata, unsigned long phrase,
93 UCArray &word, unsigned long &tf,
94 unsigned long &ef, unsigned long &df);
95
96void get_phrase_all_data(TextData &textdata, unsigned long phrase,
97 UCArray &word,
98 unsigned long &tf, unsigned long &ef,
99 unsigned long &lf, unsigned long &df,
100 vector <unsigned long> &el,
101 vector <unsigned long> &linkdest,
102 vector <UCArray> &linktype,
103 vector <unsigned long> &docnum,
104 vector <unsigned long> &docfrq);
105
106void split_phrase(UCArray word, UCArray body, UCArray &prefix, UCArray &suffix);
107bool phrase_match(UCArray text, UCArray::iterator &here, UCArray::iterator end);
108
109void get_document_all_data(TextData &docdata, unsigned long docNum,
110 UCArray &title, UCArray &hash);
111
112void cgi_error(bool XMLmode, char *message);
113
114void toUCArray(text_t &in, UCArray &out);
115unsigned long toLongInt(text_t &value);
116
117
118
119int main (int argc, char * argv[]) {
120
121
122 // the phrase to expand
123 unsigned long phrase = 0;
124 UCArray word;
125
126 // the frequency and occurances of the phrase
127 unsigned long tf;
128 vector <unsigned long> el, linkdest, docNums, docfreq;
129 vector <UCArray> linktype;
130
131 // the number of occurances to display
132 unsigned long ef, first_e, last_e, count_e,
133 lf, first_l, last_l, count_l,
134 df, first_d, last_d, count_d;
135
136 // are we in XML mode (as opposed to HTML mode)
137 bool XMLmode = false;
138
139 // Read the gsdlsite.cfg file
140 char *gsdlhome = NULL;
141 get_gsdlsite_parameters(gsdlhome);
142
143 if (gsdlhome == NULL) {
144 cgi_error(XMLmode, "GSDLHOME not set in gsdlsite.cfg file.");
145 }
146
147 // Get command-line parameters
148 char *collection = NULL;
149 char *classifier = NULL;
150 text_tmap param;
151 get_cgi_parameters(collection, classifier, phrase, word,
152 first_e, last_e, first_l, last_l, first_d, last_d, XMLmode);
153
154 if (collection == NULL) {
155 cgi_error(XMLmode, "No collection");
156 }
157
158 char basepath[FILENAME_MAX] = "";
159 strcat(basepath, gsdlhome);
160 strcat(basepath, "/collect/");
161 strcat(basepath, collection);
162 strcat(basepath, "/index/phind");
163 strcat(basepath, classifier);
164
165 // If we don't know the phrase number, look itup
166 if (phrase == 0) {
167
168 if (word.empty()) {
169 cgi_error(XMLmode, "No phrase number or word.");
170 }
171
172 DocNumArray result;
173 find_phrase_number_from_word(basepath, word, result);
174
175 if (result.empty()) {
176 cgi_error(XMLmode, "The search term does not occur in the collection.");
177 exit(0);
178 } else {
179 phrase = result[0];
180 }
181 }
182
183 // Create a TextData object to read the phrase data (pdata)
184 TextData textdata;
185 char filename[FILENAME_MAX] = "pdata";
186 if (!textdata.LoadData (basepath, filename)) {
187 FatalError (1, "Couldn't load text information for \"%s\"", filename);
188 }
189 get_phrase_all_data(textdata, phrase, word, tf, ef, lf, df, el,
190 linkdest, linktype, docNums, docfreq);
191
192
193 // Output the header
194 if (XMLmode) {
195 cout << "Content-type: text/plain" << endl << endl
196 << "<phinddata id=\"" << phrase
197 << "\" text=\"" << word
198 << "\" tf=\"" << tf
199 << "\" ef=\"" << ef
200 << "\" df=\"" << df
201 << "\" lf=\"" << lf
202 << "\">" << endl;
203 } else {
204 cout << "Content-type: text/html" << endl << endl
205 << "<html><head><title>" << word << "</title></head>" << endl
206 << "<body><center>" << endl
207 << "<p><h1>" << word << "</h1>" << endl
208 << "<p><b>"<< word << "</b> occurs "
209 << tf << " times in " << df << " documents" << endl;
210 }
211
212
213 // Output the thesaurus links
214 if ((lf > 0) && (first_l < last_l)) {
215
216 // figure out the number of phrases to output
217 if (last_l > lf) {
218 last_l = lf;
219 }
220 count_l = last_l - first_l;
221
222 if (XMLmode) {
223 cout << "<thesauruslist length=\"" << lf
224 << "\" start=\"" << first_l
225 << "\" end=\"" << last_l << "\">" << endl;
226 print_thesaurus_links(argv[0], collection, XMLmode, word, textdata,
227 linkdest, linktype, first_l, last_l);
228 cout << "</thesauruslist>" << endl;
229 }
230
231 // output links as HTML
232 else {
233 if (count_l == lf) {
234 cout << "<p><b> " << count_l << " thesaurus links</b>" << endl;
235 } else {
236 cout << "<p><b>" << count_l << " of " << lf << " thesaurus links</b>" << endl;
237 }
238
239 cout << "<p><table border=1><tr><th>type</th><th>topic</th><th>freq</th><th>docs</th></tr>" << endl;
240 print_thesaurus_links(argv[0], collection, XMLmode, word, textdata,
241 linkdest, linktype, first_l, last_l);
242
243 cout << "</table>" << endl;
244
245 if (last_l < lf) {
246 if ((last_l + 10) < lf) {
247 cout << "<br><a href='" << argv[0]
248 << "?c=" << collection
249 << "&n=" << phrase
250 << "&e=" << first_e
251 << "&f=" << last_e
252 << "&h=" << first_d
253 << "&i=" << last_d
254 << "&k=" << first_l
255 << "&l=" << (last_l + 10)
256 << "'>Get more thesaurus links</a>"
257 << endl;
258 }
259 cout << "<br><a href='" << argv[0]
260 << "?c=" << collection
261 << "&n=" << phrase
262 << "&e=" << first_e
263 << "&f=" << last_e
264 << "&h=" << first_d
265 << "&i=" << last_d
266 << "&k=" << first_l
267 << "&l=" << lf
268 << "'>Get every thesaurus link</a>"
269 << endl;
270 }
271 }
272
273 }
274
275 // Output the expansions
276 if ((ef > 0) && (first_e < last_e)) {
277
278 // figure out the number of phrases to output
279 if (last_e > el.size()) {
280 last_e = el.size();
281 }
282 count_e = last_e - first_e;
283
284 // output expansions as XML
285 if (XMLmode) {
286 cout << "<expansionlist length=\"" << ef
287 << "\" start=\"" << first_e
288 << "\" end=\"" << last_e << "\">" << endl;
289
290 print_expansions(argv[0], collection, XMLmode, word, textdata, el, first_e, last_e);
291
292 cout << "</expansionlist>" << endl;
293 }
294
295 // output expansions as HTML
296 else {
297 if (count_e == el.size()) {
298 cout << "<p><b> " << count_e << " expansions</b>" << endl;
299 } else {
300 cout << "<p><b>" << count_e << " of " << ef << " expansions</b>" << endl;
301 }
302
303 cout << "<p><table border=1><tr><th colspan=3>phrase</th><th>freq</th><th>docs</th></tr>" << endl;
304 print_expansions(argv[0], collection, XMLmode, word, textdata, el, first_e, last_e);
305 cout << "</table>" << endl;
306
307 if (last_e < ef) {
308 if ((last_e + 10) < ef) {
309 cout << "<br><a href='" << argv[0]
310 << "?c=" << collection
311 << "&n=" << phrase
312 << "&e=" << first_e
313 << "&f=" << (last_e + 10)
314 << "&h=" << first_d
315 << "&i=" << last_d
316 << "&k=" << first_l
317 << "&l=" << last_l
318 << "'>Get more expansions</a>"
319 << endl;
320 }
321 cout << "<br><a href='" << argv[0]
322 << "?c=" << collection
323 << "&n=" << phrase
324 << "&e=" << first_e
325 << "&f=" << ef
326 << "&h=" << first_d
327 << "&i=" << last_d
328 << "&k=" << first_l
329 << "&l=" << last_l
330 << "'>Get every expansion</a>"
331 << endl;
332 }
333 }
334 }
335
336 // Output the document occurances
337 if ((df > 0) && (first_d < last_d)) {
338
339 // figure out the phrases to output
340 if (last_d > docNums.size()) {
341 last_d = docNums.size();
342 }
343 count_d = last_d - first_d;
344
345 // output document list as XML
346 if (XMLmode) {
347 cout << "<documentlist length=\"" << df
348 << "\" start=\"" << first_d
349 << "\" end=\"" << last_d << "\">" << endl;
350
351 print_documents(XMLmode, basepath, "library", collection,
352 docNums, docfreq, first_d, last_d);
353
354 cout << "</documentlist>" << endl;
355 }
356
357 // output document list as HTML
358 else {
359
360 if (count_d == docNums.size()) {
361 cout << "<p><b> " << count_d << " documents</b>" << endl;
362 } else {
363 cout << "<p><b>" << count_d << " of " << df << " documents</b>" << endl;
364 }
365
366 cout << "<p><table border=1><tr><th align=left>document</th><th>freq</th></tr>" << endl;
367 print_documents(XMLmode, basepath, "library", collection,
368 docNums, docfreq, first_d, last_d);
369 cout << "</table>" << endl;
370
371 if (last_d < df) {
372 if ((last_d + 10) < df) {
373 cout << "<br><a href='" << argv[0]
374 << "?c=" << collection
375 << "&n=" << phrase
376 << "&e=" << first_e
377 << "&f=" << last_e
378 << "&h=" << first_d
379 << "&i=" << (last_d + 10)
380 << "&k=" << first_l
381 << "&l=" << last_l
382 << "'>Get more documents</a>" << endl;
383 }
384 cout << "<br><a href='" << argv[0]
385 << "?c=" << collection
386 << "&n=" << phrase
387 << "&e=" << first_e
388 << "&f=" << last_e
389 << "&h=" << first_d
390 << "&i=" << df
391 << "&k=" << first_l
392 << "&l=" << last_l
393 << "'>Get every document</a>" << endl;
394 }
395 }
396 }
397
398 // Close the document
399 if (XMLmode) {
400 cout << "</phinddata>" << endl;
401 } else {
402 cout << "</center></body></html>" << endl;
403 }
404
405 textdata.UnloadData ();
406 return 0;
407}
408
409
410// Print a list of expansions
411//
412// Given the textData and a list of phrase numbers, print out each of the
413// expansions.
414
415void print_expansions(char *cgi_script, char *collection, bool XMLmode, UCArray body,
416 TextData &textdata, vector <unsigned long> elist,
417 unsigned long first, unsigned long last) {
418
419 UCArray word;
420 unsigned long phrase, tf, df, ef;
421
422 UCArray suffix, prefix;
423
424 for (unsigned long e = first; e < last; e++) {
425
426 phrase = elist[e];
427 get_phrase_freq_data(textdata, phrase, word, tf, ef, df);
428
429 split_phrase(word, body, prefix, suffix);
430
431 if (XMLmode) {
432 // body is always the same as the text of the phrase, so no need to send it
433 cout << "<expansion num=\"" << e
434 << "\" id=\"" << phrase
435 << "\" tf=\"" << tf
436 << "\" df=\"" << df;
437 if (!prefix.empty()) {
438 cout << "\" prefix=\"" << prefix;
439 }
440 if (!suffix.empty()) {
441 cout << "\" suffix=\"" << suffix;
442 }
443 cout << "\"/>" << endl;
444 } else {
445 cout << "<tr valign=top><td align=right><a href='" << cgi_script
446 << "?c=" << collection << "&n=" << phrase << "'>" << prefix << "</a></td>"
447 << "<td align=center><a href='" << cgi_script
448 << "?c=" << collection << "&n=" << phrase << "'>" << body << "</a></td>"
449 << "<td align=left><a href='" << cgi_script
450 << "?c=" << collection << "&n=" << phrase << "'>" << suffix << "</a></td>"
451 << "<td>" << tf << "</td><td>" << df << "</td></tr>" << endl;
452 }
453 }
454}
455
456void print_thesaurus_links(char *cgi_script, char *collection,
457 bool XMLmode, UCArray body, TextData &textdata,
458 vector <unsigned long> &linkdest,
459 vector <UCArray> &linktype,
460 unsigned long first, unsigned long last) {
461
462 // information describing each link in the list
463 unsigned long phrase, tf, ef, df;
464 UCArray type, text, newbody, suffix, prefix;
465
466 for (unsigned long l = first; l < last; l++) {
467
468 // get the phrase data
469 phrase = linkdest[l];
470 type = linktype[l];
471 get_phrase_freq_data(textdata, phrase, text, tf, ef, df);
472 // split_phrase(text, newbody, prefix, suffix);
473
474 if (XMLmode) {
475 cout << "<thesaurus num=\"" << l
476 << "\" id=\"" << phrase
477 << "\" tf=\"" << tf
478 << "\" df=\"" << df
479 << "\" type=\"" << type
480 << "\" text=\"" << text
481 << "\"/>" << endl;
482 } else {
483 cout << "<tr valign=top><td>" << type << "</td><td>"
484 << "<a href='" << cgi_script << "?c=" << collection
485 << "&n=" << phrase << "'>" << text << "</a>"
486 << "</td><td>" << tf << "</td><td>" << df << "</td></tr>" << endl;
487 }
488 }
489}
490
491
492void print_documents(bool XMLmode, char *basepath, char *cgi_script, char *collection,
493 vector <unsigned long> docNums, vector <unsigned long> docFreq,
494 unsigned long first, unsigned long last) {
495
496 // Create a TextData object to read the document data
497 TextData docdata;
498 char filename[FILENAME_MAX] = "docs";
499 if (!docdata.LoadData (basepath, filename)) {
500 FatalError (1, "Couldn't load text information for \"%s\"", filename);
501 }
502
503 UCArray title, hash;
504 unsigned long freq, doc;
505
506 for (unsigned long d = first; d < last; d++) {
507 doc = docNums[d];
508 freq = docFreq[d];
509
510 get_document_all_data(docdata, doc, title, hash);
511
512 if (XMLmode) {
513 cout << "<document num=\"" << d
514 << "\" hash=\"" << hash
515 << "\" freq=\"" << freq
516 << "\" title=\"" << title << "\"/>" << endl;
517 } else {
518 cout << "<tr valign=top><td><a href='" << cgi_script << "?c=" << collection
519 << "&a=d&d=" << hash << "'>" << title << "</a>"
520 << "</td><td>" << freq << "</td></tr>"
521 << endl;
522 }
523 }
524}
525
526
527
528// Get the frequency data about a phrase
529//
530// The phrase is stored in textData as record phrase.
531// We retrieve:
532// word - the text of the phrase
533// tf - the total frequency of the phrase
534// ef - the expansion frequency of the phrase
535// df - the document frequency of the phrase
536
537void get_phrase_freq_data(TextData &textdata, unsigned long phrase,
538 UCArray &word, unsigned long &tf,
539 unsigned long &ef, unsigned long &df) {
540
541 UCArray text;
542 UCArray docLevel;
543 SetCStr(docLevel, "Document");
544
545 // Look the word up in the textData
546 if (!GetDocText (textdata, docLevel, phrase, text)) {
547 FatalError (1, "Error while trying to get phrase %u", phrase);
548 }
549
550 // Ignore everything up to the first colon
551 UCArray::iterator next = text.begin();
552 while (*next++ != ':');
553
554 // Get the word
555 word.clear();
556 for (; *next != ':'; next++) {
557 word.push_back(*next);
558 }
559
560 // Get total frequency
561 tf = 0;
562 for (next++; *next != ':'; next++) {
563 tf *= 10;
564 tf += (*next - '0');
565 }
566
567 // Get expansion frequency
568 ef = 0;
569 for (next++; *next != ':'; next++) {
570 ef *= 10;
571 ef += (*next - '0');
572 }
573
574 // Get document frequency
575 df = 0;
576 for (next++; *next != ':'; next++) {
577 df *= 10;
578 df += (*next - '0');
579 }
580}
581
582// Get all the data about a phrase
583//
584// The phrase is stored in textData as record phrase.
585// We retrieve:
586// word - the text of the phrase
587// tf - the total frequency of the phrase
588// ef - the expansion frequency of the phrase
589// lf - the thesaurus link frequency of the phrase
590// df - the document frequency of the phrase
591// el - the list of phrases that are expansions of phrase
592// ll - the list of phrases that are thesaurus links
593// dl - the list of documents that contain phrase
594
595void get_phrase_all_data(TextData &textdata, unsigned long phrase,
596 UCArray &word,
597 unsigned long &tf, unsigned long &ef,
598 unsigned long &lf, unsigned long &df,
599 vector <unsigned long> &el,
600 vector <unsigned long> &linkdest,
601 vector <UCArray> &linktype,
602 vector <unsigned long> &docnum,
603 vector <unsigned long> &docfrq) {
604 UCArray text;
605 UCArray docLevel;
606 SetCStr(docLevel, "Document");
607
608 // Look thwe word up in the textData
609 if (!GetDocText (textdata, docLevel, phrase, text)) {
610 FatalError (1, "Error while trying to get phrase %u", phrase);
611 }
612
613 // Ignore everything up to the first colon
614 UCArray::iterator next = text.begin();
615 while (*next++ != ':');
616
617 // ignore training cariage returns
618 while (text.back() == '\n') {
619 text.pop_back();
620 }
621
622 // Get the word
623 word.clear();
624 for (; *next != ':'; next++) {
625 word.push_back(*next);
626 }
627
628 // Get total frequency
629 tf = 0;
630 for (next++; *next != ':'; next++) {
631 tf *= 10;
632 tf += (*next - '0');
633 }
634
635 // Get expansion frequency
636 ef = 0;
637 for (next++; *next != ':'; next++) {
638 ef *= 10;
639 ef += (*next - '0');
640 }
641
642 // Get document frequency
643 df = 0;
644 for (next++; *next != ':'; next++) {
645 df *= 10;
646 df += (*next - '0');
647 }
648
649 // Get expansion list
650 el.clear();
651 unsigned long e = 0;
652 for (next++; *next != ':'; next++) {
653 if (*next == ',') {
654 el.push_back(e);
655 e = 0;
656 } else {
657 e *= 10;
658 e += (*next - '0');
659 }
660 }
661
662 // Get document list & the document frequency list
663 docnum.clear();
664 docfrq.clear();
665 bool readnum = false;
666 unsigned long d = 0;
667 for (next++; *next != ':'; next++) {
668 if (*next == ',') {
669 docnum.push_back(d);
670 readnum = true;
671 d = 0;
672 } else if (*next == ';') {
673 if (readnum) {
674 docfrq.push_back(d);
675 } else {
676 docnum.push_back(d);
677 docfrq.push_back(1);
678 }
679 readnum = false;
680 d = 0;
681 } else {
682 d *= 10;
683 d += (*next - '0');
684 }
685 }
686
687 // Get thesaurus link frequency & link list
688 text.push_back(':');
689 text.push_back(':');
690
691 // link frequency
692 lf = 0;
693 for (next++; *next != ':'; next++) {
694 lf *= 10;
695 lf += (*next - '0');
696 }
697
698 // two lists of link data
699 linkdest.clear();
700 linktype.clear();
701
702 UCArray thistype;
703 thistype.clear();
704 bool typedone = false;
705 unsigned long l = 0;
706 for (next++; *next != ':'; next++) {
707
708 if (!typedone) {
709 // first read the link type, a charactor string
710 if (*next == ',') {
711 typedone = true;
712 } else {
713 thistype.push_back(*next);
714 }
715 } else {
716 // having read the link type, read the list of link destinations
717 if (*next == ',') {
718 linkdest.push_back(l);
719 linktype.push_back(thistype);
720 l = 0;
721 } else if (*next == ';') {
722 linkdest.push_back(l);
723 linktype.push_back(thistype);
724 l = 0;
725 thistype.clear();
726 typedone = false;
727 } else {
728 l *= 10;
729 l += (*next - '0');
730 }
731 }
732 }
733}
734
735// Get all the data about a docment
736//
737// The document's detailes are stored in docData as record docNum.
738// We retrieve:
739// title - the document's title
740// hash - the documnt's unique OID
741
742void get_document_all_data(TextData &docdata, unsigned long docNum,
743 UCArray &title, UCArray &hash) {
744
745 UCArray text;
746 UCArray docLevel;
747 SetCStr(docLevel, "Document");
748
749 // Look the word up in the textData
750 if (!GetDocText (docdata, docLevel, docNum, text)) {
751 FatalError (1, "Error while trying to get document %u", docNum);
752 }
753
754 // Ignore everything up to the first colon
755 UCArray::iterator next = text.begin();
756 while (*next++ != '\t');
757
758 // Get the document OID (hash)
759 hash.clear();
760 for (; *next != '\t'; next++) {
761 hash.push_back(*next);
762 }
763
764 // Get the title
765 text.push_back('\n');
766 title.clear();
767 for (next++; *next != '\n'; next++) {
768 title.push_back(*next);
769 }
770}
771
772
// Read the gsdlhome setting from gsdlsite.cfg in the current directory.
//
// Each line that starts with "gsdlhome" supplies a value: the text after
// the keyword with leading spaces/tabs and trailing spaces/tabs/carriage
// returns removed.  If the key appears more than once the last non-empty
// value wins.  On success gsdlhome points at a new[]-allocated,
// NUL-terminated copy of the value; if no value is found gsdlhome is left
// untouched.  If the file cannot be opened a message is written to cerr
// and the program exits with status 1.

void get_gsdlsite_parameters(char *&gsdlhome) {

  // open the file
  ifstream gsdl("gsdlsite.cfg", ios::in);
  if (!gsdl) {
    cerr << "File gsdlsite.cfg could not be opened\n";
    exit(1);
  }

  char *found = NULL;

  // read each line of the file; the loop ends at end of file or on a line
  // too long for the buffer (getline then fails instead of spinning)
  char buffer[2000];
  while (gsdl.getline(buffer, sizeof(buffer), '\n')) {

    // only the gsdlhome variable is of interest
    if (strncmp(buffer, "gsdlhome", 8) != 0) {
      continue;
    }

    // find the start of the gsdlhome string
    int len = strlen(buffer);
    int i = 8;
    while (i < len && (buffer[i] == ' ' || buffer[i] == '\t')) {
      i++;
    }

    // drop trailing blanks and the '\r' of a DOS line ending
    while (len > i && (buffer[len-1] == ' ' || buffer[len-1] == '\t'
                       || buffer[len-1] == '\r')) {
      len--;
    }

    // a key with no value counts as not set
    if (len == i) {
      continue;
    }

    // store the gsdlhome string, replacing any earlier occurrence
    delete [] found;
    found = new char[len - i + 1];
    strncpy(found, &(buffer[i]), len - i);
    found[len - i] = '\0';
  }

  if (found != NULL) {
    gsdlhome = found;
  }
}
802
803void get_cgi_parameters(char *&collection, char *&classifier,
804 unsigned long &phrasenumber, UCArray &phrasetext,
805 unsigned long &first_e, unsigned long &last_e,
806 unsigned long &first_l, unsigned long &last_l,
807 unsigned long &first_d, unsigned long &last_d,
808 bool &XMLmode) {
809
810
811 // set the default parameters
812 phrasenumber = 0;
813 phrasetext.clear();
814 first_e = 0;
815 last_e = 10;
816 first_l = 0;
817 last_l = 10;
818 first_d = 0;
819 last_d = 10;
820
821 // get the query string
822 char *request_method_str = getenv("REQUEST_METHOD");
823 char *query_string = getenv("QUERY_STRING");
824 text_t query;
825
826 if (request_method_str != NULL
827 && (strcmp(request_method_str, "GET") == 0)
828 && query_string != NULL) {
829 // GET cgi args from querystring
830 query = query_string;
831
832 } else {
833 // debugging from command line
834 cout << "? " << endl;
835 char query_input[1024];
836 cin.get(query_input, 1024, '\n');
837 query = query_input;
838 }
839
840 // extract out the key=value pairs
841 text_t::iterator here = query.begin();
842 text_t::iterator end = query.end();
843 text_t key, value;
844
845 while (here != end) {
846 // get the next key and value pair
847 here = getdelimitstr (here, end, '=', key);
848 here = getdelimitstr (here, end, '&', value);
849
850 // store this key=value pair
851 if (!key.empty() && !value.empty()) {
852
853 // c: the collection name
854 if (key[0] == 'c') {
855 UCArray tmp;
856 toUCArray(value, tmp);
857 collection = GetCStr(tmp);
858 }
859
860 // d: the classifier number as string
861 if (key[0] == 'd') {
862 UCArray tmp;
863 toUCArray(value, tmp);
864 classifier = GetCStr(tmp);
865 }
866
867 // e: the first expansion number
868 else if (key[0] == 'e') {
869 first_e = toLongInt(value);
870 }
871
872 // f: the last expansion number
873 else if (key[0] == 'f') {
874 last_e = toLongInt(value);
875 }
876
877 // h: the first document number
878 else if (key[0] == 'h') {
879 first_d = toLongInt(value);
880 }
881
882 // i: the last document number
883 else if (key[0] == 'i') {
884 last_d = toLongInt(value);
885 }
886
887 // k: the first thesaurus list number
888 else if (key[0] == 'k') {
889 first_l = toLongInt(value);
890 }
891
892 // l: the last thesaurus list number
893 else if (key[0] == 'l') {
894 last_l = toLongInt(value);
895 }
896
897 // n: the phrase number
898 else if (key[0] == 'n') {
899 phrasenumber = toLongInt(value);
900 }
901
902 // p: the phrase text
903 else if (key[0] == 'p') {
904 decode_cgi_arg(value);
905 toUCArray(value, phrasetext);
906 }
907
908 // x: XML mode
909 else if (key[0] == 'x') {
910 XMLmode = true;
911 }
912
913 }
914 }
915
916 // if no classifier number is supplied, default to 1.
917 if (classifier == NULL) {
918 classifier = new (char)[2];
919 strcpy(classifier, "1");
920 }
921}
922
923
924// Convert %xx and + to their appropriate equivalents
925//
926// This function was copied from %GSDLHOME/src/recpt/cgiutils.cpp
927// because it was much easier to copy it than to link against it.
928
// Map a hexadecimal digit character to its numeric value (0-15).
// Any other character is returned unchanged.
static unsigned short hexdigit (unsigned short c) {
  unsigned short value = c;
  if ('0' <= c && c <= '9') {
    value = c - '0';
  } else if ('a' <= c && c <= 'f') {
    value = c - 'a' + 10;
  } else if ('A' <= c && c <= 'F') {
    value = c - 'A' + 10;
  }
  return value;
}
935
936void decode_cgi_arg (text_t &argstr) {
937 text_t::iterator in = argstr.begin();
938 text_t::iterator out = in;
939 text_t::iterator end = argstr.end();
940
941 while (in != end) {
942 if (*in == '+') *out = ' ';
943
944 else if (*in == '%') {
945 unsigned short c = '%';
946 in++;
947 if (in != end) {
948 c = hexdigit (*in);
949 in++;
950 }
951 if (in != end && c < 16) { // sanity check on the previous character
952 c = c*16 + hexdigit (*in);
953 }
954
955 *out = c;
956 } else *out = *in;
957
958 if (in != end) in++;
959 out++;
960 }
961
962 // remove the excess characters
963 argstr.erase (out, end);
964}
965
966
967// Find the phrase number of a word in the index file
968
969void find_phrase_number_from_word(char *basepath, UCArray &query, DocNumArray &result) {
970
971 // Open the index file for searching
972 IndexData indexData;
973 char indexfilename[FILENAME_MAX] = "pword";
974 if (!indexData.LoadData (basepath, indexfilename)) {
975 FatalError (1, "Couldn't load index information for \"%s\"", indexfilename);
976 }
977
978 // set up the query object
979 QueryInfo queryInfo;
980 SetCStr (queryInfo.docLevel, "Document");
981 queryInfo.maxDocs = 5;
982 queryInfo.sortByRank = true;
983 queryInfo.exactWeights = false;
984 queryInfo.needRankInfo = true;
985 queryInfo.needTermFreqs = true;
986
987 // mode 1 = casefolded, unstemmed search
988 QueryNode *queryTree = ParseQuery(query, 1, 1);
989
990 // cout << "-- query --" << endl;
991 // PrintNode (cout, queryTree);
992
993 // perform the query
994 ExtQueryResult queryResult;
995 MGQuery (indexData, queryInfo, queryTree, queryResult);
996 // cout << "-- word lookup result -- " << endl << queryResult << endl ;
997
998 result.clear();
999 result = queryResult.docs;
1000
1001 // delete the query
1002 if (queryTree != NULL) delete queryTree;
1003}
1004
1005
1006
1007
// cgi_error
//
// Write a complete error response (CGI header plus a small XML or HTML
// page containing message) to stdout, then end the program with
// exit(0).  This function never returns to its caller.

void cgi_error(bool XMLmode, char *message) {

  if (!XMLmode) {
    cout << "Content-type: text/html" << endl << endl;
    cout << "<html><head><title>phind error</title></head>" << endl;
    cout << "<body>" << endl;
    cout << "<p><h1>phind error</h1>"
         << "<p> An error occured processing your request: <p><b>"
         << message
         << "</b></body></html>" << endl;
    exit(0);
  }

  cout << "Content-type: text/plain" << endl << endl;
  cout << "<phinddata>" << endl;
  cout << "<phinderror>" << message << "</phinderror>" << endl;
  cout << "</phinddata>" << endl;
  exit(0);
}
1031
1032
1033// split an expansion into prefix and suffix
1034
1035void split_phrase(UCArray word, UCArray body, UCArray &prefix, UCArray &suffix) {
1036
1037 prefix.clear();
1038 suffix.clear();
1039
1040 bool readingPrefix = true;
1041 UCArray::iterator here = word.begin();
1042 UCArray::iterator end = word.end();
1043
1044 while (here != end) {
1045
1046 // if we've not read all the prefix, add the next char to the prefix
1047 if (readingPrefix) {
1048 if (phrase_match(body, here, end)) {
1049 readingPrefix = false;
1050 // trim whitespace from end of prefix & start of suffix
1051 if (!prefix.empty()) {
1052 prefix.pop_back();
1053 }
1054 if ((here != end) && (*here == ' ')) {
1055 here++;
1056 }
1057 } else {
1058 prefix.push_back(*here);
1059 here++;
1060 }
1061 }
1062 // if we've finished with the prefix, update the suffix
1063 else {
1064 suffix.push_back(*here);
1065 here++;
1066 }
1067 }
1068}
1069
1070// phrase_match
1071//
1072// compare two strings, one represented as an UCArray, the other as two
1073// UCArray iterators.
1074//
1075// Return true if the UCArray is the same as the phrase the iterators point
1076// to for the length of the UCArray.
1077
1078bool phrase_match(UCArray text, UCArray::iterator &here, UCArray::iterator end) {
1079
1080 UCArray::iterator one_here = text.begin();
1081 UCArray::iterator one_end = text.end();
1082 UCArray::iterator two_here = here;
1083
1084 // iterate over the length of the first string, comparing each element to
1085 // the corresponding element in the second string.
1086 while (one_here != one_end) {
1087
1088 if (two_here == end) {
1089 return false;
1090 } else if (*one_here != *two_here) {
1091 return false;
1092 }
1093 one_here++;
1094 two_here++;
1095 }
1096
1097 here = two_here;
1098 return true;
1099}
1100
1101
1102// Convert from text_t format
1103//
1104// Conversions from text_t to other types
1105
1106unsigned long toLongInt(text_t &value) {
1107
1108 unsigned long result = 0;
1109
1110 text_t::iterator here = value.begin();
1111 text_t::iterator end = value.end();
1112 while (here != end) {
1113 result *= 10;
1114 result += *here - '0';
1115 here++;
1116 }
1117
1118 return result;
1119}
1120
1121void toUCArray(text_t &in, UCArray &out) {
1122 out.clear();
1123 text_t::iterator here = in.begin();
1124 text_t::iterator end = in.end();
1125 while (here != end) {
1126 out.push_back((unsigned char) *here);
1127 here++;
1128 }
1129}
1130
Note: See TracBrowser for help on using the repository browser.