source: trunk/gsdl/src/phind/host/phindcgi.cpp@ 1828

Last change on this file since 1828 was 1828, checked in by paynter, 23 years ago

Handle the new phind.dm input format, which includes thesaurus data,
and output this thesaurus data when it is available.

  • Property svn:keywords set to Author Date Id Revision
File size: 27.5 KB
Line 
1/**********************************************************************
2 *
3 * phindcgi.cpp -- cgi program to serve phind phrase hierarchies
4 *
5 * Copyright 2000 Gordon W. Paynter
6 * Copyright 2000 The New Zealand Digital Library Project
7 *
8 *
9 * A component of the Greenstone digital library software
10 * from the New Zealand Digital Library Project at the
11 * University of Waikato, New Zealand.
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26 *
27 *********************************************************************/
28
29/*
30 * phindcgi.cpp
31 *
32 * The program itself reads a request for a phrase's data from the
33 * QUERY_STRING variable, looks up the phrase (if necessary) in the MGPP
34 * pword database, then looks up the phrase's characteristics in the MGPP
35 * pdata database, and reports output to STDOUT as crude HTML or XML.
36 *
37 */
38
39
40#include <iostream.h>
41#include <fstream.h>
42#include <stdlib.h>
43#include <stdio.h>
44#include <assert.h>
45
46#include <vector.h>
47#include <algo.h>
48
49// Include MGPP functionality.
50#include <TextGet.h>
51#include <MGQuery.h>
52#include <Terms.h>
53#include <messages.h>
54#include <GSDLQueryParser.h>
55
56// Include GSDL's text_t object, which makes parsing cgi arguments easier.
57#include <text_t.h>
58// Note that GSDL stores strings as text_t objects (vectors of 16-bit short int),
59// while MGPP stores strings as UCArray objects (vectors of 8-bit unsigned char).
60
61
62
63void get_gsdlsite_parameters(char *&gsdlhome);
64
65void get_cgi_parameters(char *&collection,
66 unsigned long &phrasenumber, UCArray &phrasetext,
67 unsigned long &first_e, unsigned long &last_e,
68 unsigned long &first_l, unsigned long &last_l,
69 unsigned long &first_d, unsigned long &last_d,
70 bool &XMLmode);
71
72void print_expansions(char *cgi_script, char *collection, bool XMLmode, UCArray body,
73 TextData &textdata, vector <unsigned long> elist,
74 unsigned long first, unsigned long last);
75
76void print_thesaurus_links(char *cgi_script, char *collection,
77 bool XMLmode, UCArray body, TextData &textdata,
78 vector <unsigned long> &linkdest,
79 vector <UCArray> &linktype,
80 unsigned long first, unsigned long last);
81
82void print_documents(bool XMLmode, char *basepath, char *cgi_script,
83 char *collection,
84 vector <unsigned long> docNums,
85 vector <unsigned long> docFreq,
86 unsigned long first, unsigned long last);
87
88void find_phrase_number_from_word(char *basepath, UCArray &query, DocNumArray &result);
89
90void get_phrase_freq_data(TextData &textdata, unsigned long phrase,
91 UCArray &word, unsigned long &tf,
92 unsigned long &ef, unsigned long &df);
93
94void get_phrase_all_data(TextData &textdata, unsigned long phrase,
95 UCArray &word,
96 unsigned long &tf, unsigned long &ef,
97 unsigned long &lf, unsigned long &df,
98 vector <unsigned long> &el,
99 vector <unsigned long> &linkdest,
100 vector <UCArray> &linktype,
101 vector <unsigned long> &docnum,
102 vector <unsigned long> &docfrq);
103
104void split_phrase(UCArray word, UCArray body, UCArray &prefix, UCArray &suffix);
105bool phrase_match(UCArray text, UCArray::iterator &here, UCArray::iterator end);
106
107void get_document_all_data(TextData &docdata, unsigned long docNum,
108 UCArray &title, UCArray &hash);
109
110void cgi_error(bool XMLmode, char *message);
111
112void toUCArray(text_t &in, UCArray &out);
113unsigned long toLongInt(text_t &value);
114
115
116
117int main (int argc, char * argv[]) {
118
119
120 // the phrase to expand
121 unsigned long phrase = 0;
122 UCArray word;
123
124 // the frequency and occurances of the phrase
125 unsigned long tf;
126 vector <unsigned long> el, linkdest, docNums, docfreq;
127 vector <UCArray> linktype;
128
129 // the number of occurances to display
130 unsigned long ef, first_e, last_e, count_e,
131 lf, first_l, last_l, count_l,
132 df, first_d, last_d, count_d;
133
134 // are we in XML mode (as opposed to HTML mode)
135 bool XMLmode = false;
136
137 // Read the gsdlsite.cfg file
138 char *gsdlhome = NULL;
139 get_gsdlsite_parameters(gsdlhome);
140
141 if (gsdlhome == NULL) {
142 cgi_error(XMLmode, "GSDLHOME not set in gsdlsite.cfg file.");
143 }
144
145 // Get command-line parameters
146 char *collection;
147 text_tmap param;
148 get_cgi_parameters(collection, phrase, word,
149 first_e, last_e, first_l, last_l, first_d, last_d, XMLmode);
150
151 if (collection == NULL) {
152 cgi_error(XMLmode, "No collection");
153 }
154
155 char basepath[FILENAME_MAX] = "";
156 strcat(basepath, gsdlhome);
157 strcat(basepath, "/collect/");
158 strcat(basepath, collection);
159 strcat(basepath, "/index/phind");
160
161 // If we don't know the phrase number, look itup
162 if (phrase == 0) {
163
164 if (word.empty()) {
165 cgi_error(XMLmode, "No phrase number or word.");
166 }
167
168 DocNumArray result;
169 find_phrase_number_from_word(basepath, word, result);
170
171 if (result.empty()) {
172 cgi_error(XMLmode, "The search term does not occur in the collection.");
173 exit(0);
174 } else {
175 phrase = result[0];
176 }
177 }
178
179 // Create a TextData object to read the phrase data (pdata)
180 TextData textdata;
181 char filename[FILENAME_MAX] = "pdata";
182 if (!textdata.LoadData (basepath, filename)) {
183 FatalError (1, "Couldn't load text information for \"%s\"", filename);
184 }
185 get_phrase_all_data(textdata, phrase, word, tf, ef, lf, df, el,
186 linkdest, linktype, docNums, docfreq);
187
188
189 // Output the header
190 if (XMLmode) {
191 cout << "Content-type: text/plain" << endl << endl
192 << "<phinddata id=\"" << phrase
193 << "\" text=\"" << word
194 << "\" tf=\"" << tf
195 << "\" ef=\"" << ef
196 << "\" df=\"" << df
197 << "\" lf=\"" << lf
198 << "\">" << endl;
199 } else {
200 cout << "Content-type: text/html" << endl << endl
201 << "<html><head><title>" << word << "</title></head>" << endl
202 << "<body><center>" << endl
203 << "<p><h1>" << word << "</h1>" << endl
204 << "<p><b>"<< word << "</b> occurs "
205 << tf << " times in " << df << " documents" << endl;
206 }
207
208
209 // Output the thesaurus links
210 if ((lf > 0) && (first_l < last_l)) {
211
212 // figure out the number of phrases to output
213 if (last_l > lf) {
214 last_l = lf;
215 }
216 count_l = last_l - first_l;
217
218 if (XMLmode) {
219 cout << "<thesauruslist length=\"" << lf
220 << "\" start=\"" << first_l
221 << "\" end=\"" << last_l << "\">" << endl;
222 print_thesaurus_links(argv[0], collection, XMLmode, word, textdata,
223 linkdest, linktype, first_l, last_l);
224 cout << "</thesauruslist>" << endl;
225 }
226
227 // output links as HTML
228 else {
229 if (count_l == lf) {
230 cout << "<p><b> " << count_l << " thesaurus links</b>" << endl;
231 } else {
232 cout << "<p><b>" << count_l << " of " << lf << " thesaurus links</b>" << endl;
233 }
234
235 cout << "<p><table border=1><tr><th>type</th><th>topic</th><th>freq</th><th>docs</th></tr>" << endl;
236 print_thesaurus_links(argv[0], collection, XMLmode, word, textdata,
237 linkdest, linktype, first_l, last_l);
238
239 cout << "</table>" << endl;
240
241 if (last_l < lf) {
242 if ((last_l + 10) < lf) {
243 cout << "<br><a href='" << argv[0]
244 << "?c=" << collection
245 << "&n=" << phrase
246 << "&e=" << first_e
247 << "&f=" << last_e
248 << "&h=" << first_d
249 << "&i=" << last_d
250 << "&k=" << first_l
251 << "&l=" << (last_l + 10)
252 << "'>Get more thesaurus links</a>"
253 << endl;
254 }
255 cout << "<br><a href='" << argv[0]
256 << "?c=" << collection
257 << "&n=" << phrase
258 << "&e=" << first_e
259 << "&f=" << last_e
260 << "&h=" << first_d
261 << "&i=" << last_d
262 << "&k=" << first_l
263 << "&l=" << lf
264 << "'>Get every thesaurus link</a>"
265 << endl;
266 }
267 }
268
269 }
270
271 // Output the expansions
272 if ((ef > 0) && (first_e < last_e)) {
273
274 // figure out the number of phrases to output
275 if (last_e > el.size()) {
276 last_e = el.size();
277 }
278 count_e = last_e - first_e;
279
280 // output expansions as XML
281 if (XMLmode) {
282 cout << "<expansionlist length=\"" << ef
283 << "\" start=\"" << first_e
284 << "\" end=\"" << last_e << "\">" << endl;
285
286 print_expansions(argv[0], collection, XMLmode, word, textdata, el, first_e, last_e);
287
288 cout << "</expansionlist>" << endl;
289 }
290
291 // output expansions as HTML
292 else {
293 if (count_e == el.size()) {
294 cout << "<p><b> " << count_e << " expansions</b>" << endl;
295 } else {
296 cout << "<p><b>" << count_e << " of " << ef << " expansions</b>" << endl;
297 }
298
299 cout << "<p><table border=1><tr><th colspan=3>phrase</th><th>freq</th><th>docs</th></tr>" << endl;
300 print_expansions(argv[0], collection, XMLmode, word, textdata, el, first_e, last_e);
301 cout << "</table>" << endl;
302
303 if (last_e < ef) {
304 if ((last_e + 10) < ef) {
305 cout << "<br><a href='" << argv[0]
306 << "?c=" << collection
307 << "&n=" << phrase
308 << "&e=" << first_e
309 << "&f=" << (last_e + 10)
310 << "&h=" << first_d
311 << "&i=" << last_d
312 << "&k=" << first_l
313 << "&l=" << last_l
314 << "'>Get more expansions</a>"
315 << endl;
316 }
317 cout << "<br><a href='" << argv[0]
318 << "?c=" << collection
319 << "&n=" << phrase
320 << "&e=" << first_e
321 << "&f=" << ef
322 << "&h=" << first_d
323 << "&i=" << last_d
324 << "&k=" << first_l
325 << "&l=" << last_l
326 << "'>Get every expansion</a>"
327 << endl;
328 }
329 }
330 }
331
332 // Output the document occurances
333 if ((df > 0) && (first_d < last_d)) {
334
335 // figure out the phrases to output
336 if (last_d > docNums.size()) {
337 last_d = docNums.size();
338 }
339 count_d = last_d - first_d;
340
341 // output document list as XML
342 if (XMLmode) {
343 cout << "<documentlist length=\"" << df
344 << "\" start=\"" << first_d
345 << "\" end=\"" << last_d << "\">" << endl;
346
347 print_documents(XMLmode, basepath, "library", collection,
348 docNums, docfreq, first_d, last_d);
349
350 cout << "</documentlist>" << endl;
351 }
352
353 // output document list as HTML
354 else {
355
356 if (count_d == docNums.size()) {
357 cout << "<p><b> " << count_d << " documents</b>" << endl;
358 } else {
359 cout << "<p><b>" << count_d << " of " << df << " documents</b>" << endl;
360 }
361
362 cout << "<p><table border=1><tr><th align=left>document</th><th>freq</th></tr>" << endl;
363 print_documents(XMLmode, basepath, "library", collection,
364 docNums, docfreq, first_d, last_d);
365 cout << "</table>" << endl;
366
367 if (last_d < df) {
368 if ((last_d + 10) < df) {
369 cout << "<br><a href='" << argv[0]
370 << "?c=" << collection
371 << "&n=" << phrase
372 << "&e=" << first_e
373 << "&f=" << last_e
374 << "&h=" << first_d
375 << "&i=" << (last_d + 10)
376 << "&k=" << first_l
377 << "&l=" << last_l
378 << "'>Get more documents</a>" << endl;
379 }
380 cout << "<br><a href='" << argv[0]
381 << "?c=" << collection
382 << "&n=" << phrase
383 << "&e=" << first_e
384 << "&f=" << last_e
385 << "&h=" << first_d
386 << "&i=" << df
387 << "&k=" << first_l
388 << "&l=" << last_l
389 << "'>Get every document</a>" << endl;
390 }
391 }
392 }
393
394 // Close the document
395 if (XMLmode) {
396 cout << "</phinddata>" << endl;
397 } else {
398 cout << "</center></body></html>" << endl;
399 }
400
401 textdata.UnloadData ();
402 return 0;
403}
404
405
406// Print a list of expansions
407//
408// Given the textData and a list of phrase numbers, print out each of the
409// expansions.
410
411void print_expansions(char *cgi_script, char *collection, bool XMLmode, UCArray body,
412 TextData &textdata, vector <unsigned long> elist,
413 unsigned long first, unsigned long last) {
414
415 UCArray word;
416 unsigned long phrase, tf, df, ef;
417
418 UCArray suffix, prefix;
419
420 for (unsigned long e = first; e < last; e++) {
421
422 phrase = elist[e];
423 get_phrase_freq_data(textdata, phrase, word, tf, ef, df);
424
425 split_phrase(word, body, prefix, suffix);
426
427 if (XMLmode) {
428 // body is always the same as the text of the phrase, so no need to send it
429 cout << "<expansion num=\"" << e
430 << "\" id=\"" << phrase
431 << "\" tf=\"" << tf
432 << "\" df=\"" << df;
433 if (!prefix.empty()) {
434 cout << "\" prefix=\"" << prefix;
435 }
436 if (!suffix.empty()) {
437 cout << "\" suffix=\"" << suffix;
438 }
439 cout << "\"/>" << endl;
440 } else {
441 cout << "<tr valign=top><td align=right><a href='" << cgi_script
442 << "?c=" << collection << "&n=" << phrase << "'>" << prefix << "</a></td>"
443 << "<td align=center><a href='" << cgi_script
444 << "?c=" << collection << "&n=" << phrase << "'>" << body << "</a></td>"
445 << "<td align=left><a href='" << cgi_script
446 << "?c=" << collection << "&n=" << phrase << "'>" << suffix << "</a></td>"
447 << "<td>" << tf << "</td><td>" << df << "</td></tr>" << endl;
448 }
449 }
450}
451
452void print_thesaurus_links(char *cgi_script, char *collection,
453 bool XMLmode, UCArray body, TextData &textdata,
454 vector <unsigned long> &linkdest,
455 vector <UCArray> &linktype,
456 unsigned long first, unsigned long last) {
457
458 // information describing each link in the list
459 unsigned long phrase, tf, ef, df;
460 UCArray type, text, newbody, suffix, prefix;
461
462 for (unsigned long l = first; l < last; l++) {
463
464 // get the phrase data
465 phrase = linkdest[l];
466 type = linktype[l];
467 get_phrase_freq_data(textdata, phrase, text, tf, ef, df);
468 // split_phrase(text, newbody, prefix, suffix);
469
470 if (XMLmode) {
471 cout << "<thesaurus num=\"" << l
472 << "\" id=\"" << phrase
473 << "\" tf=\"" << tf
474 << "\" df=\"" << df
475 << "\" type=\"" << type
476 << "\" text=\"" << text
477 << "\"/>" << endl;
478 } else {
479 cout << "<tr valign=top><td>" << type << "</td><td>"
480 << "<a href='" << cgi_script << "?c=" << collection
481 << "&n=" << phrase << "'>" << text << "</a>"
482 << "</td><td>" << tf << "</td><td>" << df << "</td></tr>" << endl;
483 }
484 }
485}
486
487
488void print_documents(bool XMLmode, char *basepath, char *cgi_script, char *collection,
489 vector <unsigned long> docNums, vector <unsigned long> docFreq,
490 unsigned long first, unsigned long last) {
491
492 // Create a TextData object to read the document data
493 TextData docdata;
494 char filename[FILENAME_MAX] = "docs";
495 if (!docdata.LoadData (basepath, filename)) {
496 FatalError (1, "Couldn't load text information for \"%s\"", filename);
497 }
498
499 UCArray title, hash;
500 unsigned long freq, doc;
501
502 for (unsigned long d = first; d < last; d++) {
503 doc = docNums[d];
504 freq = docFreq[d];
505
506 get_document_all_data(docdata, doc, title, hash);
507
508 if (XMLmode) {
509 cout << "<document num=\"" << d
510 << "\" hash=\"" << hash
511 << "\" freq=\"" << freq
512 << "\" title=\"" << title << "\"/>" << endl;
513 } else {
514 cout << "<tr valign=top><td><a href='" << cgi_script << "?c=" << collection
515 << "&a=d&d=" << hash << "'>" << title << "</a>"
516 << "</td><td>" << freq << "</td></tr>"
517 << endl;
518 }
519 }
520}
521
522
523
524// Get the frequency data about a phrase
525//
526// The phrase is stored in textData as record phrase.
527// We retrieve:
528// word - the text of the phrase
529// tf - the total frequency of the phrase
530// ef - the expansion frequency of the phrase
531// df - the document frequency of the phrase
532
533void get_phrase_freq_data(TextData &textdata, unsigned long phrase,
534 UCArray &word, unsigned long &tf,
535 unsigned long &ef, unsigned long &df) {
536
537 UCArray text;
538 UCArray docLevel;
539 SetCStr(docLevel, "Document");
540
541 // Look the word up in the textData
542 if (!GetDocText (textdata, docLevel, phrase, text)) {
543 FatalError (1, "Error while trying to get phrase %u", phrase);
544 }
545
546 // Ignore everything up to the first colon
547 UCArray::iterator next = text.begin();
548 while (*next++ != ':');
549
550 // Get the word
551 word.clear();
552 for (; *next != ':'; next++) {
553 word.push_back(*next);
554 }
555
556 // Get total frequency
557 tf = 0;
558 for (next++; *next != ':'; next++) {
559 tf *= 10;
560 tf += (*next - '0');
561 }
562
563 // Get expansion frequency
564 ef = 0;
565 for (next++; *next != ':'; next++) {
566 ef *= 10;
567 ef += (*next - '0');
568 }
569
570 // Get document frequency
571 df = 0;
572 for (next++; *next != ':'; next++) {
573 df *= 10;
574 df += (*next - '0');
575 }
576}
577
578// Get all the data about a phrase
579//
580// The phrase is stored in textData as record phrase.
581// We retrieve:
582// word - the text of the phrase
583// tf - the total frequency of the phrase
584// ef - the expansion frequency of the phrase
585// lf - the thesaurus link frequency of the phrase
586// df - the document frequency of the phrase
587// el - the list of phrases that are expansions of phrase
588// ll - the list of phrases that are thesaurus links
589// dl - the list of documents that contain phrase
590
591void get_phrase_all_data(TextData &textdata, unsigned long phrase,
592 UCArray &word,
593 unsigned long &tf, unsigned long &ef,
594 unsigned long &lf, unsigned long &df,
595 vector <unsigned long> &el,
596 vector <unsigned long> &linkdest,
597 vector <UCArray> &linktype,
598 vector <unsigned long> &docnum,
599 vector <unsigned long> &docfrq) {
600 UCArray text;
601 UCArray docLevel;
602 SetCStr(docLevel, "Document");
603
604 // Look thwe word up in the textData
605 if (!GetDocText (textdata, docLevel, phrase, text)) {
606 FatalError (1, "Error while trying to get phrase %u", phrase);
607 }
608
609 // Ignore everything up to the first colon
610 UCArray::iterator next = text.begin();
611 while (*next++ != ':');
612
613 // ignore training cariage returns
614 while (text.back() == '\n') {
615 text.pop_back();
616 }
617
618 // Get the word
619 word.clear();
620 for (; *next != ':'; next++) {
621 word.push_back(*next);
622 }
623
624 // Get total frequency
625 tf = 0;
626 for (next++; *next != ':'; next++) {
627 tf *= 10;
628 tf += (*next - '0');
629 }
630
631 // Get expansion frequency
632 ef = 0;
633 for (next++; *next != ':'; next++) {
634 ef *= 10;
635 ef += (*next - '0');
636 }
637
638 // Get document frequency
639 df = 0;
640 for (next++; *next != ':'; next++) {
641 df *= 10;
642 df += (*next - '0');
643 }
644
645 // Get expansion list
646 el.clear();
647 unsigned long e = 0;
648 for (next++; *next != ':'; next++) {
649 if (*next == ',') {
650 el.push_back(e);
651 e = 0;
652 } else {
653 e *= 10;
654 e += (*next - '0');
655 }
656 }
657
658 // Get document list & the document frequency list
659 docnum.clear();
660 docfrq.clear();
661 bool readnum = false;
662 unsigned long d = 0;
663 for (next++; *next != ':'; next++) {
664 if (*next == ',') {
665 docnum.push_back(d);
666 readnum = true;
667 d = 0;
668 } else if (*next == ';') {
669 if (readnum) {
670 docfrq.push_back(d);
671 } else {
672 docnum.push_back(d);
673 docfrq.push_back(1);
674 }
675 readnum = false;
676 d = 0;
677 } else {
678 d *= 10;
679 d += (*next - '0');
680 }
681 }
682
683 // Get thesaurus link frequency & link list
684 text.push_back(':');
685 text.push_back(':');
686
687 // link frequency
688 lf = 0;
689 for (next++; *next != ':'; next++) {
690 lf *= 10;
691 lf += (*next - '0');
692 }
693
694 // two lists of link data
695 linkdest.clear();
696 linktype.clear();
697
698 UCArray thistype;
699 thistype.clear();
700 bool typedone = false;
701 unsigned long l = 0;
702 for (next++; *next != ':'; next++) {
703
704 if (!typedone) {
705 // first read the link type, a charactor string
706 if (*next == ',') {
707 typedone = true;
708 } else {
709 thistype.push_back(*next);
710 }
711 } else {
712 // having read the link type, read the list of link destinations
713 if (*next == ',') {
714 linkdest.push_back(l);
715 linktype.push_back(thistype);
716 l = 0;
717 } else if (*next == ';') {
718 linkdest.push_back(l);
719 linktype.push_back(thistype);
720 l = 0;
721 thistype.clear();
722 typedone = false;
723 } else {
724 l *= 10;
725 l += (*next - '0');
726 }
727 }
728 }
729}
730
731// Get all the data about a docment
732//
733// The document's detailes are stored in docData as record docNum.
734// We retrieve:
735// title - the document's title
736// hash - the documnt's unique OID
737
738void get_document_all_data(TextData &docdata, unsigned long docNum,
739 UCArray &title, UCArray &hash) {
740
741 UCArray text;
742 UCArray docLevel;
743 SetCStr(docLevel, "Document");
744
745 // Look the word up in the textData
746 if (!GetDocText (docdata, docLevel, docNum, text)) {
747 FatalError (1, "Error while trying to get document %u", docNum);
748 }
749
750 // Ignore everything up to the first colon
751 UCArray::iterator next = text.begin();
752 while (*next++ != '\t');
753
754 // Get the document OID (hash)
755 hash.clear();
756 for (; *next != '\t'; next++) {
757 hash.push_back(*next);
758 }
759
760 // Get the title
761 text.push_back('\n');
762 title.clear();
763 for (next++; *next != '\n'; next++) {
764 title.push_back(*next);
765 }
766}
767
768
// Read the gsdlhome setting from gsdlsite.cfg
//
// Opens "gsdlsite.cfg" in the current directory; if it cannot be opened a
// message is written to cerr and the program exits with status 1.  Each
// line starting with "gsdlhome" supplies a value: the text after the
// keyword with surrounding blanks, tabs and a trailing carriage return
// removed.  The last non-empty value wins and is returned in gsdlhome as a
// NUL-terminated string allocated with new[].  If the file holds no such
// value, gsdlhome is left exactly as the caller passed it.
//
// Reading stops at the end of the file or at the first line that does not
// fit the 2000-byte line buffer.

void get_gsdlsite_parameters(char *&gsdlhome) {

  // open the file
  ifstream gsdl("gsdlsite.cfg", ios::in);
  if (!gsdl) {
    cerr << "File gsdlsite.cfg could not be opened\n";
    exit(1);
  }

  // the most recent value found; kept local so that the caller's pointer
  // is never freed or overwritten unless a replacement exists
  char *found = NULL;

  // read each line of the file; testing the stream itself (rather than
  // eof()) also ends the loop when getline fails on an over-long line
  char buffer[2000];
  while (gsdl.getline(buffer, sizeof(buffer), '\n')) {

    // only the gsdlhome variable is of interest
    if (strncmp(buffer, "gsdlhome", 8) != 0) {
      continue;
    }

    // find the start of the gsdlhome string
    int len = strlen(buffer);
    int i = 8;
    while (i < len && (buffer[i] == ' ' || buffer[i] == '\t')) {
      i++;
    }

    // drop trailing blanks and the '\r' of a DOS line ending
    while (len > i && (buffer[len-1] == ' ' || buffer[len-1] == '\t'
                       || buffer[len-1] == '\r')) {
      len--;
    }

    // a keyword without a value sets nothing
    if (len == i) {
      continue;
    }

    // store the gsdlhome string, leaving room for the terminator
    delete [] found;
    found = new char[len - i + 1];
    strncpy(found, &(buffer[i]), len - i);
    found[len - i] = '\0';
  }

  if (found != NULL) {
    gsdlhome = found;
  }
}
798
799void get_cgi_parameters(char *&collection,
800 unsigned long &phrasenumber, UCArray &phrasetext,
801 unsigned long &first_e, unsigned long &last_e,
802 unsigned long &first_l, unsigned long &last_l,
803 unsigned long &first_d, unsigned long &last_d,
804 bool &XMLmode) {
805
806
807 // set the default parameters
808 phrasenumber = 0;
809 phrasetext.clear();
810 first_e = 0;
811 last_e = 10;
812 first_l = 0;
813 last_l = 10;
814 first_d = 0;
815 last_d = 10;
816
817 // get the query string
818 char *request_method_str = getenv("REQUEST_METHOD");
819 char *query_string = getenv("QUERY_STRING");
820 text_t query;
821
822 if (request_method_str != NULL
823 && (strcmp(request_method_str, "GET") == 0)
824 && query_string != NULL) {
825 // GET cgi args from querystring
826 query = query_string;
827
828 } else {
829 // debugging from command line
830 cout << "? " << endl;
831 char query_input[1024];
832 cin.get(query_input, 1024, '\n');
833 query = query_input;
834 }
835
836 // extract out the key=value pairs
837 text_t::iterator here = query.begin();
838 text_t::iterator end = query.end();
839 text_t key, value;
840
841 while (here != end) {
842 // get the next key and value pair
843 here = getdelimitstr (here, end, '=', key);
844 here = getdelimitstr (here, end, '&', value);
845
846 // store this key=value pair
847 if (!key.empty() && !value.empty()) {
848
849 // c: the collection name
850 if (key[0] == 'c') {
851 UCArray tmp;
852 toUCArray(value, tmp);
853 collection = GetCStr(tmp);
854 }
855
856 // e: the first expansion number
857 else if (key[0] == 'e') {
858 first_e = toLongInt(value);
859 }
860
861 // f: the last expansion number
862 else if (key[0] == 'f') {
863 last_e = toLongInt(value);
864 }
865
866 // h: the first document number
867 else if (key[0] == 'h') {
868 first_d = toLongInt(value);
869 }
870
871 // i: the last document number
872 else if (key[0] == 'i') {
873 last_d = toLongInt(value);
874 }
875
876 // k: the first thesaurus list number
877 else if (key[0] == 'k') {
878 first_l = toLongInt(value);
879 }
880
881 // l: the last thesaurus list number
882 else if (key[0] == 'l') {
883 last_l = toLongInt(value);
884 }
885
886 // n: the phrase number
887 else if (key[0] == 'n') {
888 phrasenumber = toLongInt(value);
889 }
890
891 // p: the phrase text
892 else if (key[0] == 'p') {
893 toUCArray(value, phrasetext);
894 }
895
896 // x: XML mode
897 else if (key[0] == 'x') {
898 XMLmode = true;
899 }
900
901 }
902 }
903}
904
905
906// Find the phrase number of a word in the index file
907
908void find_phrase_number_from_word(char *basepath, UCArray &query, DocNumArray &result) {
909
910 // Open the index file for searching
911 IndexData indexData;
912 char indexfilename[FILENAME_MAX] = "pword";
913 if (!indexData.LoadData (basepath, indexfilename)) {
914 FatalError (1, "Couldn't load index information for \"%s\"", indexfilename);
915 }
916
917 // set up the query object
918 QueryInfo queryInfo;
919 SetCStr (queryInfo.docLevel, "Document");
920 queryInfo.maxDocs = 5;
921 queryInfo.sortByRank = true;
922 queryInfo.exactWeights = false;
923 queryInfo.needRankInfo = true;
924 queryInfo.needTermFreqs = true;
925
926 // mode 1 = casefolded, unstemmed search
927 QueryNode *queryTree = ParseQuery(query, 1, 1);
928
929 // cout << "-- query --" << endl;
930 // PrintNode (cout, queryTree);
931
932 // perform the query
933 ExtQueryResult queryResult;
934 MGQuery (indexData, queryInfo, queryTree, queryResult);
935 // cout << "-- word lookup result -- " << endl << queryResult << endl ;
936
937 result.clear();
938 result = queryResult.docs;
939
940 // delete the query
941 if (queryTree != NULL) delete queryTree;
942}
943
944
945
946
// cgi_error
//
// Report a problem to the client and stop.  A complete response (headers
// included) is written to standard output, as a <phinderror> element in
// XML mode or as a small HTML page otherwise, and the program then ends
// with exit(0).  This function does not return.

void cgi_error(bool XMLmode, char *message) {

  if (!XMLmode) {
    cout << "Content-type: text/html" << endl << endl;
    cout << "<html><head><title>phind error</title></head>" << endl;
    cout << "<body>" << endl;
    cout << "<p><h1>phind error</h1>"
         << "<p> An error occured processing your request: <p><b>"
         << message
         << "</b></body></html>" << endl;
  } else {
    cout << "Content-type: text/plain" << endl << endl;
    cout << "<phinddata>" << endl;
    cout << "<phinderror>" << message << "</phinderror>" << endl;
    cout << "</phinddata>" << endl;
  }

  exit(0);
}
970
971
972// split an expansion into prefix and suffix
973
974void split_phrase(UCArray word, UCArray body, UCArray &prefix, UCArray &suffix) {
975
976 prefix.clear();
977 suffix.clear();
978
979 bool readingPrefix = true;
980 UCArray::iterator here = word.begin();
981 UCArray::iterator end = word.end();
982
983 while (here != end) {
984
985 // if we've not read all the prefix, add the next char to the prefix
986 if (readingPrefix) {
987 if (phrase_match(body, here, end)) {
988 readingPrefix = false;
989 // trim whitespace from end of prefix & start of suffix
990 if (!prefix.empty()) {
991 prefix.pop_back();
992 }
993 if ((here != end) && (*here == ' ')) {
994 here++;
995 }
996 } else {
997 prefix.push_back(*here);
998 here++;
999 }
1000 }
1001 // if we've finished with the prefix, update the suffix
1002 else {
1003 suffix.push_back(*here);
1004 here++;
1005 }
1006 }
1007}
1008
1009// phrase_match
1010//
1011// compare two strings, one represented as an UCArray, the other as two
1012// UCArray iterators.
1013//
1014// Return true if the UCArray is the same as the phrase the iterators point
1015// to for the length of the UCArray.
1016
1017bool phrase_match(UCArray text, UCArray::iterator &here, UCArray::iterator end) {
1018
1019 UCArray::iterator one_here = text.begin();
1020 UCArray::iterator one_end = text.end();
1021 UCArray::iterator two_here = here;
1022
1023 // iterate over the length of the first string, comparing each element to
1024 // the corresponding element in the second string.
1025 while (one_here != one_end) {
1026
1027 if (two_here == end) {
1028 return false;
1029 } else if (*one_here != *two_here) {
1030 return false;
1031 }
1032 one_here++;
1033 two_here++;
1034 }
1035
1036 here = two_here;
1037 return true;
1038}
1039
1040
1041// Convert from text_t format
1042//
1043// Conversions from text_t to other types
1044
1045unsigned long toLongInt(text_t &value) {
1046
1047 unsigned long result = 0;
1048
1049 text_t::iterator here = value.begin();
1050 text_t::iterator end = value.end();
1051 while (here != end) {
1052 result *= 10;
1053 result += *here - '0';
1054 here++;
1055 }
1056
1057 return result;
1058}
1059
1060void toUCArray(text_t &in, UCArray &out) {
1061 out.clear();
1062 text_t::iterator here = in.begin();
1063 text_t::iterator end = in.end();
1064 while (here != end) {
1065 out.push_back((unsigned char) *here);
1066 here++;
1067 }
1068}
1069
Note: See TracBrowser for help on using the repository browser.