Context Navigation

source: trunk/gsdl/src/phind/host/phindcgi.cpp@ 2062

Last change on this file since 2062 was 2062, checked in by paynter, 23 years ago
Added a function from the receptionist that decodes phrase CGI arguments from URL-safe format (e.g. %43%41%54 means CAT) so we can do searches on UTF8 text.
Property svn:keywords set to `Author Date Id Revision`
File size: 29.0 KB

Line
1	/**********************************************************************
2	*
3	* phindcgi.cpp -- cgi program to serve phind phrase hierarchies
4	*
5	* Copyright 2000 Gordon W. Paynter
6	* Copyright 2000 The New Zealand Digital Library Project
7	*
8	*
9	* A component of the Greenstone digital library software
10	* from the New Zealand Digital Library Project at the
11	* University of Waikato, New Zealand.
12	*
13	* This program is free software; you can redistribute it and/or modify
14	* it under the terms of the GNU General Public License as published by
15	* the Free Software Foundation; either version 2 of the License, or
16	* (at your option) any later version.
17	*
18	* This program is distributed in the hope that it will be useful,
19	* but WITHOUT ANY WARRANTY; without even the implied warranty of
20	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	* GNU General Public License for more details.
22	*
23	* You should have received a copy of the GNU General Public License
24	* along with this program; if not, write to the Free Software
25	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	*
27	*********************************************************************/
28
29	/*
30	* phindcgi.cpp
31	*
32	* The program reads a request for a phrase's data from the
33	* QUERY_STRING variable, looks up the phrase (if necessary) in the MGPP
34	* pword database, then looks up the phrase's characteristics in the MGPP
35	* pdata database, and writes the result to STDOUT as crude HTML or XML.
36	*
37	*/
38
39
40	#include <iostream.h>
41	#include <fstream.h>
42	#include <stdlib.h>
43	#include <stdio.h>
44	#include <assert.h>
45
46	#include <vector.h>
47	#include <algo.h>
48
49	// Include MGPP functionality.
50	#include <TextGet.h>
51	#include <MGQuery.h>
52	#include <Terms.h>
53	#include <messages.h>
54	#include <GSDLQueryParser.h>
55
56	// Include GSDL's text_t object, which makes parsing cgi arguments easier.
57	#include <text_t.h>
58	// Note that GSDL stores strings as text_t objects (vectors of 16-bit short int),
59	// while MGPP stores strings as UCArray objects (vectors of 8-bit unsigned char).
60
61
62
63	void get_gsdlsite_parameters(char *&gsdlhome);
64
65	void get_cgi_parameters(char &collection, char &classifier,
66	unsigned long &phrasenumber, UCArray &phrasetext,
67	unsigned long &first_e, unsigned long &last_e,
68	unsigned long &first_l, unsigned long &last_l,
69	unsigned long &first_d, unsigned long &last_d,
70	bool &XMLmode);
71
72	void decode_cgi_arg (text_t &argstr);
73
74	void print_expansions(char cgi_script, char collection, bool XMLmode, UCArray body,
75	TextData &textdata, vector <unsigned long> elist,
76	unsigned long first, unsigned long last);
77
78	void print_thesaurus_links(char cgi_script, char collection,
79	bool XMLmode, UCArray body, TextData &textdata,
80	vector <unsigned long> &linkdest,
81	vector <UCArray> &linktype,
82	unsigned long first, unsigned long last);
83
84	void print_documents(bool XMLmode, char basepath, char cgi_script,
85	char *collection,
86	vector <unsigned long> docNums,
87	vector <unsigned long> docFreq,
88	unsigned long first, unsigned long last);
89
90	void find_phrase_number_from_word(char *basepath, UCArray &query, DocNumArray &result);
91
92	void get_phrase_freq_data(TextData &textdata, unsigned long phrase,
93	UCArray &word, unsigned long &tf,
94	unsigned long &ef, unsigned long &df);
95
96	void get_phrase_all_data(TextData &textdata, unsigned long phrase,
97	UCArray &word,
98	unsigned long &tf, unsigned long &ef,
99	unsigned long &lf, unsigned long &df,
100	vector <unsigned long> &el,
101	vector <unsigned long> &linkdest,
102	vector <UCArray> &linktype,
103	vector <unsigned long> &docnum,
104	vector <unsigned long> &docfrq);
105
106	void split_phrase(UCArray word, UCArray body, UCArray &prefix, UCArray &suffix);
107	bool phrase_match(UCArray text, UCArray::iterator &here, UCArray::iterator end);
108
109	void get_document_all_data(TextData &docdata, unsigned long docNum,
110	UCArray &title, UCArray &hash);
111
112	void cgi_error(bool XMLmode, char *message);
113
114	void toUCArray(text_t &in, UCArray &out);
115	unsigned long toLongInt(text_t &value);
116
117
118
119	int main (int argc, char * argv[]) {
120
121
122	// the phrase to expand
123	unsigned long phrase = 0;
124	UCArray word;
125
126	// the frequency and occurances of the phrase
127	unsigned long tf;
128	vector <unsigned long> el, linkdest, docNums, docfreq;
129	vector <UCArray> linktype;
130
131	// the number of occurances to display
132	unsigned long ef, first_e, last_e, count_e,
133	lf, first_l, last_l, count_l,
134	df, first_d, last_d, count_d;
135
136	// are we in XML mode (as opposed to HTML mode)
137	bool XMLmode = false;
138
139	// Read the gsdlsite.cfg file
140	char *gsdlhome = NULL;
141	get_gsdlsite_parameters(gsdlhome);
142
143	if (gsdlhome == NULL) {
144	cgi_error(XMLmode, "GSDLHOME not set in gsdlsite.cfg file.");
145	}
146
147	// Get command-line parameters
148	char *collection = NULL;
149	char *classifier = NULL;
150	text_tmap param;
151	get_cgi_parameters(collection, classifier, phrase, word,
152	first_e, last_e, first_l, last_l, first_d, last_d, XMLmode);
153
154	if (collection == NULL) {
155	cgi_error(XMLmode, "No collection");
156	}
157
158	char basepath[FILENAME_MAX] = "";
159	strcat(basepath, gsdlhome);
160	strcat(basepath, "/collect/");
161	strcat(basepath, collection);
162	strcat(basepath, "/index/phind");
163	strcat(basepath, classifier);
164
165	// If we don't know the phrase number, look itup
166	if (phrase == 0) {
167
168	if (word.empty()) {
169	cgi_error(XMLmode, "No phrase number or word.");
170	}
171
172	DocNumArray result;
173	find_phrase_number_from_word(basepath, word, result);
174
175	if (result.empty()) {
176	cgi_error(XMLmode, "The search term does not occur in the collection.");
177	exit(0);
178	} else {
179	phrase = result[0];
180	}
181	}
182
183	// Create a TextData object to read the phrase data (pdata)
184	TextData textdata;
185	char filename[FILENAME_MAX] = "pdata";
186	if (!textdata.LoadData (basepath, filename)) {
187	FatalError (1, "Couldn't load text information for \"%s\"", filename);
188	}
189	get_phrase_all_data(textdata, phrase, word, tf, ef, lf, df, el,
190	linkdest, linktype, docNums, docfreq);
191
192
193	// Output the header
194	if (XMLmode) {
195	cout << "Content-type: text/plain" << endl << endl
196	<< "<phinddata id=\"" << phrase
197	<< "\" text=\"" << word
198	<< "\" tf=\"" << tf
199	<< "\" ef=\"" << ef
200	<< "\" df=\"" << df
201	<< "\" lf=\"" << lf
202	<< "\">" << endl;
203	} else {
204	cout << "Content-type: text/html" << endl << endl
205	<< "<html><head><title>" << word << "</title></head>" << endl
206	<< "<body><center>" << endl
207	<< "<p><h1>" << word << "</h1>" << endl
208	<< "<p><b>"<< word << "</b> occurs "
209	<< tf << " times in " << df << " documents" << endl;
210	}
211
212
213	// Output the thesaurus links
214	if ((lf > 0) && (first_l < last_l)) {
215
216	// figure out the number of phrases to output
217	if (last_l > lf) {
218	last_l = lf;
219	}
220	count_l = last_l - first_l;
221
222	if (XMLmode) {
223	cout << "<thesauruslist length=\"" << lf
224	<< "\" start=\"" << first_l
225	<< "\" end=\"" << last_l << "\">" << endl;
226	print_thesaurus_links(argv[0], collection, XMLmode, word, textdata,
227	linkdest, linktype, first_l, last_l);
228	cout << "</thesauruslist>" << endl;
229	}
230
231	// output links as HTML
232	else {
233	if (count_l == lf) {
234	cout << "<p><b> " << count_l << " thesaurus links</b>" << endl;
235	} else {
236	cout << "<p><b>" << count_l << " of " << lf << " thesaurus links</b>" << endl;
237	}
238
239	cout << "<p><table border=1><tr><th>type</th><th>topic</th><th>freq</th><th>docs</th></tr>" << endl;
240	print_thesaurus_links(argv[0], collection, XMLmode, word, textdata,
241	linkdest, linktype, first_l, last_l);
242
243	cout << "</table>" << endl;
244
245	if (last_l < lf) {
246	if ((last_l + 10) < lf) {
247	cout << "<br><a href='" << argv[0]
248	<< "?c=" << collection
249	<< "&n=" << phrase
250	<< "&e=" << first_e
251	<< "&f=" << last_e
252	<< "&h=" << first_d
253	<< "&i=" << last_d
254	<< "&k=" << first_l
255	<< "&l=" << (last_l + 10)
256	<< "'>Get more thesaurus links</a>"
257	<< endl;
258	}
259	cout << "<br><a href='" << argv[0]
260	<< "?c=" << collection
261	<< "&n=" << phrase
262	<< "&e=" << first_e
263	<< "&f=" << last_e
264	<< "&h=" << first_d
265	<< "&i=" << last_d
266	<< "&k=" << first_l
267	<< "&l=" << lf
268	<< "'>Get every thesaurus link</a>"
269	<< endl;
270	}
271	}
272
273	}
274
275	// Output the expansions
276	if ((ef > 0) && (first_e < last_e)) {
277
278	// figure out the number of phrases to output
279	if (last_e > el.size()) {
280	last_e = el.size();
281	}
282	count_e = last_e - first_e;
283
284	// output expansions as XML
285	if (XMLmode) {
286	cout << "<expansionlist length=\"" << ef
287	<< "\" start=\"" << first_e
288	<< "\" end=\"" << last_e << "\">" << endl;
289
290	print_expansions(argv[0], collection, XMLmode, word, textdata, el, first_e, last_e);
291
292	cout << "</expansionlist>" << endl;
293	}
294
295	// output expansions as HTML
296	else {
297	if (count_e == el.size()) {
298	cout << "<p><b> " << count_e << " expansions</b>" << endl;
299	} else {
300	cout << "<p><b>" << count_e << " of " << ef << " expansions</b>" << endl;
301	}
302
303	cout << "<p><table border=1><tr><th colspan=3>phrase</th><th>freq</th><th>docs</th></tr>" << endl;
304	print_expansions(argv[0], collection, XMLmode, word, textdata, el, first_e, last_e);
305	cout << "</table>" << endl;
306
307	if (last_e < ef) {
308	if ((last_e + 10) < ef) {
309	cout << "<br><a href='" << argv[0]
310	<< "?c=" << collection
311	<< "&n=" << phrase
312	<< "&e=" << first_e
313	<< "&f=" << (last_e + 10)
314	<< "&h=" << first_d
315	<< "&i=" << last_d
316	<< "&k=" << first_l
317	<< "&l=" << last_l
318	<< "'>Get more expansions</a>"
319	<< endl;
320	}
321	cout << "<br><a href='" << argv[0]
322	<< "?c=" << collection
323	<< "&n=" << phrase
324	<< "&e=" << first_e
325	<< "&f=" << ef
326	<< "&h=" << first_d
327	<< "&i=" << last_d
328	<< "&k=" << first_l
329	<< "&l=" << last_l
330	<< "'>Get every expansion</a>"
331	<< endl;
332	}
333	}
334	}
335
336	// Output the document occurances
337	if ((df > 0) && (first_d < last_d)) {
338
339	// figure out the phrases to output
340	if (last_d > docNums.size()) {
341	last_d = docNums.size();
342	}
343	count_d = last_d - first_d;
344
345	// output document list as XML
346	if (XMLmode) {
347	cout << "<documentlist length=\"" << df
348	<< "\" start=\"" << first_d
349	<< "\" end=\"" << last_d << "\">" << endl;
350
351	print_documents(XMLmode, basepath, "library", collection,
352	docNums, docfreq, first_d, last_d);
353
354	cout << "</documentlist>" << endl;
355	}
356
357	// output document list as HTML
358	else {
359
360	if (count_d == docNums.size()) {
361	cout << "<p><b> " << count_d << " documents</b>" << endl;
362	} else {
363	cout << "<p><b>" << count_d << " of " << df << " documents</b>" << endl;
364	}
365
366	cout << "<p><table border=1><tr><th align=left>document</th><th>freq</th></tr>" << endl;
367	print_documents(XMLmode, basepath, "library", collection,
368	docNums, docfreq, first_d, last_d);
369	cout << "</table>" << endl;
370
371	if (last_d < df) {
372	if ((last_d + 10) < df) {
373	cout << "<br><a href='" << argv[0]
374	<< "?c=" << collection
375	<< "&n=" << phrase
376	<< "&e=" << first_e
377	<< "&f=" << last_e
378	<< "&h=" << first_d
379	<< "&i=" << (last_d + 10)
380	<< "&k=" << first_l
381	<< "&l=" << last_l
382	<< "'>Get more documents</a>" << endl;
383	}
384	cout << "<br><a href='" << argv[0]
385	<< "?c=" << collection
386	<< "&n=" << phrase
387	<< "&e=" << first_e
388	<< "&f=" << last_e
389	<< "&h=" << first_d
390	<< "&i=" << df
391	<< "&k=" << first_l
392	<< "&l=" << last_l
393	<< "'>Get every document</a>" << endl;
394	}
395	}
396	}
397
398	// Close the document
399	if (XMLmode) {
400	cout << "</phinddata>" << endl;
401	} else {
402	cout << "</center></body></html>" << endl;
403	}
404
405	textdata.UnloadData ();
406	return 0;
407	}
408
409
410	// Print a list of expansions
411	//
412	// Given the textData and a list of phrase numbers, print out each of the
413	// expansions.
414
415	void print_expansions(char cgi_script, char collection, bool XMLmode, UCArray body,
416	TextData &textdata, vector <unsigned long> elist,
417	unsigned long first, unsigned long last) {
418
419	UCArray word;
420	unsigned long phrase, tf, df, ef;
421
422	UCArray suffix, prefix;
423
424	for (unsigned long e = first; e < last; e++) {
425
426	phrase = elist[e];
427	get_phrase_freq_data(textdata, phrase, word, tf, ef, df);
428
429	split_phrase(word, body, prefix, suffix);
430
431	if (XMLmode) {
432	// body is always the same as the text of the phrase, so no need to send it
433	cout << "<expansion num=\"" << e
434	<< "\" id=\"" << phrase
435	<< "\" tf=\"" << tf
436	<< "\" df=\"" << df;
437	if (!prefix.empty()) {
438	cout << "\" prefix=\"" << prefix;
439	}
440	if (!suffix.empty()) {
441	cout << "\" suffix=\"" << suffix;
442	}
443	cout << "\"/>" << endl;
444	} else {
445	cout << "<tr valign=top><td align=right><a href='" << cgi_script
446	<< "?c=" << collection << "&n=" << phrase << "'>" << prefix << "</a></td>"
447	<< "<td align=center><a href='" << cgi_script
448	<< "?c=" << collection << "&n=" << phrase << "'>" << body << "</a></td>"
449	<< "<td align=left><a href='" << cgi_script
450	<< "?c=" << collection << "&n=" << phrase << "'>" << suffix << "</a></td>"
451	<< "<td>" << tf << "</td><td>" << df << "</td></tr>" << endl;
452	}
453	}
454	}
455
456	void print_thesaurus_links(char cgi_script, char collection,
457	bool XMLmode, UCArray body, TextData &textdata,
458	vector <unsigned long> &linkdest,
459	vector <UCArray> &linktype,
460	unsigned long first, unsigned long last) {
461
462	// information describing each link in the list
463	unsigned long phrase, tf, ef, df;
464	UCArray type, text, newbody, suffix, prefix;
465
466	for (unsigned long l = first; l < last; l++) {
467
468	// get the phrase data
469	phrase = linkdest[l];
470	type = linktype[l];
471	get_phrase_freq_data(textdata, phrase, text, tf, ef, df);
472	// split_phrase(text, newbody, prefix, suffix);
473
474	if (XMLmode) {
475	cout << "<thesaurus num=\"" << l
476	<< "\" id=\"" << phrase
477	<< "\" tf=\"" << tf
478	<< "\" df=\"" << df
479	<< "\" type=\"" << type
480	<< "\" text=\"" << text
481	<< "\"/>" << endl;
482	} else {
483	cout << "<tr valign=top><td>" << type << "</td><td>"
484	<< "<a href='" << cgi_script << "?c=" << collection
485	<< "&n=" << phrase << "'>" << text << "</a>"
486	<< "</td><td>" << tf << "</td><td>" << df << "</td></tr>" << endl;
487	}
488	}
489	}
490
491
492	void print_documents(bool XMLmode, char basepath, char cgi_script, char *collection,
493	vector <unsigned long> docNums, vector <unsigned long> docFreq,
494	unsigned long first, unsigned long last) {
495
496	// Create a TextData object to read the document data
497	TextData docdata;
498	char filename[FILENAME_MAX] = "docs";
499	if (!docdata.LoadData (basepath, filename)) {
500	FatalError (1, "Couldn't load text information for \"%s\"", filename);
501	}
502
503	UCArray title, hash;
504	unsigned long freq, doc;
505
506	for (unsigned long d = first; d < last; d++) {
507	doc = docNums[d];
508	freq = docFreq[d];
509
510	get_document_all_data(docdata, doc, title, hash);
511
512	if (XMLmode) {
513	cout << "<document num=\"" << d
514	<< "\" hash=\"" << hash
515	<< "\" freq=\"" << freq
516	<< "\" title=\"" << title << "\"/>" << endl;
517	} else {
518	cout << "<tr valign=top><td><a href='" << cgi_script << "?c=" << collection
519	<< "&a=d&d=" << hash << "'>" << title << "</a>"
520	<< "</td><td>" << freq << "</td></tr>"
521	<< endl;
522	}
523	}
524	}
525
526
527
528	// Get the frequency data about a phrase
529	//
530	// The phrase is stored in textData as record phrase.
531	// We retrieve:
532	// word - the text of the phrase
533	// tf - the total frequency of the phrase
534	// ef - the expansion frequency of the phrase
535	// df - the document frequency of the phrase
536
537	void get_phrase_freq_data(TextData &textdata, unsigned long phrase,
538	UCArray &word, unsigned long &tf,
539	unsigned long &ef, unsigned long &df) {
540
541	UCArray text;
542	UCArray docLevel;
543	SetCStr(docLevel, "Document");
544
545	// Look the word up in the textData
546	if (!GetDocText (textdata, docLevel, phrase, text)) {
547	FatalError (1, "Error while trying to get phrase %u", phrase);
548	}
549
550	// Ignore everything up to the first colon
551	UCArray::iterator next = text.begin();
552	while (*next++ != ':');
553
554	// Get the word
555	word.clear();
556	for (; *next != ':'; next++) {
557	word.push_back(*next);
558	}
559
560	// Get total frequency
561	tf = 0;
562	for (next++; *next != ':'; next++) {
563	tf *= 10;
564	tf += (*next - '0');
565	}
566
567	// Get expansion frequency
568	ef = 0;
569	for (next++; *next != ':'; next++) {
570	ef *= 10;
571	ef += (*next - '0');
572	}
573
574	// Get document frequency
575	df = 0;
576	for (next++; *next != ':'; next++) {
577	df *= 10;
578	df += (*next - '0');
579	}
580	}
581
582	// Get all the data about a phrase
583	//
584	// The phrase is stored in textData as record phrase.
585	// We retrieve:
586	// word - the text of the phrase
587	// tf - the total frequency of the phrase
588	// ef - the expansion frequency of the phrase
589	// lf - the thesaurus link frequency of the phrase
590	// df - the document frequency of the phrase
591	// el - the list of phrases that are expansions of phrase
592	// ll - the list of phrases that are thesaurus links
593	// dl - the list of documents that contain phrase
594
595	void get_phrase_all_data(TextData &textdata, unsigned long phrase,
596	UCArray &word,
597	unsigned long &tf, unsigned long &ef,
598	unsigned long &lf, unsigned long &df,
599	vector <unsigned long> &el,
600	vector <unsigned long> &linkdest,
601	vector <UCArray> &linktype,
602	vector <unsigned long> &docnum,
603	vector <unsigned long> &docfrq) {
604	UCArray text;
605	UCArray docLevel;
606	SetCStr(docLevel, "Document");
607
608	// Look thwe word up in the textData
609	if (!GetDocText (textdata, docLevel, phrase, text)) {
610	FatalError (1, "Error while trying to get phrase %u", phrase);
611	}
612
613	// Ignore everything up to the first colon
614	UCArray::iterator next = text.begin();
615	while (*next++ != ':');
616
617	// ignore training cariage returns
618	while (text.back() == '\n') {
619	text.pop_back();
620	}
621
622	// Get the word
623	word.clear();
624	for (; *next != ':'; next++) {
625	word.push_back(*next);
626	}
627
628	// Get total frequency
629	tf = 0;
630	for (next++; *next != ':'; next++) {
631	tf *= 10;
632	tf += (*next - '0');
633	}
634
635	// Get expansion frequency
636	ef = 0;
637	for (next++; *next != ':'; next++) {
638	ef *= 10;
639	ef += (*next - '0');
640	}
641
642	// Get document frequency
643	df = 0;
644	for (next++; *next != ':'; next++) {
645	df *= 10;
646	df += (*next - '0');
647	}
648
649	// Get expansion list
650	el.clear();
651	unsigned long e = 0;
652	for (next++; *next != ':'; next++) {
653	if (*next == ',') {
654	el.push_back(e);
655	e = 0;
656	} else {
657	e *= 10;
658	e += (*next - '0');
659	}
660	}
661
662	// Get document list & the document frequency list
663	docnum.clear();
664	docfrq.clear();
665	bool readnum = false;
666	unsigned long d = 0;
667	for (next++; *next != ':'; next++) {
668	if (*next == ',') {
669	docnum.push_back(d);
670	readnum = true;
671	d = 0;
672	} else if (*next == ';') {
673	if (readnum) {
674	docfrq.push_back(d);
675	} else {
676	docnum.push_back(d);
677	docfrq.push_back(1);
678	}
679	readnum = false;
680	d = 0;
681	} else {
682	d *= 10;
683	d += (*next - '0');
684	}
685	}
686
687	// Get thesaurus link frequency & link list
688	text.push_back(':');
689	text.push_back(':');
690
691	// link frequency
692	lf = 0;
693	for (next++; *next != ':'; next++) {
694	lf *= 10;
695	lf += (*next - '0');
696	}
697
698	// two lists of link data
699	linkdest.clear();
700	linktype.clear();
701
702	UCArray thistype;
703	thistype.clear();
704	bool typedone = false;
705	unsigned long l = 0;
706	for (next++; *next != ':'; next++) {
707
708	if (!typedone) {
709	// first read the link type, a charactor string
710	if (*next == ',') {
711	typedone = true;
712	} else {
713	thistype.push_back(*next);
714	}
715	} else {
716	// having read the link type, read the list of link destinations
717	if (*next == ',') {
718	linkdest.push_back(l);
719	linktype.push_back(thistype);
720	l = 0;
721	} else if (*next == ';') {
722	linkdest.push_back(l);
723	linktype.push_back(thistype);
724	l = 0;
725	thistype.clear();
726	typedone = false;
727	} else {
728	l *= 10;
729	l += (*next - '0');
730	}
731	}
732	}
733	}
734
735	// Get all the data about a docment
736	//
737	// The document's detailes are stored in docData as record docNum.
738	// We retrieve:
739	// title - the document's title
740	// hash - the documnt's unique OID
741
742	void get_document_all_data(TextData &docdata, unsigned long docNum,
743	UCArray &title, UCArray &hash) {
744
745	UCArray text;
746	UCArray docLevel;
747	SetCStr(docLevel, "Document");
748
749	// Look the word up in the textData
750	if (!GetDocText (docdata, docLevel, docNum, text)) {
751	FatalError (1, "Error while trying to get document %u", docNum);
752	}
753
754	// Ignore everything up to the first colon
755	UCArray::iterator next = text.begin();
756	while (*next++ != '\t');
757
758	// Get the document OID (hash)
759	hash.clear();
760	for (; *next != '\t'; next++) {
761	hash.push_back(*next);
762	}
763
764	// Get the title
765	text.push_back('\n');
766	title.clear();
767	for (next++; *next != '\n'; next++) {
768	title.push_back(*next);
769	}
770	}
771
772
// Read the gsdlhome setting from the gsdlsite.cfg file
//
// gsdlsite.cfg is opened in the current working directory; if it cannot be
// opened a message is written to cerr and the program exits with status 1.
//
// For every line that starts with "gsdlhome", the rest of the line (after
// any spaces or tabs) is copied into a newly allocated, NUL-terminated
// string and stored in gsdlhome.  If there are several such lines the last
// one wins.  gsdlhome is left untouched if there is no such line.

void get_gsdlsite_parameters(char *&gsdlhome) {

  // open the file
  ifstream gsdl("gsdlsite.cfg", ios::in);
  if (!gsdl) {
    cerr << "File gsdlsite.cfg could not be opened\n";
    exit(1);
  }

  // read each line of the file; the loop ends as soon as a read fails, so
  // a stale buffer is never processed twice
  char buffer[2000];
  while (gsdl.getline(buffer, sizeof(buffer), '\n')) {

    // only the gsdlhome variable is of interest
    if (strncmp(buffer, "gsdlhome", 8) != 0) {
      continue;
    }

    // find the start of the gsdlhome string
    int len = strlen(buffer);
    int i = 8;
    while (i < len && (buffer[i] == ' ' || buffer[i] == '\t')) {
      i++;
    }

    // store the gsdlhome string, leaving room for the terminating NUL that
    // strncpy does not write when it copies exactly len-i characters
    gsdlhome = new char[len - i + 1];
    strncpy(gsdlhome, &(buffer[i]), len - i);
    gsdlhome[len - i] = '\0';
  }
}
802
803	void get_cgi_parameters(char &collection, char &classifier,
804	unsigned long &phrasenumber, UCArray &phrasetext,
805	unsigned long &first_e, unsigned long &last_e,
806	unsigned long &first_l, unsigned long &last_l,
807	unsigned long &first_d, unsigned long &last_d,
808	bool &XMLmode) {
809
810
811	// set the default parameters
812	phrasenumber = 0;
813	phrasetext.clear();
814	first_e = 0;
815	last_e = 10;
816	first_l = 0;
817	last_l = 10;
818	first_d = 0;
819	last_d = 10;
820
821	// get the query string
822	char *request_method_str = getenv("REQUEST_METHOD");
823	char *query_string = getenv("QUERY_STRING");
824	text_t query;
825
826	if (request_method_str != NULL
827	&& (strcmp(request_method_str, "GET") == 0)
828	&& query_string != NULL) {
829	// GET cgi args from querystring
830	query = query_string;
831
832	} else {
833	// debugging from command line
834	cout << "? " << endl;
835	char query_input[1024];
836	cin.get(query_input, 1024, '\n');
837	query = query_input;
838	}
839
840	// extract out the key=value pairs
841	text_t::iterator here = query.begin();
842	text_t::iterator end = query.end();
843	text_t key, value;
844
845	while (here != end) {
846	// get the next key and value pair
847	here = getdelimitstr (here, end, '=', key);
848	here = getdelimitstr (here, end, '&', value);
849
850	// store this key=value pair
851	if (!key.empty() && !value.empty()) {
852
853	// c: the collection name
854	if (key[0] == 'c') {
855	UCArray tmp;
856	toUCArray(value, tmp);
857	collection = GetCStr(tmp);
858	}
859
860	// d: the classifier number as string
861	if (key[0] == 'd') {
862	UCArray tmp;
863	toUCArray(value, tmp);
864	classifier = GetCStr(tmp);
865	}
866
867	// e: the first expansion number
868	else if (key[0] == 'e') {
869	first_e = toLongInt(value);
870	}
871
872	// f: the last expansion number
873	else if (key[0] == 'f') {
874	last_e = toLongInt(value);
875	}
876
877	// h: the first document number
878	else if (key[0] == 'h') {
879	first_d = toLongInt(value);
880	}
881
882	// i: the last document number
883	else if (key[0] == 'i') {
884	last_d = toLongInt(value);
885	}
886
887	// k: the first thesaurus list number
888	else if (key[0] == 'k') {
889	first_l = toLongInt(value);
890	}
891
892	// l: the last thesaurus list number
893	else if (key[0] == 'l') {
894	last_l = toLongInt(value);
895	}
896
897	// n: the phrase number
898	else if (key[0] == 'n') {
899	phrasenumber = toLongInt(value);
900	}
901
902	// p: the phrase text
903	else if (key[0] == 'p') {
904	decode_cgi_arg(value);
905	toUCArray(value, phrasetext);
906	}
907
908	// x: XML mode
909	else if (key[0] == 'x') {
910	XMLmode = true;
911	}
912
913	}
914	}
915
916	// if no classifier number is supplied, default to 1.
917	if (classifier == NULL) {
918	classifier = new (char)[2];
919	strcpy(classifier, "1");
920	}
921	}
922
923
924	// Convert %xx and + to their appropriate equivalents
925	//
926	// This function was copied from %GSDLHOME/src/recpt/cgiutils.cpp
927	// because it was much easier to copy it than to link against it.
928
// Give the numeric value (0-15) of one hexadecimal digit character.
//
// Both lower-case and upper-case letters are accepted.  A character that is
// not a hexadecimal digit is handed back unchanged.
static unsigned short hexdigit (unsigned short c) {
  unsigned short value = c;

  if ('0' <= c && c <= '9') {
    value = c - '0';
  } else if ('a' <= c && c <= 'f') {
    value = 10 + (c - 'a');
  } else if ('A' <= c && c <= 'F') {
    value = 10 + (c - 'A');
  }

  return value;
}
935
936	void decode_cgi_arg (text_t &argstr) {
937	text_t::iterator in = argstr.begin();
938	text_t::iterator out = in;
939	text_t::iterator end = argstr.end();
940
941	while (in != end) {
942	if (in == '+') out = ' ';
943
944	else if (*in == '%') {
945	unsigned short c = '%';
946	in++;
947	if (in != end) {
948	c = hexdigit (*in);
949	in++;
950	}
951	if (in != end && c < 16) { // sanity check on the previous character
952	c = c16 + hexdigit (in);
953	}
954
955	*out = c;
956	} else out = in;
957
958	if (in != end) in++;
959	out++;
960	}
961
962	// remove the excess characters
963	argstr.erase (out, end);
964	}
965
966
967	// Find the phrase number of a word in the index file
968
969	void find_phrase_number_from_word(char *basepath, UCArray &query, DocNumArray &result) {
970
971	// Open the index file for searching
972	IndexData indexData;
973	char indexfilename[FILENAME_MAX] = "pword";
974	if (!indexData.LoadData (basepath, indexfilename)) {
975	FatalError (1, "Couldn't load index information for \"%s\"", indexfilename);
976	}
977
978	// set up the query object
979	QueryInfo queryInfo;
980	SetCStr (queryInfo.docLevel, "Document");
981	queryInfo.maxDocs = 5;
982	queryInfo.sortByRank = true;
983	queryInfo.exactWeights = false;
984	queryInfo.needRankInfo = true;
985	queryInfo.needTermFreqs = true;
986
987	// mode 1 = casefolded, unstemmed search
988	QueryNode *queryTree = ParseQuery(query, 1, 1);
989
990	// cout << "-- query --" << endl;
991	// PrintNode (cout, queryTree);
992
993	// perform the query
994	ExtQueryResult queryResult;
995	MGQuery (indexData, queryInfo, queryTree, queryResult);
996	// cout << "-- word lookup result -- " << endl << queryResult << endl ;
997
998	result.clear();
999	result = queryResult.docs;
1000
1001	// delete the query
1002	if (queryTree != NULL) delete queryTree;
1003	}
1004
1005
1006
1007
// cgi_error
//
// Report a problem that stops the request from being served.  A complete
// response (content-type header plus a small XML or HTML page containing
// message) is written to standard output, then the program ends with exit
// status 0.  This function does not return.

void cgi_error(bool XMLmode, char *message) {

  if (!XMLmode) {
    // HTML error page
    cout << "Content-type: text/html" << endl << endl
         << "<html><head><title>phind error</title></head>" << endl
         << "<body>" << endl
         << "<p><h1>phind error</h1>"
         << "<p> An error occured processing your request: <p><b>"
         << message
         << "</b></body></html>" << endl;
  } else {
    // XML error document
    cout << "Content-type: text/plain" << endl << endl
         << "<phinddata>" << endl
         << "<phinderror>" << message << "</phinderror>" << endl
         << "</phinddata>" << endl;
  }

  exit(0);
}
1031
1032
1033	// split an expansion into prefix and suffix
1034
1035	void split_phrase(UCArray word, UCArray body, UCArray &prefix, UCArray &suffix) {
1036
1037	prefix.clear();
1038	suffix.clear();
1039
1040	bool readingPrefix = true;
1041	UCArray::iterator here = word.begin();
1042	UCArray::iterator end = word.end();
1043
1044	while (here != end) {
1045
1046	// if we've not read all the prefix, add the next char to the prefix
1047	if (readingPrefix) {
1048	if (phrase_match(body, here, end)) {
1049	readingPrefix = false;
1050	// trim whitespace from end of prefix & start of suffix
1051	if (!prefix.empty()) {
1052	prefix.pop_back();
1053	}
1054	if ((here != end) && (*here == ' ')) {
1055	here++;
1056	}
1057	} else {
1058	prefix.push_back(*here);
1059	here++;
1060	}
1061	}
1062	// if we've finished with the prefix, update the suffix
1063	else {
1064	suffix.push_back(*here);
1065	here++;
1066	}
1067	}
1068	}
1069
1070	// phrase_match
1071	//
1072	// compare two strings, one represented as an UCArray, the other as two
1073	// UCArray iterators.
1074	//
1075	// Return true if the UCArray is the same as the phrase the iterators point
1076	// to for the length of the UCArray.
1077
1078	bool phrase_match(UCArray text, UCArray::iterator &here, UCArray::iterator end) {
1079
1080	UCArray::iterator one_here = text.begin();
1081	UCArray::iterator one_end = text.end();
1082	UCArray::iterator two_here = here;
1083
1084	// iterate over the length of the first string, comparing each element to
1085	// the corresponding element in the second string.
1086	while (one_here != one_end) {
1087
1088	if (two_here == end) {
1089	return false;
1090	} else if (one_here != two_here) {
1091	return false;
1092	}
1093	one_here++;
1094	two_here++;
1095	}
1096
1097	here = two_here;
1098	return true;
1099	}
1100
1101
1102	// Convert from text_t format
1103	//
1104	// Conversions from text_t to other types
1105
1106	unsigned long toLongInt(text_t &value) {
1107
1108	unsigned long result = 0;
1109
1110	text_t::iterator here = value.begin();
1111	text_t::iterator end = value.end();
1112	while (here != end) {
1113	result *= 10;
1114	result += *here - '0';
1115	here++;
1116	}
1117
1118	return result;
1119	}
1120
1121	void toUCArray(text_t &in, UCArray &out) {
1122	out.clear();
1123	text_t::iterator here = in.begin();
1124	text_t::iterator end = in.end();
1125	while (here != end) {
1126	out.push_back((unsigned char) *here);
1127	here++;
1128	}
1129	}
1130

Note: See TracBrowser for help on using the repository browser.

Download in other formats: