Context Navigation

source: trunk/gsdl/src/phind/host/phindcgi.cpp@ 2487

Last change on this file since 2487 was 2487, checked in by sjboddie, 23 years ago
Changes to get phind working under windows
Property svn:keywords set to `Author Date Id Revision`
File size: 29.7 KB

Line
1	/**********************************************************************
2	*
3	* phindcgi.cpp -- cgi program to serve phind phrase hierarchies
4	*
5	* Copyright 2000 Gordon W. Paynter
6	* Copyright 2000 The New Zealand Digital Library Project
7	*
8	*
9	* A component of the Greenstone digital library software
10	* from the New Zealand Digital Library Project at the
11	* University of Waikato, New Zealand.
12	*
13	* This program is free software; you can redistribute it and/or modify
14	* it under the terms of the GNU General Public License as published by
15	* the Free Software Foundation; either version 2 of the License, or
16	* (at your option) any later version.
17	*
18	* This program is distributed in the hope that it will be useful,
19	* but WITHOUT ANY WARRANTY; without even the implied warranty of
20	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	* GNU General Public License for more details.
22	*
23	* You should have received a copy of the GNU General Public License
24	* along with this program; if not, write to the Free Software
25	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	*
27	*********************************************************************/
28
29	/*
30	* phindcgi.cpp
31	*
32	* The program itself reads a request for a phrase's data from the
33	* QUERY_STRING variable, looks up the phrase (if necessary) in the MGPP
34	* pword database, then looks up the phrase's characteristics in the MGPP
35	* pdata database, and reports output to STDOUT as crude HTML or XML.
36	*
37	*/
38
39	#if defined(GSDL_USE_IOS_H)
40	# include <fstream.h>
41	# include <iostream.h>
42	#else
43	# include <fstream>
44	# include <iostream>
45	#endif
46
47	#include <stdlib.h>
48	#include <stdio.h>
49	#include <assert.h>
50
51	#if defined(GSDL_USE_STL_H)
52	# if defined(GSDL_USE_ALGO_H)
53	# include <algo.h>
54	# else
55	# include <algorithm.h>
56	# endif
57	# include <vector.h>
58	#else
59	# include <algorithm>
60	# include <vector>
61	#endif
62
63	// Include MGPP functionality.
64	#include <TextGet.h>
65	#include <MGQuery.h>
66	#include <Terms.h>
67	#include <messages.h>
68	#include <GSDLQueryParser.h>
69
70	// Include GSDL's text_t object, which makes parsing cgi arguments easier.
71	#include "text_t.h"
72	#include "fileutil.h"
73	// Note that GSDL stores strings as text_t objects (vectors of 16-bit short int),
74	// while MGPP stores strings as UCArray objects (vectors of 8-bit unsigned char).
75
76
77
78	void get_gsdlsite_parameters(char *&gsdlhome);
79
80	void get_cgi_parameters(char &collection, char &classifier,
81	unsigned long &phrasenumber, UCArray &phrasetext,
82	unsigned long &first_e, unsigned long &last_e,
83	unsigned long &first_l, unsigned long &last_l,
84	unsigned long &first_d, unsigned long &last_d,
85	bool &XMLmode);
86
87	void decode_cgi_arg (text_t &argstr);
88
89	void print_expansions(char cgi_script, char collection, bool XMLmode, UCArray body,
90	TextData &textdata, vector <unsigned long> elist,
91	unsigned long first, unsigned long last);
92
93	void print_thesaurus_links(char cgi_script, char collection,
94	bool XMLmode, UCArray body, TextData &textdata,
95	vector <unsigned long> &linkdest,
96	vector <UCArray> &linktype,
97	unsigned long first, unsigned long last);
98
99	void print_documents(bool XMLmode, char basepath, char cgi_script,
100	char *collection,
101	vector <unsigned long> docNums,
102	vector <unsigned long> docFreq,
103	unsigned long first, unsigned long last);
104
105	void find_phrase_number_from_word(char *basepath, UCArray &query, DocNumArray &result);
106
107	void get_phrase_freq_data(TextData &textdata, unsigned long phrase,
108	UCArray &word, unsigned long &tf,
109	unsigned long &ef, unsigned long &df);
110
111	void get_phrase_all_data(TextData &textdata, unsigned long phrase,
112	UCArray &word,
113	unsigned long &tf, unsigned long &ef,
114	unsigned long &lf, unsigned long &df,
115	vector <unsigned long> &el,
116	vector <unsigned long> &linkdest,
117	vector <UCArray> &linktype,
118	vector <unsigned long> &docnum,
119	vector <unsigned long> &docfrq);
120
121	void split_phrase(UCArray word, UCArray body, UCArray &prefix, UCArray &suffix);
122	bool phrase_match(UCArray text, UCArray::iterator &here, UCArray::iterator end);
123
124	void get_document_all_data(TextData &docdata, unsigned long docNum,
125	UCArray &title, UCArray &hash);
126
127	void cgi_error(bool XMLmode, char *message);
128
129	void toUCArray(text_t &in, UCArray &out);
130	unsigned long toLongInt(text_t &value);
131
132
133
134	int main (int argc, char * argv[]) {
135
136
137	// the phrase to expand
138	unsigned long phrase = 0;
139	UCArray word;
140
141	// the frequency and occurances of the phrase
142	unsigned long tf;
143	vector <unsigned long> el, linkdest, docNums, docfreq;
144	vector <UCArray> linktype;
145
146	// the number of occurances to display
147	unsigned long ef, first_e, last_e, count_e,
148	lf, first_l, last_l, count_l,
149	df, first_d, last_d, count_d;
150
151	// are we in XML mode (as opposed to HTML mode)
152	bool XMLmode = false;
153
154	// Read the gsdlsite.cfg file
155	char *gsdlhome = NULL;
156	get_gsdlsite_parameters(gsdlhome);
157
158	if (gsdlhome == NULL) {
159	cgi_error(XMLmode, "GSDLHOME not set in gsdlsite.cfg file.");
160	}
161
162	// Get command-line parameters
163	char *collection = NULL;
164	char *classifier = NULL;
165	text_tmap param;
166	get_cgi_parameters(collection, classifier, phrase, word,
167	first_e, last_e, first_l, last_l, first_d, last_d, XMLmode);
168
169	if (collection == NULL) {
170	cgi_error(XMLmode, "No collection");
171	}
172
173	text_t basepathstr = filename_cat(gsdlhome, "collect", collection,
174	"index", text_t("phind") + classifier);
175
176	char *basepath = basepathstr.getcstr();
177
178	// If we don't know the phrase number, look itup
179	if (phrase == 0) {
180
181	if (word.empty()) {
182	cgi_error(XMLmode, "No phrase number or word.");
183	}
184
185	DocNumArray result;
186	find_phrase_number_from_word(basepath, word, result);
187
188	if (result.empty()) {
189	cgi_error(XMLmode, "The search term does not occur in the collection.");
190	exit(0);
191	} else {
192	phrase = result[0];
193	}
194	}
195
196	// Create a TextData object to read the phrase data (pdata)
197	TextData textdata;
198
199	text_t fullpath = filename_cat(basepath, "pdata");
200	char *fullpathc = fullpath.getcstr();
201	#if defined __WIN32__
202	char *base = "";
203	#else
204	char *base = "/";
205	#endif
206
207	if (!textdata.LoadData (base, fullpathc)) {
208	FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
209	}
210
211	delete fullpathc;
212
213	get_phrase_all_data(textdata, phrase, word, tf, ef, lf, df, el,
214	linkdest, linktype, docNums, docfreq);
215
216
217	// Output the header
218	if (XMLmode) {
219	cout << "Content-type: text/plain" << endl << endl
220	<< "<phinddata id=\"" << phrase
221	<< "\" text=\"" << word
222	<< "\" tf=\"" << tf
223	<< "\" ef=\"" << ef
224	<< "\" df=\"" << df
225	<< "\" lf=\"" << lf
226	<< "\">" << endl;
227	} else {
228	cout << "Content-type: text/html" << endl << endl
229	<< "<html><head><title>" << word << "</title></head>" << endl
230	<< "<body><center>" << endl
231	<< "<p><h1>" << word << "</h1>" << endl
232	<< "<p><b>"<< word << "</b> occurs "
233	<< tf << " times in " << df << " documents" << endl;
234	}
235
236
237	// Output the thesaurus links
238	if ((lf > 0) && (first_l < last_l)) {
239
240	// figure out the number of phrases to output
241	if (last_l > lf) {
242	last_l = lf;
243	}
244	count_l = last_l - first_l;
245
246	if (XMLmode) {
247	cout << "<thesauruslist length=\"" << lf
248	<< "\" start=\"" << first_l
249	<< "\" end=\"" << last_l << "\">" << endl;
250	print_thesaurus_links(argv[0], collection, XMLmode, word, textdata,
251	linkdest, linktype, first_l, last_l);
252	cout << "</thesauruslist>" << endl;
253	}
254
255	// output links as HTML
256	else {
257	if (count_l == lf) {
258	cout << "<p><b> " << count_l << " thesaurus links</b>" << endl;
259	} else {
260	cout << "<p><b>" << count_l << " of " << lf << " thesaurus links</b>" << endl;
261	}
262
263	cout << "<p><table border=1><tr><th>type</th><th>topic</th><th>freq</th><th>docs</th></tr>" << endl;
264	print_thesaurus_links(argv[0], collection, XMLmode, word, textdata,
265	linkdest, linktype, first_l, last_l);
266
267	cout << "</table>" << endl;
268
269	if (last_l < lf) {
270	if ((last_l + 10) < lf) {
271	cout << "<br><a href='" << argv[0]
272	<< "?c=" << collection
273	<< "&n=" << phrase
274	<< "&e=" << first_e
275	<< "&f=" << last_e
276	<< "&h=" << first_d
277	<< "&i=" << last_d
278	<< "&k=" << first_l
279	<< "&l=" << (last_l + 10)
280	<< "'>Get more thesaurus links</a>"
281	<< endl;
282	}
283	cout << "<br><a href='" << argv[0]
284	<< "?c=" << collection
285	<< "&n=" << phrase
286	<< "&e=" << first_e
287	<< "&f=" << last_e
288	<< "&h=" << first_d
289	<< "&i=" << last_d
290	<< "&k=" << first_l
291	<< "&l=" << lf
292	<< "'>Get every thesaurus link</a>"
293	<< endl;
294	}
295	}
296
297	}
298
299	// Output the expansions
300	if ((ef > 0) && (first_e < last_e)) {
301
302	// figure out the number of phrases to output
303	if (last_e > el.size()) {
304	last_e = el.size();
305	}
306	count_e = last_e - first_e;
307
308	// output expansions as XML
309	if (XMLmode) {
310	cout << "<expansionlist length=\"" << ef
311	<< "\" start=\"" << first_e
312	<< "\" end=\"" << last_e << "\">" << endl;
313
314	print_expansions(argv[0], collection, XMLmode, word, textdata, el, first_e, last_e);
315
316	cout << "</expansionlist>" << endl;
317	}
318
319	// output expansions as HTML
320	else {
321	if (count_e == el.size()) {
322	cout << "<p><b> " << count_e << " expansions</b>" << endl;
323	} else {
324	cout << "<p><b>" << count_e << " of " << ef << " expansions</b>" << endl;
325	}
326
327	cout << "<p><table border=1><tr><th colspan=3>phrase</th><th>freq</th><th>docs</th></tr>" << endl;
328	print_expansions(argv[0], collection, XMLmode, word, textdata, el, first_e, last_e);
329	cout << "</table>" << endl;
330
331	if (last_e < ef) {
332	if ((last_e + 10) < ef) {
333	cout << "<br><a href='" << argv[0]
334	<< "?c=" << collection
335	<< "&n=" << phrase
336	<< "&e=" << first_e
337	<< "&f=" << (last_e + 10)
338	<< "&h=" << first_d
339	<< "&i=" << last_d
340	<< "&k=" << first_l
341	<< "&l=" << last_l
342	<< "'>Get more expansions</a>"
343	<< endl;
344	}
345	cout << "<br><a href='" << argv[0]
346	<< "?c=" << collection
347	<< "&n=" << phrase
348	<< "&e=" << first_e
349	<< "&f=" << ef
350	<< "&h=" << first_d
351	<< "&i=" << last_d
352	<< "&k=" << first_l
353	<< "&l=" << last_l
354	<< "'>Get every expansion</a>"
355	<< endl;
356	}
357	}
358	}
359
360	// Output the document occurances
361	if ((df > 0) && (first_d < last_d)) {
362
363	// figure out the phrases to output
364	if (last_d > docNums.size()) {
365	last_d = docNums.size();
366	}
367	count_d = last_d - first_d;
368
369	// output document list as XML
370	if (XMLmode) {
371	cout << "<documentlist length=\"" << df
372	<< "\" start=\"" << first_d
373	<< "\" end=\"" << last_d << "\">" << endl;
374
375	print_documents(XMLmode, basepath, "library", collection,
376	docNums, docfreq, first_d, last_d);
377
378	cout << "</documentlist>" << endl;
379	}
380
381	// output document list as HTML
382	else {
383
384	if (count_d == docNums.size()) {
385	cout << "<p><b> " << count_d << " documents</b>" << endl;
386	} else {
387	cout << "<p><b>" << count_d << " of " << df << " documents</b>" << endl;
388	}
389
390	cout << "<p><table border=1><tr><th align=left>document</th><th>freq</th></tr>" << endl;
391	print_documents(XMLmode, basepath, "library", collection,
392	docNums, docfreq, first_d, last_d);
393	cout << "</table>" << endl;
394
395	if (last_d < df) {
396	if ((last_d + 10) < df) {
397	cout << "<br><a href='" << argv[0]
398	<< "?c=" << collection
399	<< "&n=" << phrase
400	<< "&e=" << first_e
401	<< "&f=" << last_e
402	<< "&h=" << first_d
403	<< "&i=" << (last_d + 10)
404	<< "&k=" << first_l
405	<< "&l=" << last_l
406	<< "'>Get more documents</a>" << endl;
407	}
408	cout << "<br><a href='" << argv[0]
409	<< "?c=" << collection
410	<< "&n=" << phrase
411	<< "&e=" << first_e
412	<< "&f=" << last_e
413	<< "&h=" << first_d
414	<< "&i=" << df
415	<< "&k=" << first_l
416	<< "&l=" << last_l
417	<< "'>Get every document</a>" << endl;
418	}
419	}
420	}
421
422	// Close the document
423	if (XMLmode) {
424	cout << "</phinddata>" << endl;
425	} else {
426	cout << "</center></body></html>" << endl;
427	}
428
429	textdata.UnloadData ();
430
431	delete basepath;
432
433	return 0;
434	}
435
436
437	// Print a list of expansions
438	//
439	// Given the textData and a list of phrase numbers, print out each of the
440	// expansions.
441
442	void print_expansions(char cgi_script, char collection, bool XMLmode, UCArray body,
443	TextData &textdata, vector <unsigned long> elist,
444	unsigned long first, unsigned long last) {
445
446	UCArray word;
447	unsigned long phrase, tf, df, ef;
448
449	UCArray suffix, prefix;
450
451	for (unsigned long e = first; e < last; e++) {
452
453	phrase = elist[e];
454	get_phrase_freq_data(textdata, phrase, word, tf, ef, df);
455
456	split_phrase(word, body, prefix, suffix);
457
458	if (XMLmode) {
459	// body is always the same as the text of the phrase, so no need to send it
460	cout << "<expansion num=\"" << e
461	<< "\" id=\"" << phrase
462	<< "\" tf=\"" << tf
463	<< "\" df=\"" << df;
464	if (!prefix.empty()) {
465	cout << "\" prefix=\"" << prefix;
466	}
467	if (!suffix.empty()) {
468	cout << "\" suffix=\"" << suffix;
469	}
470	cout << "\"/>" << endl;
471	} else {
472	cout << "<tr valign=top><td align=right><a href='" << cgi_script
473	<< "?c=" << collection << "&n=" << phrase << "'>" << prefix << "</a></td>"
474	<< "<td align=center><a href='" << cgi_script
475	<< "?c=" << collection << "&n=" << phrase << "'>" << body << "</a></td>"
476	<< "<td align=left><a href='" << cgi_script
477	<< "?c=" << collection << "&n=" << phrase << "'>" << suffix << "</a></td>"
478	<< "<td>" << tf << "</td><td>" << df << "</td></tr>" << endl;
479	}
480	}
481	}
482
483	void print_thesaurus_links(char cgi_script, char collection,
484	bool XMLmode, UCArray body, TextData &textdata,
485	vector <unsigned long> &linkdest,
486	vector <UCArray> &linktype,
487	unsigned long first, unsigned long last) {
488
489	// information describing each link in the list
490	unsigned long phrase, tf, ef, df;
491	UCArray type, text, newbody, suffix, prefix;
492
493	for (unsigned long l = first; l < last; l++) {
494
495	// get the phrase data
496	phrase = linkdest[l];
497	type = linktype[l];
498	get_phrase_freq_data(textdata, phrase, text, tf, ef, df);
499	// split_phrase(text, newbody, prefix, suffix);
500
501	if (XMLmode) {
502	cout << "<thesaurus num=\"" << l
503	<< "\" id=\"" << phrase
504	<< "\" tf=\"" << tf
505	<< "\" df=\"" << df
506	<< "\" type=\"" << type
507	<< "\" text=\"" << text
508	<< "\"/>" << endl;
509	} else {
510	cout << "<tr valign=top><td>" << type << "</td><td>"
511	<< "<a href='" << cgi_script << "?c=" << collection
512	<< "&n=" << phrase << "'>" << text << "</a>"
513	<< "</td><td>" << tf << "</td><td>" << df << "</td></tr>" << endl;
514	}
515	}
516	}
517
518
519	void print_documents(bool XMLmode, char basepath, char cgi_script, char *collection,
520	vector <unsigned long> docNums, vector <unsigned long> docFreq,
521	unsigned long first, unsigned long last) {
522
523	// _chdir(basepath);
524
525	// Create a TextData object to read the document data
526	TextData docdata;
527
528	text_t fullpath = filename_cat(basepath, "docs");
529	char *fullpathc = fullpath.getcstr();
530	#if defined __WIN32__
531	char *base = "";
532	#else
533	char *base = "/";
534	#endif
535
536	if (!docdata.LoadData (base, fullpathc)) {
537	FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
538	}
539
540	delete fullpathc;
541
542	UCArray title, hash;
543	unsigned long freq, doc;
544
545	for (unsigned long d = first; d < last; d++) {
546	doc = docNums[d];
547	freq = docFreq[d];
548
549	get_document_all_data(docdata, doc, title, hash);
550
551	if (XMLmode) {
552	cout << "<document num=\"" << d
553	<< "\" hash=\"" << hash
554	<< "\" freq=\"" << freq
555	<< "\" title=\"" << title << "\"/>" << endl;
556	} else {
557	cout << "<tr valign=top><td><a href='" << cgi_script << "?c=" << collection
558	<< "&a=d&d=" << hash << "'>" << title << "</a>"
559	<< "</td><td>" << freq << "</td></tr>"
560	<< endl;
561	}
562	}
563	}
564
565
566
567	// Get the frequency data about a phrase
568	//
569	// The phrase is stored in textData as record phrase.
570	// We retrieve:
571	// word - the text of the phrase
572	// tf - the total frequency of the phrase
573	// ef - the expansion frequency of the phrase
574	// df - the document frequency of the phrase
575
576	void get_phrase_freq_data(TextData &textdata, unsigned long phrase,
577	UCArray &word, unsigned long &tf,
578	unsigned long &ef, unsigned long &df) {
579
580	UCArray text;
581	UCArray docLevel;
582	SetCStr(docLevel, "Document");
583
584	// Look the word up in the textData
585	if (!GetDocText (textdata, docLevel, phrase, text)) {
586	FatalError (1, "Error while trying to get phrase %u", phrase);
587	}
588
589	// Ignore everything up to the first colon
590	UCArray::iterator next = text.begin();
591	while (*next++ != ':');
592
593	// Get the word
594	word.clear();
595	for (; *next != ':'; next++) {
596	word.push_back(*next);
597	}
598
599	// Get total frequency
600	tf = 0;
601	for (next++; *next != ':'; next++) {
602	tf *= 10;
603	tf += (*next - '0');
604	}
605
606	// Get expansion frequency
607	ef = 0;
608	for (next++; *next != ':'; next++) {
609	ef *= 10;
610	ef += (*next - '0');
611	}
612
613	// Get document frequency
614	df = 0;
615	for (next++; *next != ':'; next++) {
616	df *= 10;
617	df += (*next - '0');
618	}
619	}
620
621	// Get all the data about a phrase
622	//
623	// The phrase is stored in textData as record phrase.
624	// We retrieve:
625	// word - the text of the phrase
626	// tf - the total frequency of the phrase
627	// ef - the expansion frequency of the phrase
628	// lf - the thesaurus link frequency of the phrase
629	// df - the document frequency of the phrase
630	// el - the list of phrases that are expansions of phrase
631	// ll - the list of phrases that are thesaurus links
632	// dl - the list of documents that contain phrase
633
634	void get_phrase_all_data(TextData &textdata, unsigned long phrase,
635	UCArray &word,
636	unsigned long &tf, unsigned long &ef,
637	unsigned long &lf, unsigned long &df,
638	vector <unsigned long> &el,
639	vector <unsigned long> &linkdest,
640	vector <UCArray> &linktype,
641	vector <unsigned long> &docnum,
642	vector <unsigned long> &docfrq) {
643	UCArray text;
644	UCArray docLevel;
645	SetCStr(docLevel, "Document");
646
647	// Look thwe word up in the textData
648	if (!GetDocText (textdata, docLevel, phrase, text)) {
649	FatalError (1, "Error while trying to get phrase %u", phrase);
650	}
651
652	// Ignore everything up to the first colon
653	UCArray::iterator next = text.begin();
654	while (*next++ != ':');
655
656	// ignore training cariage returns
657	while (text.back() == '\n') {
658	text.pop_back();
659	}
660
661	// Get the word
662	word.clear();
663	for (; *next != ':'; next++) {
664	word.push_back(*next);
665	}
666
667	// Get total frequency
668	tf = 0;
669	for (next++; *next != ':'; next++) {
670	tf *= 10;
671	tf += (*next - '0');
672	}
673
674	// Get expansion frequency
675	ef = 0;
676	for (next++; *next != ':'; next++) {
677	ef *= 10;
678	ef += (*next - '0');
679	}
680
681	// Get document frequency
682	df = 0;
683	for (next++; *next != ':'; next++) {
684	df *= 10;
685	df += (*next - '0');
686	}
687
688	// Get expansion list
689	el.clear();
690	unsigned long e = 0;
691	for (next++; *next != ':'; next++) {
692	if (*next == ',') {
693	el.push_back(e);
694	e = 0;
695	} else {
696	e *= 10;
697	e += (*next - '0');
698	}
699	}
700
701	// Get document list & the document frequency list
702	docnum.clear();
703	docfrq.clear();
704	bool readnum = false;
705	unsigned long d = 0;
706	for (next++; *next != ':'; next++) {
707	if (*next == ',') {
708	docnum.push_back(d);
709	readnum = true;
710	d = 0;
711	} else if (*next == ';') {
712	if (readnum) {
713	docfrq.push_back(d);
714	} else {
715	docnum.push_back(d);
716	docfrq.push_back(1);
717	}
718	readnum = false;
719	d = 0;
720	} else {
721	d *= 10;
722	d += (*next - '0');
723	}
724	}
725
726	// Get thesaurus link frequency & link list
727	text.push_back(':');
728	text.push_back(':');
729
730	// link frequency
731	lf = 0;
732	for (next++; *next != ':'; next++) {
733	lf *= 10;
734	lf += (*next - '0');
735	}
736
737	// two lists of link data
738	linkdest.clear();
739	linktype.clear();
740
741	UCArray thistype;
742	thistype.clear();
743	bool typedone = false;
744	unsigned long l = 0;
745	for (next++; *next != ':'; next++) {
746
747	if (!typedone) {
748	// first read the link type, a charactor string
749	if (*next == ',') {
750	typedone = true;
751	} else {
752	thistype.push_back(*next);
753	}
754	} else {
755	// having read the link type, read the list of link destinations
756	if (*next == ',') {
757	linkdest.push_back(l);
758	linktype.push_back(thistype);
759	l = 0;
760	} else if (*next == ';') {
761	linkdest.push_back(l);
762	linktype.push_back(thistype);
763	l = 0;
764	thistype.clear();
765	typedone = false;
766	} else {
767	l *= 10;
768	l += (*next - '0');
769	}
770	}
771	}
772	}
773
774	// Get all the data about a docment
775	//
776	// The document's detailes are stored in docData as record docNum.
777	// We retrieve:
778	// title - the document's title
779	// hash - the documnt's unique OID
780
781	void get_document_all_data(TextData &docdata, unsigned long docNum,
782	UCArray &title, UCArray &hash) {
783
784	UCArray text;
785	UCArray docLevel;
786	SetCStr(docLevel, "Document");
787
788	// Look the word up in the textData
789	if (!GetDocText (docdata, docLevel, docNum, text)) {
790	FatalError (1, "Error while trying to get document %u", docNum);
791	}
792
793	// Ignore everything up to the first colon
794	UCArray::iterator next = text.begin();
795	while (*next++ != '\t');
796
797	// Get the document OID (hash)
798	hash.clear();
799	for (; *next != '\t'; next++) {
800	hash.push_back(*next);
801	}
802
803	// Get the title
804	text.push_back('\n');
805	title.clear();
806	for (next++; *next != '\n'; next++) {
807	title.push_back(*next);
808	}
809	}
810
811
// Read the gsdlhome setting from the gsdlsite.cfg file
//
// Opens "gsdlsite.cfg" in the current directory and looks for lines that
// start with "gsdlhome". The rest of such a line, after any spaces and
// tabs, is copied into a newly allocated, NUL-terminated string that is
// stored in gsdlhome; the caller owns it (release with delete []). If
// several lines match, the last one wins. If none match, gsdlhome is left
// untouched. If the file cannot be opened, a message is written to cerr
// and the program exits with status 1.

void get_gsdlsite_parameters(char *&gsdlhome) {

  // open the file
  ifstream gsdl("gsdlsite.cfg", ios::in);
  if (!gsdl) {
    cerr << "File gsdlsite.cfg could not be opened\n";
    exit(1);
  }

  // read each line of the file; testing the result of getline (rather
  // than eof()) also stops the loop when a line overflows the buffer
  char buffer[2000];
  char *found = NULL;
  while (gsdl.getline(buffer, sizeof(buffer), '\n')) {

    // only the gsdlhome variable is of interest
    if (strncmp(buffer, "gsdlhome", 8) != 0) {
      continue;
    }

    // find the start of the gsdlhome string
    size_t len = strlen(buffer);
    size_t i = 8;
    while (i < len && (buffer[i] == ' ' || buffer[i] == '\t')) {
      i++;
    }

    // store the gsdlhome string, replacing any earlier value
    delete [] found;
    found = new char[len - i + 1];
    strncpy(found, &(buffer[i]), len - i);
    found[len - i] = '\0';
  }

  if (found != NULL) {
    gsdlhome = found;
  }
}
841
842	void get_cgi_parameters(char &collection, char &classifier,
843	unsigned long &phrasenumber, UCArray &phrasetext,
844	unsigned long &first_e, unsigned long &last_e,
845	unsigned long &first_l, unsigned long &last_l,
846	unsigned long &first_d, unsigned long &last_d,
847	bool &XMLmode) {
848
849
850	// set the default parameters
851	phrasenumber = 0;
852	phrasetext.clear();
853	first_e = 0;
854	last_e = 10;
855	first_l = 0;
856	last_l = 10;
857	first_d = 0;
858	last_d = 10;
859
860	// get the query string
861	char *request_method_str = getenv("REQUEST_METHOD");
862	char *query_string = getenv("QUERY_STRING");
863	text_t query;
864
865	if (request_method_str != NULL
866	&& (strcmp(request_method_str, "GET") == 0)
867	&& query_string != NULL) {
868	// GET cgi args from querystring
869	query = query_string;
870
871	} else {
872	// debugging from command line
873	cout << "? " << endl;
874	char query_input[1024];
875	cin.get(query_input, 1024, '\n');
876	query = query_input;
877	}
878
879	// extract out the key=value pairs
880	text_t::iterator here = query.begin();
881	text_t::iterator end = query.end();
882	text_t key, value;
883
884	while (here != end) {
885	// get the next key and value pair
886	here = getdelimitstr (here, end, '=', key);
887	here = getdelimitstr (here, end, '&', value);
888
889	// store this key=value pair
890	if (!key.empty() && !value.empty()) {
891
892	// c: the collection name
893	if (key[0] == 'c') {
894	UCArray tmp;
895	toUCArray(value, tmp);
896	collection = GetCStr(tmp);
897	}
898
899	// d: the classifier number as string
900	if (key[0] == 'd') {
901	UCArray tmp;
902	toUCArray(value, tmp);
903	classifier = GetCStr(tmp);
904	}
905
906	// e: the first expansion number
907	else if (key[0] == 'e') {
908	first_e = toLongInt(value);
909	}
910
911	// f: the last expansion number
912	else if (key[0] == 'f') {
913	last_e = toLongInt(value);
914	}
915
916	// h: the first document number
917	else if (key[0] == 'h') {
918	first_d = toLongInt(value);
919	}
920
921	// i: the last document number
922	else if (key[0] == 'i') {
923	last_d = toLongInt(value);
924	}
925
926	// k: the first thesaurus list number
927	else if (key[0] == 'k') {
928	first_l = toLongInt(value);
929	}
930
931	// l: the last thesaurus list number
932	else if (key[0] == 'l') {
933	last_l = toLongInt(value);
934	}
935
936	// n: the phrase number
937	else if (key[0] == 'n') {
938	phrasenumber = toLongInt(value);
939	}
940
941	// p: the phrase text
942	else if (key[0] == 'p') {
943	decode_cgi_arg(value);
944	toUCArray(value, phrasetext);
945	}
946
947	// x: XML mode
948	else if (key[0] == 'x') {
949	XMLmode = true;
950	}
951
952	}
953	}
954
955	// if no classifier number is supplied, default to 1.
956	if (classifier == NULL) {
957	classifier = new char[2];
958	strcpy(classifier, "1");
959	}
960	}
961
962
963	// Convert %xx and + to their appropriate equivalents
964	//
965	// This function was copied from %GSDLHOME/src/recpt/cgiutils.cpp
966	// because it was much easier to copy it than to link against it.
967
// Map a hexadecimal digit character ('0'-'9', 'a'-'f', 'A'-'F') to its
// value 0-15. Any other character is handed back unchanged.
static unsigned short hexdigit (unsigned short c) {
  unsigned short digit = c;

  if ('0' <= c && c <= '9') {
    digit = c - '0';
  } else if ('a' <= c && c <= 'f') {
    digit = c - 'a' + 10;
  } else if ('A' <= c && c <= 'F') {
    digit = c - 'A' + 10;
  }

  return digit;
}
974
975	void decode_cgi_arg (text_t &argstr) {
976	text_t::iterator in = argstr.begin();
977	text_t::iterator out = in;
978	text_t::iterator end = argstr.end();
979
980	while (in != end) {
981	if (in == '+') out = ' ';
982
983	else if (*in == '%') {
984	unsigned short c = '%';
985	in++;
986	if (in != end) {
987	c = hexdigit (*in);
988	in++;
989	}
990	if (in != end && c < 16) { // sanity check on the previous character
991	c = c16 + hexdigit (in);
992	}
993
994	*out = c;
995	} else out = in;
996
997	if (in != end) in++;
998	out++;
999	}
1000
1001	// remove the excess characters
1002	argstr.erase (out, end);
1003	}
1004
1005
1006	// Find the phrase number of a word in the index file
1007
1008	void find_phrase_number_from_word(char *basepath, UCArray &query, DocNumArray &result) {
1009
1010	// Open the index file for searching
1011	IndexData indexData;
1012
1013	text_t fullpath = filename_cat(basepath, "pword");
1014	char *fullpathc = fullpath.getcstr();
1015	#if defined __WIN32__
1016	char *base = "";
1017	#else
1018	char *base = "/";
1019	#endif
1020
1021	if (!indexData.LoadData (base, fullpathc)) {
1022	FatalError (1, "Couldn't load index information for \"%s\"", fullpathc);
1023	}
1024
1025	delete fullpathc;
1026
1027	// set up the query object
1028	QueryInfo queryInfo;
1029	SetCStr (queryInfo.docLevel, "Document");
1030	queryInfo.maxDocs = 5;
1031	queryInfo.sortByRank = true;
1032	queryInfo.exactWeights = false;
1033	queryInfo.needRankInfo = true;
1034	queryInfo.needTermFreqs = true;
1035
1036	// mode 1 = casefolded, unstemmed search
1037	QueryNode *queryTree = ParseQuery(query, 1, 1);
1038
1039	// cout << "-- query --" << endl;
1040	// PrintNode (cout, queryTree);
1041
1042	// perform the query
1043	ExtQueryResult queryResult;
1044	MGQuery (indexData, queryInfo, queryTree, queryResult);
1045	// cout << "-- word lookup result -- " << endl << queryResult << endl ;
1046
1047	result.clear();
1048	result = queryResult.docs;
1049
1050	// delete the query
1051	if (queryTree != NULL) delete queryTree;
1052	}
1053
1054
1055
1056
// cgi_error
//
// Report a problem to the client and stop. The message is wrapped in a
// minimal <phinddata> document in XML mode, or in a small HTML page
// otherwise, preceded by the matching Content-type header. The program
// then ends with exit status 0, so this function never returns.

void cgi_error(bool XMLmode, char *message) {

  if (!XMLmode) {
    // HTML error page
    cout << "Content-type: text/html" << endl << endl;
    cout << "<html><head><title>phind error</title></head>" << endl;
    cout << "<body>" << endl;
    cout << "<p><h1>phind error</h1>"
         << "<p> An error occured processing your request: <p><b>"
         << message
         << "</b></body></html>" << endl;
  } else {
    // XML error document
    cout << "Content-type: text/plain" << endl << endl;
    cout << "<phinddata>" << endl;
    cout << "<phinderror>" << message << "</phinderror>" << endl;
    cout << "</phinddata>" << endl;
  }

  exit(0);
}
1080
1081
1082	// split an expansion into prefix and suffix
1083
1084	void split_phrase(UCArray word, UCArray body, UCArray &prefix, UCArray &suffix) {
1085
1086	prefix.clear();
1087	suffix.clear();
1088
1089	bool readingPrefix = true;
1090	UCArray::iterator here = word.begin();
1091	UCArray::iterator end = word.end();
1092
1093	while (here != end) {
1094
1095	// if we've not read all the prefix, add the next char to the prefix
1096	if (readingPrefix) {
1097	if (phrase_match(body, here, end)) {
1098	readingPrefix = false;
1099	// trim whitespace from end of prefix & start of suffix
1100	if (!prefix.empty()) {
1101	prefix.pop_back();
1102	}
1103	if ((here != end) && (*here == ' ')) {
1104	here++;
1105	}
1106	} else {
1107	prefix.push_back(*here);
1108	here++;
1109	}
1110	}
1111	// if we've finished with the prefix, update the suffix
1112	else {
1113	suffix.push_back(*here);
1114	here++;
1115	}
1116	}
1117	}
1118
1119	// phrase_match
1120	//
1121	// compare two strings, one represented as an UCArray, the other as two
1122	// UCArray iterators.
1123	//
1124	// Return true if the UCArray is the same as the phrase the iterators point
1125	// to for the length of the UCArray.
1126
1127	bool phrase_match(UCArray text, UCArray::iterator &here, UCArray::iterator end) {
1128
1129	UCArray::iterator one_here = text.begin();
1130	UCArray::iterator one_end = text.end();
1131	UCArray::iterator two_here = here;
1132
1133	// iterate over the length of the first string, comparing each element to
1134	// the corresponding element in the second string.
1135	while (one_here != one_end) {
1136
1137	if (two_here == end) {
1138	return false;
1139	} else if (one_here != two_here) {
1140	return false;
1141	}
1142	one_here++;
1143	two_here++;
1144	}
1145
1146	here = two_here;
1147	return true;
1148	}
1149
1150
1151	// Convert from text_t format
1152	//
1153	// Conversions from text_t to other types
1154
1155	unsigned long toLongInt(text_t &value) {
1156
1157	unsigned long result = 0;
1158
1159	text_t::iterator here = value.begin();
1160	text_t::iterator end = value.end();
1161	while (here != end) {
1162	result *= 10;
1163	result += *here - '0';
1164	here++;
1165	}
1166
1167	return result;
1168	}
1169
1170	void toUCArray(text_t &in, UCArray &out) {
1171	out.clear();
1172	text_t::iterator here = in.begin();
1173	text_t::iterator end = in.end();
1174	while (here != end) {
1175	out.push_back((unsigned char) *here);
1176	here++;
1177	}
1178	}
1179

Note: See TracBrowser for help on using the repository browser.

Download in other formats: