Context Navigation

source: trunk/gsdl/src/recpt/phindaction.cpp@ 3036

Last change on this file since 3036 was 2542, checked in by sjboddie, 23 years ago
*** empty log message ***
Property svn:keywords set to `Author Date Id Revision`
File size: 26.0 KB

Line
1	/**********************************************************************
2	*
3	* phindaction.cpp --
4	*
5	* Copyright 2001 Gordon W. Paynter
6	* Copyright 2001 The New Zealand Digital Library Project
7	*
8	* A component of the Greenstone digital library software
9	* from the New Zealand Digital Library Project at the
10	* University of Waikato, New Zealand.
11	*
12	* This program is free software; you can redistribute it and/or modify
13	* it under the terms of the GNU General Public License as published by
14	* the Free Software Foundation; either version 2 of the License, or
15	* (at your option) any later version.
16	*
17	* This program is distributed in the hope that it will be useful,
18	* but WITHOUT ANY WARRANTY; without even the implied warranty of
19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	* GNU General Public License for more details.
21	*
22	* You should have received a copy of the GNU General Public License
23	* along with this program; if not, write to the Free Software
24	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	*
26	*********************************************************************/
27
28	// Note that this action uses mgpp to retrieve phind info, calling MGQuery
29	// etc. directly, not through the protocol. This breaks our receptionist -
30	// collection server separation and should be fixed some day I guess.
31
32	#include "phindaction.h"
33	#include "fileutil.h"
34
35	phindaction::phindaction () {
36
37	cgiarginfo arg_ainfo;
38
39	arg_ainfo.shortname = "pc";
40	arg_ainfo.longname = "phind classifier";
41	arg_ainfo.multiplechar = true;
42	arg_ainfo.defaultstatus = cgiarginfo::weak;
43	arg_ainfo.argdefault = "";
44	arg_ainfo.savedarginfo = cgiarginfo::mustnot;
45	argsinfo.addarginfo (NULL, arg_ainfo);
46
47	arg_ainfo.shortname = "pxml";
48	arg_ainfo.longname = "phind XML mode";
49	arg_ainfo.multiplechar = false;
50	arg_ainfo.defaultstatus = cgiarginfo::weak;
51	arg_ainfo.argdefault = "0";
52	arg_ainfo.savedarginfo = cgiarginfo::mustnot;
53	argsinfo.addarginfo (NULL, arg_ainfo);
54
55	arg_ainfo.shortname = "ppnum";
56	arg_ainfo.longname = "phind phrase number";
57	arg_ainfo.multiplechar = true;
58	arg_ainfo.defaultstatus = cgiarginfo::weak;
59	arg_ainfo.argdefault = "0";
60	arg_ainfo.savedarginfo = cgiarginfo::mustnot;
61	argsinfo.addarginfo (NULL, arg_ainfo);
62
63	arg_ainfo.shortname = "pptext";
64	arg_ainfo.longname = "phind phrase text";
65	arg_ainfo.multiplechar = true;
66	arg_ainfo.defaultstatus = cgiarginfo::weak;
67	arg_ainfo.argdefault = "";
68	arg_ainfo.savedarginfo = cgiarginfo::mustnot;
69	argsinfo.addarginfo (NULL, arg_ainfo);
70
71	arg_ainfo.shortname = "pfe";
72	arg_ainfo.longname = "phind first_e";
73	arg_ainfo.multiplechar = true;
74	arg_ainfo.defaultstatus = cgiarginfo::weak;
75	arg_ainfo.argdefault = "0";
76	arg_ainfo.savedarginfo = cgiarginfo::mustnot;
77	argsinfo.addarginfo (NULL, arg_ainfo);
78
79	arg_ainfo.shortname = "ple";
80	arg_ainfo.longname = "phind last_e";
81	arg_ainfo.multiplechar = true;
82	arg_ainfo.defaultstatus = cgiarginfo::weak;
83	arg_ainfo.argdefault = "10";
84	arg_ainfo.savedarginfo = cgiarginfo::mustnot;
85	argsinfo.addarginfo (NULL, arg_ainfo);
86
87	arg_ainfo.shortname = "pfl";
88	arg_ainfo.longname = "phind first_l";
89	arg_ainfo.multiplechar = true;
90	arg_ainfo.defaultstatus = cgiarginfo::weak;
91	arg_ainfo.argdefault = "0";
92	arg_ainfo.savedarginfo = cgiarginfo::mustnot;
93	argsinfo.addarginfo (NULL, arg_ainfo);
94
95	arg_ainfo.shortname = "pll";
96	arg_ainfo.longname = "phind last_l";
97	arg_ainfo.multiplechar = true;
98	arg_ainfo.defaultstatus = cgiarginfo::weak;
99	arg_ainfo.argdefault = "10";
100	arg_ainfo.savedarginfo = cgiarginfo::mustnot;
101	argsinfo.addarginfo (NULL, arg_ainfo);
102
103	arg_ainfo.shortname = "pfd";
104	arg_ainfo.longname = "phind first_d";
105	arg_ainfo.multiplechar = true;
106	arg_ainfo.defaultstatus = cgiarginfo::weak;
107	arg_ainfo.argdefault = "0";
108	arg_ainfo.savedarginfo = cgiarginfo::mustnot;
109	argsinfo.addarginfo (NULL, arg_ainfo);
110
111	arg_ainfo.shortname = "pld";
112	arg_ainfo.longname = "phind last_d";
113	arg_ainfo.multiplechar = true;
114	arg_ainfo.defaultstatus = cgiarginfo::weak;
115	arg_ainfo.argdefault = "10";
116	arg_ainfo.savedarginfo = cgiarginfo::mustnot;
117	argsinfo.addarginfo (NULL, arg_ainfo);
118	}
119
120	phindaction::~phindaction () {
121	}
122
123	void phindaction::get_cgihead_info (cgiargsclass &/args/, recptprotolistclass * /protos/,
124	response_t &response,text_t &response_data,
125	ostream &/logout/) {
126	response = content;
127	response_data = "text/html";
128	}
129
130	bool phindaction::do_action (cgiargsclass &args, recptprotolistclass *protos,
131	browsermapclass * /browsers/, displayclass &disp,
132	outconvertclass &outconvert, ostream &textout,
133	ostream &logout) {
134
135	unsigned long count_l, count_e, count_d;
136	unsigned long phrase = args["ppnum"].getulong();
137	text_t &word = args["pptext"];
138	unsigned long first_e = args["pfe"].getulong();
139	unsigned long last_e = args["ple"].getulong();
140	unsigned long first_l = args["pfl"].getulong();
141	unsigned long last_l = args["pll"].getulong();
142	unsigned long first_d = args["pfd"].getulong();
143	unsigned long last_d = args["pld"].getulong();
144	bool XMLmode = false;
145	if (args["pxml"] == "1") XMLmode = true;
146
147	// must have a valid collection server
148	recptproto *collectproto = protos->getrecptproto (args["c"], logout);
149	if (collectproto == NULL) {
150	output_error("phindaction: ERROR: collection not set", textout,
151	outconvert, disp, logout, XMLmode);
152	return true;
153	}
154
155	// the frequency and occurances of the phrase
156	unsigned long tf;
157	vector <unsigned long> el, linkdest, docNums, docfreq;
158	vector <UCArray> linktype;
159
160	// the number of occurances to display
161	unsigned long ef, lf, df;
162
163	text_t basepath = filename_cat(gsdlhome, "collect", args["c"],
164	"index", "phind" + args["pc"]);
165
166	// If we don't know the phrase number, look it up
167	if (phrase == 0) {
168
169	if (word.empty()) {
170	output_error("phindaction: ERROR: no phrase number or word", textout,
171	outconvert, disp, logout, XMLmode);
172	return true;
173	}
174
175	DocNumArray result;
176	find_phrase_number_from_word(basepath, word, result);
177
178	if (result.empty()) {
179	output_error("phindaction: The search term does not occur in the collection",
180	textout, outconvert, disp, logout, XMLmode);
181	return true;
182	} else {
183	phrase = result[0];
184	}
185	}
186
187	// Create a TextData object to read the phrase data (pdata)
188	TextData textdata;
189
190	text_t fullpath = filename_cat(basepath, "pdata");
191	char *fullpathc = fullpath.getcstr();
192	#if defined __WIN32__
193	char *base = "";
194	#else
195	char *base = "/";
196	#endif
197
198	if (!textdata.LoadData (base, fullpathc)) {
199	// FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
200	exit (0);
201	}
202
203	delete fullpathc;
204
205	get_phrase_all_data(textdata, phrase, word, tf, ef, lf, df, el,
206	linkdest, linktype, docNums, docfreq);
207
208	// Output the header
209	if (XMLmode) {
210	textout << "Content-type: text/plain\n\n"
211	<< "<phinddata id=\"" << phrase
212	<< "\" text=\"" << word
213	<< "\" tf=\"" << tf
214	<< "\" ef=\"" << ef
215	<< "\" df=\"" << df
216	<< "\" lf=\"" << lf
217	<< "\">\n";
218	} else {
219	textout << "Content-type: text/html\n\n"
220	<< "<html><head><title>" << word << "</title></head>\n"
221	<< "<body><center>\n"
222	<< "<p><h1>" << word << "</h1>\n"
223	<< "<p><b>"<< word << "</b> occurs "
224	<< tf << " times in " << df << " documents\n";
225	}
226
227	// Output the thesaurus links
228	if ((lf > 0) && (first_l < last_l)) {
229
230	// figure out the number of phrases to output
231	if (last_l > lf) {
232	last_l = lf;
233	}
234	count_l = last_l - first_l;
235
236	if (XMLmode) {
237	textout << "<thesauruslist length=\"" << lf
238	<< "\" start=\"" << first_l
239	<< "\" end=\"" << last_l << "\">\n";
240	print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
241	first_l, last_l, disp, outconvert, textout);
242	textout << "</thesauruslist>\n";
243	}
244
245	// output links as HTML
246	else {
247	if (count_l == lf) {
248	textout << "<p><b> " << count_l << " thesaurus links</b>\n";
249	} else {
250	textout << "<p><b>" << count_l << " of " << lf << " thesaurus links</b>\n";
251	}
252
253	textout << "<p><table border=1><tr><th>type</th><th>topic</th><th>freq</th><th>docs</th></tr>\n";
254	print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
255	first_l, last_l, disp, outconvert, textout);
256	textout << "</table>\n";
257
258	if (last_l < lf) {
259	if ((last_l + 10) < lf) {
260	textout << outconvert << disp
261	<< "<br><a href=\"_gwcgi_?"
262	<< "c=" << args["c"]
263	<< "&ppnum=" << phrase
264	<< "&pfe=" << first_e
265	<< "&ple=" << last_e
266	<< "&pfd=" << first_d
267	<< "&pld=" << last_d
268	<< "&pfl=" << first_l
269	<< "&pll=" << (last_l + 10)
270	<< "\">Get more thesaurus links</a>\n";
271	}
272	textout << outconvert << disp
273	<< "<br><a href=\"_gwcgi_?"
274	<< "c=" << args["c"]
275	<< "&ppnum=" << phrase
276	<< "&pfe=" << first_e
277	<< "&ple=" << last_e
278	<< "&pfd=" << first_d
279	<< "&pld=" << last_d
280	<< "&pfl=" << first_l
281	<< "&pll=" << lf
282	<< "\">Get every thesaurus link</a>\n" ;
283	}
284	}
285	}
286
287	// Output the expansions
288	if ((ef > 0) && (first_e < last_e)) {
289
290	// figure out the number of phrases to output
291	if (last_e > el.size()) {
292	last_e = el.size();
293	}
294	count_e = last_e - first_e;
295
296	// output expansions as XML
297	if (XMLmode) {
298	textout << "<expansionlist length=\"" << ef
299	<< "\" start=\"" << first_e
300	<< "\" end=\"" << last_e << "\">" << endl;
301
302	print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
303	last_e, disp, outconvert, textout);
304
305	textout << "</expansionlist>\n";
306	}
307
308	// output expansions as HTML
309	else {
310	if (count_e == el.size()) {
311	textout << "<p><b> " << count_e << " expansions</b>\n";
312	} else {
313	textout << "<p><b>" << count_e << " of " << ef << " expansions</b>\n";
314	}
315
316	textout << "<p><table border=1><tr><th colspan=3>phrase</th><th>freq</th><th>docs</th></tr>\n";
317	print_expansions(args["c"], XMLmode, word, textdata, el, first_e,
318	last_e, disp, outconvert, textout);
319	textout << "</table>\n";
320
321	if (last_e < ef) {
322	if ((last_e + 10) < ef) {
323	textout << outconvert << disp
324	<< "<br><a href=\"_gwcgi_?"
325	<< "c=" << args["c"]
326	<< "&ppnum=" << phrase
327	<< "&pfe=" << first_e
328	<< "&ple=" << (last_e + 10)
329	<< "&pfd=" << first_d
330	<< "&pld=" << last_d
331	<< "&pfl=" << first_l
332	<< "&pll=" << last_l
333	<< "\">Get more expansions</a>\n";
334	}
335	textout << outconvert << disp
336	<< "<br><a href=\"_gwcgi_?"
337	<< "c=" << args["c"]
338	<< "&ppnum=" << phrase
339	<< "&pfe=" << first_e
340	<< "&ple=" << ef
341	<< "&pfd=" << first_d
342	<< "&pld=" << last_d
343	<< "&pfl=" << first_l
344	<< "&pll=" << last_l
345	<< "\">Get every expansion</a>\n";
346	}
347	}
348	}
349
350	// Output the document occurances
351	if ((df > 0) && (first_d < last_d)) {
352
353	// figure out the phrases to output
354	if (last_d > docNums.size()) {
355	last_d = docNums.size();
356	}
357	count_d = last_d - first_d;
358
359	// output document list as XML
360	if (XMLmode) {
361	textout << "<documentlist length=\"" << df
362	<< "\" start=\"" << first_d
363	<< "\" end=\"" << last_d << "\">\n";
364
365	print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
366	first_d, last_d, disp, outconvert, textout);
367
368	textout << "</documentlist>\n";
369	}
370
371	// output document list as HTML
372	else {
373
374	if (count_d == docNums.size()) {
375	textout << "<p><b> " << count_d << " documents</b>\n";
376	} else {
377	textout << "<p><b>" << count_d << " of " << df << " documents</b>\n";
378	}
379
380	textout << "<p><table border=1><tr><th align=left>document</th><th>freq</th></tr>\n";
381	print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
382	first_d, last_d, disp, outconvert, textout);
383	textout << "</table>\n";
384
385	if (last_d < df) {
386	if ((last_d + 10) < df) {
387	textout << outconvert << disp
388	<< "<br><a href=\"_gwcgi_?"
389	<< "c=" << args["c"]
390	<< "&ppnum=" << phrase
391	<< "&pfe=" << first_e
392	<< "&ple=" << last_e
393	<< "&pfd=" << first_d
394	<< "&pld=" << (last_d + 10)
395	<< "&pfl=" << first_l
396	<< "&pll=" << last_l
397	<< "\">Get more documents</a>\n";
398	}
399	textout << outconvert << disp
400	<< "<br><a href=\"_gwcgi_?"
401	<< "c=" << args["c"]
402	<< "&ppnum=" << phrase
403	<< "&pfe=" << first_e
404	<< "&ple=" << last_e
405	<< "&pfd=" << first_d
406	<< "&pld=" << df
407	<< "&pfl=" << first_l
408	<< "&pll=" << last_l
409	<< "\">Get every document</a>\n";
410	}
411	}
412	}
413
414	// Close the document
415	if (XMLmode) {
416	textout << "</phinddata>\n";
417	} else {
418	textout << "</center></body></html>\n";
419	}
420
421	textdata.UnloadData ();
422
423	return true;
424	}
425
426	// Find the phrase number of a word in the index file
427	void phindaction::find_phrase_number_from_word(const text_t &basepath,
428	const text_t &query,
429	DocNumArray &result) {
430
431	// Open the index file for searching
432	IndexData indexData;
433
434	text_t fullpath = filename_cat(basepath, "pword");
435	char *fullpathc = fullpath.getcstr();
436	#if defined __WIN32__
437	char *base = "";
438	#else
439	char *base = "/";
440	#endif
441
442	if (!indexData.LoadData (base, fullpathc)) {
443	// FatalError (1, "Couldn't load index information for \"%s\"", fullpathc);
444	exit (0);
445	}
446
447	delete fullpathc;
448
449	// set up the query object
450	QueryInfo queryInfo;
451	SetCStr (queryInfo.docLevel, "Document");
452	queryInfo.maxDocs = 5;
453	queryInfo.sortByRank = true;
454	queryInfo.exactWeights = false;
455	queryInfo.needRankInfo = true;
456	queryInfo.needTermFreqs = true;
457
458	// mode 1 = casefolded, unstemmed search
459	UCArray ucquery;
460	toUCArray(query, ucquery);
461	QueryNode *queryTree = ParseQuery(ucquery, 1, 1);
462
463	// perform the query
464	ExtQueryResult queryResult;
465	MGQuery (indexData, queryInfo, queryTree, queryResult);
466	// cout << "-- word lookup result -- " << endl << queryResult << endl ;
467
468	result.clear();
469	result = queryResult.docs;
470
471	// delete the query
472	if (queryTree != NULL) delete queryTree;
473
474	indexData.UnloadData();
475	}
476
477	// Get all the data about a phrase
478	//
479	// The phrase is stored in textData as record phrase.
480	// We retrieve:
481	// word - the text of the phrase
482	// tf - the total frequency of the phrase
483	// ef - the expansion frequency of the phrase
484	// lf - the thesaurus link frequency of the phrase
485	// df - the document frequency of the phrase
486	// el - the list of phrases that are expansions of phrase
487	// ll - the list of phrases that are thesaurus links
488	// dl - the list of documents that contain phrase
489	void phindaction::get_phrase_all_data(TextData &textdata, unsigned long phrase,
490	text_t &word, unsigned long &tf, unsigned long &ef,
491	unsigned long &lf, unsigned long &df,
492	vector <unsigned long> &el,
493	vector <unsigned long> &linkdest,
494	vector <UCArray> &linktype,
495	vector <unsigned long> &docnum,
496	vector <unsigned long> &docfrq) {
497	UCArray text;
498	UCArray docLevel;
499	SetCStr(docLevel, "Document");
500
501	// Look the word up in the textData
502	if (!GetDocText (textdata, docLevel, phrase, text)) {
503	// FatalError (1, "Error while trying to get phrase %u", phrase);
504	exit (0);
505	}
506
507	// Ignore everything up to the first colon
508	UCArray::iterator next = text.begin();
509	while (*next++ != ':');
510
511	// ignore training carriage returns
512	while (text.back() == '\n') {
513	text.pop_back();
514	}
515
516	// Get the word
517	word.clear();
518	for (; *next != ':'; next++) {
519	word.push_back(*next);
520	}
521
522	// Get total frequency
523	tf = 0;
524	for (next++; *next != ':'; next++) {
525	tf *= 10;
526	tf += (*next - '0');
527	}
528
529	// Get expansion frequency
530	ef = 0;
531	for (next++; *next != ':'; next++) {
532	ef *= 10;
533	ef += (*next - '0');
534	}
535
536	// Get document frequency
537	df = 0;
538	for (next++; *next != ':'; next++) {
539	df *= 10;
540	df += (*next - '0');
541	}
542
543	// Get expansion list
544	el.clear();
545	unsigned long e = 0;
546	for (next++; *next != ':'; next++) {
547	if (*next == ',') {
548	el.push_back(e);
549	e = 0;
550	} else {
551	e *= 10;
552	e += (*next - '0');
553	}
554	}
555
556	// Get document list & the document frequency list
557	docnum.clear();
558	docfrq.clear();
559	bool readnum = false;
560	unsigned long d = 0;
561	for (next++; *next != ':'; next++) {
562	if (*next == ',') {
563	docnum.push_back(d);
564	readnum = true;
565	d = 0;
566	} else if (*next == ';') {
567	if (readnum) {
568	docfrq.push_back(d);
569	} else {
570	docnum.push_back(d);
571	docfrq.push_back(1);
572	}
573	readnum = false;
574	d = 0;
575	} else {
576	d *= 10;
577	d += (*next - '0');
578	}
579	}
580
581	// Get thesaurus link frequency & link list
582	text.push_back(':');
583	text.push_back(':');
584
585	// link frequency
586	lf = 0;
587	for (next++; *next != ':'; next++) {
588	lf *= 10;
589	lf += (*next - '0');
590	}
591
592	// two lists of link data
593	linkdest.clear();
594	linktype.clear();
595
596	UCArray thistype;
597	thistype.clear();
598	bool typedone = false;
599	unsigned long l = 0;
600	for (next++; *next != ':'; next++) {
601
602	if (!typedone) {
603	// first read the link type, a charactor string
604	if (*next == ',') {
605	typedone = true;
606	} else {
607	thistype.push_back(*next);
608	}
609	} else {
610	// having read the link type, read the list of link destinations
611	if (*next == ',') {
612	linkdest.push_back(l);
613	linktype.push_back(thistype);
614	l = 0;
615	} else if (*next == ';') {
616	linkdest.push_back(l);
617	linktype.push_back(thistype);
618	l = 0;
619	thistype.clear();
620	typedone = false;
621	} else {
622	l *= 10;
623	l += (*next - '0');
624	}
625	}
626	}
627	}
628
629	void phindaction::print_thesaurus_links(const text_t &collection, bool XMLmode,
630	TextData &textdata, vector <unsigned long> &linkdest,
631	vector <UCArray> &linktype, unsigned long first,
632	unsigned long last, displayclass &disp,
633	outconvertclass &outconvert, ostream &textout) {
634
635	// information describing each link in the list
636	unsigned long phrase, tf, ef, df;
637	UCArray type, text;
638
639	for (unsigned long l = first; l < last; l++) {
640
641	// get the phrase data
642	phrase = linkdest[l];
643	type = linktype[l];
644	get_phrase_freq_data(textdata, phrase, text, tf, ef, df);
645
646	if (XMLmode) {
647	textout << "<thesaurus num=\"" << l
648	<< "\" id=\"" << phrase
649	<< "\" tf=\"" << tf
650	<< "\" df=\"" << df
651	<< "\" type=\"" << type
652	<< "\" text=\"" << text
653	<< "\"/>\n";
654	} else {
655	textout << "<tr valign=top><td>" << type << "</td><td>";
656	textout << outconvert << disp
657	<< "<a href=\"_gwcgi_?c=" << collection;
658	textout << "&ppnum=" << phrase << "\">" << text << "</a>"
659	<< "</td><td>" << tf << "</td><td>" << df << "</td></tr>\n";
660	}
661	}
662	}
663
664	// Get the frequency data about a phrase
665	//
666	// The phrase is stored in textData as record phrase.
667	// We retrieve:
668	// word - the text of the phrase
669	// tf - the total frequency of the phrase
670	// ef - the expansion frequency of the phrase
671	// df - the document frequency of the phrase
672	void phindaction::get_phrase_freq_data(TextData &textdata, unsigned long phrase,
673	UCArray &word, unsigned long &tf,
674	unsigned long &ef, unsigned long &df) {
675
676	UCArray text;
677	UCArray docLevel;
678	SetCStr(docLevel, "Document");
679
680	// Look the word up in the textData
681	if (!GetDocText (textdata, docLevel, phrase, text)) {
682	// FatalError (1, "Error while trying to get phrase %u", phrase);
683	exit (0);
684	}
685
686	// Ignore everything up to the first colon
687	UCArray::iterator next = text.begin();
688	while (*next++ != ':');
689
690	// Get the word
691	word.clear();
692	for (; *next != ':'; next++) {
693	word.push_back(*next);
694	}
695
696	// Get total frequency
697	tf = 0;
698	for (next++; *next != ':'; next++) {
699	tf *= 10;
700	tf += (*next - '0');
701	}
702
703	// Get expansion frequency
704	ef = 0;
705	for (next++; *next != ':'; next++) {
706	ef *= 10;
707	ef += (*next - '0');
708	}
709
710	// Get document frequency
711	df = 0;
712	for (next++; *next != ':'; next++) {
713	df *= 10;
714	df += (*next - '0');
715	}
716	}
717
718	// Print a list of expansions
719	//
720	// Given the textData and a list of phrase numbers, print out each of the
721	// expansions.
722	void phindaction::print_expansions(const text_t &collection, bool XMLmode,
723	const text_t &body, TextData &textdata,
724	const vector <unsigned long> &elist,
725	unsigned long first, unsigned long last,
726	displayclass &disp, outconvertclass &outconvert,
727	ostream &textout) {
728
729	UCArray word;
730	unsigned long phrase, tf, df, ef;
731
732	UCArray suffix, prefix, ucbody;
733
734	toUCArray(body, ucbody);
735
736	for (unsigned long e = first; e < last; e++) {
737
738	phrase = elist[e];
739	get_phrase_freq_data(textdata, phrase, word, tf, ef, df);
740
741	split_phrase(word, ucbody, prefix, suffix);
742
743	if (XMLmode) {
744	// body is always the same as the text of the phrase, so no need to send it
745	textout << "<expansion num=\"" << e
746	<< "\" id=\"" << phrase
747	<< "\" tf=\"" << tf
748	<< "\" df=\"" << df;
749	if (!prefix.empty()) {
750	textout << "\" prefix=\"" << prefix;
751	}
752	if (!suffix.empty()) {
753	textout << "\" suffix=\"" << suffix;
754	}
755	textout << "\"/>\n";
756	} else {
757	textout << outconvert << disp
758	<< "<tr valign=top><td align=right><a href=\"_gwcgi_?"
759	<< "c=" << collection << "&ppnum=" << phrase << "\">";
760	textout << prefix << "</a></td>";
761	textout <<outconvert << disp
762	<< "<td align=center><a href=\"_gwcgi_?"
763	<< "c=" << collection << "&ppnum=" << phrase << "\">"
764	<< body << "</a></td>"
765	<< "<td align=left><a href=\"_gwcgi_?"
766	<< "c=" << collection << "&ppnum=" << phrase << "\">";
767	textout << suffix << "</a></td>"
768	<< "<td>" << tf << "</td><td>" << df << "</td></tr>\n";
769	}
770	}
771	}
772
773	// split an expansion into prefix and suffix
774	void phindaction::split_phrase(const UCArray &word, const UCArray &body,
775	UCArray &prefix, UCArray &suffix) {
776
777	prefix.clear();
778	suffix.clear();
779
780	bool readingPrefix = true;
781	UCArray::const_iterator here = word.begin();
782	UCArray::const_iterator end = word.end();
783
784	while (here != end) {
785
786	// if we've not read all the prefix, add the next char to the prefix
787	if (readingPrefix) {
788	if (phrase_match(body, here, end)) {
789	readingPrefix = false;
790	// trim whitespace from end of prefix & start of suffix
791	if (!prefix.empty()) {
792	prefix.pop_back();
793	}
794	if ((here != end) && (*here == ' ')) {
795	here++;
796	}
797	} else {
798	prefix.push_back(*here);
799	here++;
800	}
801	}
802	// if we've finished with the prefix, update the suffix
803	else {
804	suffix.push_back(*here);
805	here++;
806	}
807	}
808	}
809
810	// phrase_match
811	//
812	// compare two strings, one represented as an UCArray, the other as two
813	// UCArray iterators.
814	//
815	// Return true if the UCArray is the same as the phrase the iterators point
816	// to for the length of the UCArray.
817	bool phindaction::phrase_match(const UCArray &text, UCArray::const_iterator &here,
818	UCArray::const_iterator end) {
819
820	UCArray::const_iterator one_here = text.begin();
821	UCArray::const_iterator one_end = text.end();
822	UCArray::const_iterator two_here = here;
823
824	// iterate over the length of the first string, comparing each element to
825	// the corresponding element in the second string.
826	while (one_here != one_end) {
827
828	if (two_here == end) {
829	return false;
830	} else if (one_here != two_here) {
831	return false;
832	}
833	one_here++;
834	two_here++;
835	}
836
837	here = two_here;
838	return true;
839	}
840
841	void phindaction::print_documents(bool XMLmode, const text_t &basepath,
842	const text_t &collection,
843	const vector <unsigned long> &docNums,
844	const vector <unsigned long> &docFreq,
845	unsigned long first, unsigned long last,
846	displayclass &disp, outconvertclass &outconvert,
847	ostream &textout) {
848
849	// Create a TextData object to read the document data
850	TextData docdata;
851
852	text_t fullpath = filename_cat(basepath, "docs");
853	char *fullpathc = fullpath.getcstr();
854	#if defined __WIN32__
855	char *base = "";
856	#else
857	char *base = "/";
858	#endif
859
860	if (!docdata.LoadData (base, fullpathc)) {
861	// FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
862	exit (0);
863	}
864
865	delete fullpathc;
866
867	UCArray title, hash;
868	unsigned long freq, doc;
869
870	for (unsigned long d = first; d < last; d++) {
871	doc = docNums[d];
872	freq = docFreq[d];
873
874	get_document_all_data(docdata, doc, title, hash);
875
876	if (XMLmode) {
877	textout << "<document num=\"" << d
878	<< "\" hash=\"" << hash
879	<< "\" freq=\"" << freq
880	<< "\" title=\"" << title << "\"/>\n";
881	} else {
882	textout << outconvert << disp
883	<< "<tr valign=top><td><a href=\"_gwcgi_?"
884	<< "c=" << collection;
885	textout << "&a=d&d=" << hash << "\">" << title << "</a>"
886	<< "</td><td>" << freq << "</td></tr>\n";
887	}
888	}
889
890	docdata.UnloadData();
891	}
892
893	// Get all the data about a docment
894	//
895	// The document's details are stored in docData as record docNum.
896	// We retrieve:
897	// title - the document's title
898	// hash - the document's unique OID
899	void phindaction::get_document_all_data(TextData &docdata, unsigned long docNum,
900	UCArray &title, UCArray &hash) {
901
902	UCArray text;
903	UCArray docLevel;
904	SetCStr(docLevel, "Document");
905
906	// Look the word up in the textData
907	if (!GetDocText (docdata, docLevel, docNum, text)) {
908	// FatalError (1, "Error while trying to get document %u", docNum);
909	exit (0);
910	}
911
912	// Ignore everything up to the first colon
913	UCArray::iterator next = text.begin();
914	while (*next++ != '\t');
915
916	// Get the document OID (hash)
917	hash.clear();
918	for (; *next != '\t'; next++) {
919	hash.push_back(*next);
920	}
921
922	// Get the title
923	text.push_back('\n');
924	title.clear();
925	for (next++; *next != '\n'; next++) {
926	title.push_back(*next);
927	}
928	}
929
930	void phindaction::toUCArray(const text_t &in, UCArray &out) {
931	out.clear();
932	text_t::const_iterator here = in.begin();
933	text_t::const_iterator end = in.end();
934	while (here != end) {
935	out.push_back((unsigned char) *here);
936	here++;
937	}
938	}
939
940	void phindaction::output_error (const text_t &message, ostream &textout,
941	outconvertclass &outconvert,
942	displayclass & disp, ostream &logout,
943	bool XMLmode) {
944
945	logout << outconvert << message << "\n";
946	if (XMLmode) {
947	textout << outconvert
948	<< "<phinddata>\n"
949	<< "<phinderror>" << message << "</phinderror>\n"
950	<< "</phinddata>\n";
951	} else {
952	textout << outconvert << disp
953	<< "_header_\n"
954	<< message
955	<< "_footer_\n";
956	}
957	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: