Context Navigation

source: trunk/gsdl/src/recpt/querytools.cpp@ 10733

Last change on this file since 10733 was 10411, checked in by mdewsnip, 19 years ago
Added some code to fix bugs in Lucene "all" and phrase searching with a plain search form.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 19.0 KB

Line
1	/**********************************************************************
2	*
3	* querytools.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26	#include "querytools.h"
27	#include <ctype.h>
28	#include "unitool.h" // for is_unicode_letdig
29
30	// request.filterResultOptions and request.fields (if required) should
31	// be set from the calling code
32	void set_queryfilter_options (FilterRequest_t &request, const text_t &querystring,
33	cgiargsclass &args) {
34
35	request.filterName = "QueryFilter";
36
37	OptionValue_t option;
38
39	option.name = "Term";
40	option.value = querystring;
41	request.filterOptions.push_back (option);
42
43	option.name = "QueryType";
44	option.value = (args.getintarg("t")) ? "ranked" : "boolean";
45	request.filterOptions.push_back (option);
46
47	option.name = "MatchMode";
48	option.value = (args.getintarg("t")) ? "some" : "all";
49	request.filterOptions.push_back (option);
50
51	option.name = "Casefold";
52	option.value = (args.getintarg("k")) ? "true" : "false";
53	request.filterOptions.push_back (option);
54
55	option.name = "Stem";
56	option.value = (args.getintarg("s")) ? "true" : "false";
57	request.filterOptions.push_back (option);
58
59	if (!args["h"].empty()) {
60	option.name = "Index";
61	option.value = args["h"];
62	request.filterOptions.push_back (option);
63	}
64
65	if (!args["j"].empty()) {
66	option.name = "Subcollection";
67	option.value = args["j"];
68	request.filterOptions.push_back (option);
69	}
70
71	if (!args["n"].empty()) {
72	option.name = "Language";
73	option.value = args["n"];
74	request.filterOptions.push_back (option);
75	}
76
77	if (!args["g"].empty()) { // granularity for mgpp
78	option.name = "Level";
79	option.value = args["g"];
80	request.filterOptions.push_back (option);
81	}
82
83	set_more_queryfilter_options (request, args);
84	}
85
86	void set_queryfilter_options (FilterRequest_t &request, const text_t &querystring1,
87	const text_t &querystring2, cgiargsclass &args) {
88
89	set_queryfilter_options (request, querystring1, args);
90
91	// fill in the second query if needed
92	if (!args["cq2"].empty()) {
93	OptionValue_t option;
94
95	option.name = "CombineQuery";
96	option.value = args["cq2"];
97	request.filterOptions.push_back (option);
98
99	option.name = "Term";
100	option.value = querystring2;
101	request.filterOptions.push_back (option);
102
103	option.name = "QueryType";
104	option.value = (args.getintarg("t")) ? "ranked" : "boolean";
105	request.filterOptions.push_back (option);
106
107	option.name = "Casefold";
108	option.value = (args.getintarg("k")) ? "true" : "false";
109	request.filterOptions.push_back (option);
110
111	option.name = "Stem";
112	option.value = (args.getintarg("s")) ? "true" : "false";
113	request.filterOptions.push_back (option);
114
115	if (!args["h2"].empty()) {
116	option.name = "Index";
117	option.value = args["h2"];
118	request.filterOptions.push_back (option);
119	}
120
121	if (!args["j2"].empty()) {
122	option.name = "Subcollection";
123	option.value = args["j2"];
124	request.filterOptions.push_back (option);
125	}
126
127	if (!args["n2"].empty()) {
128	option.name = "Language";
129	option.value = args["n2"];
130	request.filterOptions.push_back (option);
131	}
132	}
133	set_more_queryfilter_options (request, args);
134	}
135
136	void set_more_queryfilter_options (FilterRequest_t &request, cgiargsclass &args) {
137
138	OptionValue_t option;
139	int arg_m = args.getintarg("m");
140
141	option.name = "Maxdocs";
142	option.value = arg_m;
143	request.filterOptions.push_back (option);
144
145	// option.name = "StartResults";
146	// option.value = args["r"];
147	// request.filterOptions.push_back (option);
148
149	// option.name = "EndResults";
150	// int endresults = args.getintarg("o") + (args.getintarg("r") - 1);
151	// if ((endresults > arg_m) && (arg_m != -1)) endresults = arg_m;
152	// option.value = endresults;
153	// request.filterOptions.push_back (option);
154	}
155
156	void format_querystring (text_t &querystring, int querymode, bool segment) {
157	text_t formattedstring;
158
159	if (querymode == 1 && !segment) return;
160
161	text_t::const_iterator here = querystring.begin();
162	text_t::const_iterator end = querystring.end();
163
164	// space is used to insert spaces between Chinese
165	// characters. No space is needed before the first
166	// Chinese character.
167	bool space = false;
168
169	// want to remove ()\|!& from querystring so boolean queries are just
170	// "all the words" queries (unless querymode is advanced)
171	while (here != end) {
172	if ((querymode == 0) && (here == '(' \|\| here == ')' \|\| *here == '\|' \|\|
173	here == '!' \|\| here == '&')) {
174	formattedstring.push_back(' ');
175	} else if (segment) {
176	if ((here >= 0x4e00 && here <= 0x9fa5) \|\|
177	(here >= 0xf900 && here <= 0xfa2d)) {
178	// Chinese character
179	if (!space) formattedstring.push_back (0x200b); // zero width space
180	formattedstring.push_back (*here);
181	formattedstring.push_back (0x200b);
182	space = true;
183	} else {
184
185	// non-Chinese character
186	formattedstring.push_back (*here);
187	space = false;
188
189	}
190
191	} else {
192	formattedstring.push_back (*here);
193	}
194	++here;
195	}
196	querystring = formattedstring;
197	}
198
199
200
201	void add_dates(text_t &querystring, int startdate, int enddate,
202	int startbc, int endbc, int ct)
203	{
204	if(startdate)
205	{
206	int querystringis = 0;
207	text_t::const_iterator here = querystring.begin();
208	text_t::const_iterator end = querystring.end();
209	while(here!=end)
210	{
211	if(!(isspace((*here)))){
212	here = end;
213	querystringis = 1;
214	}
215	else
216	++here;
217	}
218	//converting BCE dates
219	if(startbc && startdate > 0)
220	{
221	startdate *= -1;
222	}
223	if(endbc && enddate > 0)
224	{
225	enddate *= -1;
226	}
227	if(enddate != 0 && enddate<startdate)
228	{
229	cout<<"enddate too small"<<endl;
230	return;
231	}
232	if(querystringis)
233	querystring.appendcstr(" AND");
234	if(!enddate)
235	{
236	if (ct==1) {
237	mgpp_adddateelem(querystring,startdate);
238	}
239	else { // lucene
240	lucene_adddateelem(querystring,startdate);
241	}
242	}
243	else{
244	int nextdate = startdate;
245	querystring.appendcstr(" (");
246	while(nextdate<=enddate)
247	{
248	if(nextdate!=0) {
249	if (ct==1) {
250	mgpp_adddateelem(querystring,nextdate);
251	}
252	else { // lucene
253	lucene_adddateelem(querystring,nextdate);
254	}
255	}
256	++nextdate;
257	}
258	querystring.appendcstr(" )");
259	}
260	}
261
262	}
263
264	void get_phrases (const text_t &querystring, text_tarray &phrases) {
265
266	phrases.erase (phrases.begin(), phrases.end());
267	if (!querystring.empty()) {
268
269	text_t::const_iterator end = querystring.end();
270	text_t::const_iterator here = findchar (querystring.begin(), end, '"');
271	if (here != end) {
272	text_t tmptext;
273	bool foundquote = false;
274	while (here != end) {
275	if (*here == '"') {
276	if (foundquote) {
277	if (!tmptext.empty()) {
278	phrases.push_back(tmptext);
279	tmptext.clear();
280	}
281	foundquote = false;
282	} else foundquote = true;
283	} else {
284	if (foundquote) tmptext.push_back (*here);
285	}
286	++here;
287	}
288	}
289	}
290	}
291
292	// search history tool
293	// also used for form query macros
294	text_t escape_quotes(const text_t &querystring) {
295
296	text_t::const_iterator here = querystring.begin();
297	text_t::const_iterator end = querystring.end();
298
299	text_t escquery = "";
300	while (here != end) {
301	if (here != '\'' && here != '\"' && here != '\n' && here != '\r') escquery.push_back(*here);
302	else if (here == '\n' \|\| here == '\r') {
303	escquery.push_back(' ');
304	} else {
305	escquery +="\\\\";
306	escquery.push_back(*here);
307	}
308
309	++here;
310	}
311	return escquery;
312
313	}
314
315	// some query form parsing functions for use with mgpp
316
317	void parse_reg_query_form(text_t &querystring, cgiargsclass &args)
318	{
319	querystring.clear();
320
321	const int ct = args.getintarg("ct");
322	int argt = args.getintarg("t");// t=0 -and, t=1 - or
323
324	text_t combine;
325	if (ct==1) {
326	if (argt == 0) combine = "&";
327	else combine = "\|";
328	}
329	else { // lucene
330	if (argt == 0) combine = "AND";
331	else combine = "OR";
332	}
333
334	text_t field = args["fqf"];
335	if (field.empty()) return; // no query
336	text_tarray fields;
337	splitchar(field.begin(), field.end(), ',', fields);
338
339	text_t value = args["fqv"];
340	if (value.empty()) return; // somethings wrong
341	text_tarray values;
342	splitchar(value.begin(), value.end(), ',', values);
343
344
345	for (int i=0; i< values.size(); ++i) {
346	if (!values[i].empty()) {
347	if (ct == 1) {
348	mgpp_addqueryelem(querystring, fields[i], values[i], combine);
349	}
350	else { // lucene
351	lucene_addqueryelem(querystring, fields[i], values[i], combine);
352	}
353	}
354	}
355
356	}
357
358
359	void parse_adv_query_form(text_t &querystring, cgiargsclass &args){
360
361	querystring.clear();
362
363	const int ct = args.getintarg("ct");
364	text_t combine;
365	if (ct==1) {
366	combine = "&";
367	}
368	else { // lucene
369	combine = "AND";
370	}
371
372	text_t field = args["fqf"];
373	if (field.empty()) return; // no query
374	text_tarray fields;
375	splitchar(field.begin(), field.end(), ',', fields);
376
377	text_t value = args["fqv"];
378	if (value.empty()) return; // somethings wrong
379	text_tarray values;
380	splitchar(value.begin(), value.end(), ',', values);
381
382	text_t stem = args["fqs"];
383	if (stem.empty()) return; // somethings wrong
384	text_tarray stems;
385	splitchar(stem.begin(), stem.end(), ',', stems);
386
387	text_t fold = args["fqk"];
388	if (fold.empty()) return; // somethings wrong
389	text_tarray folds;
390	splitchar(fold.begin(), fold.end(), ',', folds);
391
392	text_t comb = args["fqc"];
393	if (comb.empty()) return; //somethings wrong
394	text_tarray combs;
395	splitchar(comb.begin(), comb.end(), ',', combs);
396
397	for(int i=0; i< values.size(); ++i) {
398	if (!values[i].empty()) {
399	if (i!=0) {
400	if (ct==1) {
401	if (combs[i-1]=="and") combine = "&";
402	else if (combs[i-1]=="or")combine = "\|";
403	else if (combs[i-1]=="not")combine = "!";
404	}
405	else { // lucene
406	if (combs[i-1]=="and") combine = "AND";
407	else if (combs[i-1]=="or")combine = "OR";
408	else if (combs[i-1]=="not")combine = "NOT";
409	}
410	}
411	text_t term = addstemcase(values[i], stems[i], folds[i]);
412	mgpp_addqueryelem(querystring, fields[i], term, combine);
413	}
414
415	}
416	}
417
418	text_t addstemcase(const text_t &terms, const text_t &stem, const text_t &fold) {
419
420	text_t outtext;
421	text_t word;
422	//unsigned short c;
423	text_t::const_iterator here = terms.begin();
424	text_t::const_iterator end = terms.end();
425
426	while (here !=end) {
427
428	if (is_unicode_letdig(*here)) {
429	// not word boundary
430	word.push_back(*here);
431	++here;
432	}
433	else {
434	// found word boundary
435	if (!word.empty() ) {
436	if (stem == "1" \|\| fold =="1") {
437	word += "#";
438	if (stem == "1") word += "s";
439	//else word += "u";
440
441	if (fold == "1") word += "i";
442	//else word += "c";
443	}
444
445	word += " ";
446	outtext += word;
447	word.clear();
448	}
449	if (*here == '\"') {
450	outtext.push_back(*here);
451	}
452	++here;
453	}
454	}
455
456	// get last word
457	if (!word.empty()) {
458	if (stem == "1"\|\| fold == "1") {
459	word += "#";
460	if (stem == "1") word += "s";
461	//else word += "u";
462
463	if (fold == "1") word += "i";
464	//else word += "c";
465	}
466	word += " ";
467	outtext += word;
468	}
469	return outtext;
470	}
471
472
473	void mgpp_adddateelem(text_t& querystring, const int date)
474	{
475	querystring.appendcstr(" [");
476	if(date<0) {
477	querystring.appendcstr("bc");
478	querystring.appendint((date*-1));
479	}
480	else {
481	querystring.appendint(date);
482	}
483	querystring.appendcstr("]:CV");
484	}
485
486	void lucene_adddateelem(text_t& querystring, const int date)
487	{
488	querystring.appendcstr(" CV:(");
489	if(date<0) {
490	querystring.appendcstr("bc");
491	querystring.appendint((date*-1));
492	}
493	else {
494	querystring.appendint(date);
495	}
496	querystring.appendcstr(")");
497	}
498
499
500	void mgpp_addqueryelem(text_t &querystring, text_t &tag,
501	text_t &query, text_t &combine) {
502	if (!querystring.empty()) { // have to put and/or
503	querystring += " " + combine + " ";
504
505	}
506	if (tag=="ZZ" \|\| tag=="") { // just add onto querystring
507	querystring += query;
508	}
509	else {
510	querystring += "["+query+"]:"+tag;
511	}
512
513	}
514
515	void lucene_addqueryelem(text_t &querystring, text_t &tag,
516	text_t &query, text_t &combine) {
517	if (!querystring.empty()) { // have to put and/or
518	querystring += " " + combine + " ";
519
520	}
521	if (tag=="ZZ" \|\| tag=="") { // just add onto querystring
522	querystring += query;
523	}
524	else {
525	querystring += tag+":("+query+")";
526	}
527	}
528
529
530	void addqueryelem_ex(text_t &querystring, const text_t &tag,
531	const text_t &terms, const text_t &stem, const text_t &fold,
532	const text_t& combine, const text_t& word_combine) {
533	if (!querystring.empty()) { // have to put and/or
534	querystring += " " + combine + " ";
535	}
536	text_t outtext; outtext.reserve(512);
537	text_t word; word.reserve(100);
538	//unsigned short c;
539	text_t::const_iterator here = terms.begin();
540	text_t::const_iterator end = terms.end();
541	bool inquote = false, firstword = true;
542
543	text_t word2; word2.reserve(256);
544
545	while (here !=end) {
546	if (is_unicode_space(*here)) {
547	if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
548	else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
549	else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
550	else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
551	else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
552	if (inquote) {
553	word2.push_back(*here);
554	}
555	word.append(word2); word2.clear();
556
557	if (!inquote && !word.empty() ) {
558	// found word boundary
559
560	if (stem == "1" \|\| fold =="1") {
561	word += "#";
562	if (stem == "1") word += "s";
563	//else word += "u";
564
565	if (fold == "1") word += "i";
566	//else word += "c";
567	}
568	if (firstword) {
569	firstword = false;
570	} else {
571	outtext += " " + word_combine + " ";
572	}
573	outtext += "[" + word + "]:"+tag;
574	word.clear();
575	}
576	++here;
577	} else if (*here == '\"') {
578	word2.push_back(*here);
579	inquote = !inquote;
580	++here;
581	} else {
582	// not word boundary
583	word2.push_back(*here);
584	++here;
585	}
586	}
587
588	// get last word
589	if (!word2.empty()) {
590	if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
591	else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
592	else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
593	else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
594	else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
595	word.append(word2); word2.clear();
596
597	if (stem == "1"\|\| fold == "1") {
598	word += "#";
599	if (stem == "1") word += "s";
600	//else word += "u";
601
602	if (fold == "1") word += "i";
603	//else word += "c";
604	}
605	if (!outtext.empty()) outtext += " " + word_combine + " ";
606	outtext += "[" + word + "]:"+tag;
607	}
608	querystring += "(" + outtext + ")";
609	}
610
611
612	void add_field_info(text_t &querystring, const text_t &tag, int type) {
613
614	if (type == 1) { //mgpp
615	querystring = "["+querystring+"]:"+tag;
616	} else if (type == 2) { // lucene
617	querystring = tag+":("+querystring+")";
618	}
619
620	}
621
622
623	void format_field_info(text_t &querystring, cgiargsclass &args) {
624
625	text_t tag = args["fqf"];
626	if (tag == "ZZ" \|\| tag == "") {
627	return; // do nothing
628	}
629
630	int argct = args.getintarg("ct");
631	int argt = args.getintarg("t");// t=0 -and, t=1 - or
632	int argb = args.getintarg("b"); // b=0 simple, b=1 advanced
633
634	// Special code for Lucene
635	// The default operator for Lucene is "or", so we need to add "+" symbols when t == 0
636	// Also, we need to be careful not to mess up phrase searches
637	if (argct == 2) {
638	text_t processed_querystring = "";
639	text_t queryelement = "";
640	text_t combine = ((argt == 0) ? "+" : "");
641	bool in_phrase = false;
642	text_t::const_iterator here = querystring.begin();
643	text_t::const_iterator end = querystring.end();
644	while (here != end) {
645	if (is_unicode_letdig(*here)) {
646	queryelement.push_back(*here);
647	}
648
649	// Detect phrase starts/finishes
650	else if (*here == '"') {
651	queryelement.push_back(*here);
652	if (in_phrase == false) in_phrase = true;
653	else if (in_phrase == true) {
654	add_field_info(queryelement, tag, argct);
655	processed_querystring += combine + queryelement;
656	queryelement.clear();
657	in_phrase = false;
658	}
659	}
660
661	// Found word boundary
662	else if (in_phrase) {
663	queryelement.push_back(*here);
664	}
665	else {
666	if (!queryelement.empty()) {
667	add_field_info(queryelement, tag, argct);
668	processed_querystring += combine + queryelement;
669	queryelement.clear();
670	}
671	processed_querystring.push_back(*here);
672	}
673
674	++here;
675	}
676
677	// Get last element
678	if (!queryelement.empty()) {
679	add_field_info(queryelement, tag, argct);
680	processed_querystring += combine + queryelement;
681	}
682
683	querystring = processed_querystring;
684	return;
685	}
686
687	if (argb==0 && argt==0) {
688	// simple 'and' search - just put tag info round whole query string
689	add_field_info(querystring, tag, argct);
690	return;
691	}
692
693	// we need to individually tag words
694	text_t outtext;
695	text_t word;
696	//unsigned short c;
697	text_t::const_iterator here = querystring.begin();
698	text_t::const_iterator end = querystring.end();
699
700	while (here !=end) {
701
702	if (is_unicode_letdig(here)\|\| here == '#' \|\| *here == '/' ) {
703	// include term modifiers in a word just in case
704	// not word boundary
705	word.push_back(*here);
706	++here;
707	}
708	else {
709	// found word boundary
710	if (!word.empty() ) {
711	add_field_info(word, tag, argct);
712	outtext += word;
713	word.clear();
714	}
715	// everything else, we add into the query string
716	outtext.push_back(*here);
717	++here;
718	}
719	}
720
721	// get last word
722	if (!word.empty()) {
723	add_field_info(word, tag, argct);
724	outtext += word;
725	}
726
727	querystring = outtext;
728	}
729

Note: See TracBrowser for help on using the repository browser.

Download in other formats: