Context Navigation

source: gsdl/trunk/runtime-src/src/recpt/querytools.cpp@ 20602

Last change on this file since 20602 was 20602, checked in by kjdon, 15 years ago
get_plain_query_terms: first pass through to remove TI:(...) and [...]:TI, and AND,OR,NOT for lucene, then remove term modifiers etc
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 27.1 KB

Line
1	/**********************************************************************
2	*
3	* querytools.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26	#include "querytools.h"
27	#include <ctype.h>
28	#include "unitool.h" // for is_unicode_letdig
29
30	// sets the ct, qt, qto arguments
31	void set_query_type_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
32
33	if (args["ct"].empty()) {
34	text_t build_type = cinfo->buildType;
35	if (build_type == "mgpp") {
36	args["ct"] = "1";
37	} else if (build_type == "lucene") {
38	args["ct"] = "2";
39	} else {
40	args["ct"] = "0";
41	}
42	}
43	text_t arg_ct = args["ct"];
44	if (arg_ct == "0") {
45	// mg
46	args["qt"] = "0";
47	args["qto"] = "0";
48	return;
49	}
50
51	if (!args["qt"].empty() && !args["qto"].empty()) {
52	return;
53	}
54
55	text_tmap::iterator check = cinfo->format.find("SearchTypes");
56	text_t search_types;
57	if(check != cinfo->format.end() && !(*check).second.empty()){
58	search_types = (*check).second;
59	} else {
60	// assume plain,form
61	if (args["qto"].empty()) args["qto"] = "3";
62	if (args["qt"].empty()) {
63	int arg_qto = args.getintarg("qto");
64	if (arg_qto == 2) {
65	args["qt"] = "1";
66	} else {
67	args["qt"] = "0";
68	}
69	}
70	return;
71	}
72
73
74	if (args["qto"].empty()) {
75	unsigned int type = 0;
76	if (findword(search_types.begin(), search_types.end(), "form") != search_types.end()) {
77	type \|= 2;
78	}
79	if (findword(search_types.begin(), search_types.end(), "plain") != search_types.end()) {
80	type \|= 1;
81	}
82	args.setintarg("qto", type);
83	}
84
85	if (args["qt"].empty()) {
86	int arg_qto = args.getintarg("qto");
87	if (arg_qto == 2 \|\| (arg_qto == 3 && starts_with(search_types, "form"))) {
88	args["qt"] = "1";
89	} else {
90	args["qt"] = "0";
91	}
92	}
93	}
94
95	// sets the ks, ss, afs (casesupport, stemsupport, accentfoldsupport) args
96	void set_stem_index_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
97	int stemIndexes = cinfo->stemIndexes;
98
99	if (stemIndexes & SIcasefold) {
100	args["ks"] = 1;
101	}
102	if (stemIndexes & SIstem) {
103	args["ss"] = 1;
104	}
105	if (stemIndexes & SIaccentfold) {
106	args["afs"] = 1;
107	}
108
109	}
110
111	// request.filterResultOptions and request.fields (if required) should
112	// be set from the calling code
113	void set_queryfilter_options (FilterRequest_t &request,
114	const text_t &querystring,
115	cgiargsclass &args) {
116
117	request.filterName = "QueryFilter";
118
119	OptionValue_t option;
120
121	option.name = "Term";
122	option.value = querystring;
123	request.filterOptions.push_back (option);
124
125	option.name = "QueryType";
126	option.value = (args.getintarg("t")) ? "ranked" : "boolean";
127	request.filterOptions.push_back (option);
128
129	option.name = "MatchMode";
130	// mgpp in advanced mode, always use some query
131	if (args.getintarg("ct") == 1 && args.getintarg("b") == 1) {
132	option.value = "some";
133	} else {
134	option.value = (args.getintarg("t")) ? "some" : "all";
135	}
136	request.filterOptions.push_back (option);
137
138	option.name = "Casefold";
139	option.value = (args.getintarg("k")) ? "true" : "false";
140	request.filterOptions.push_back (option);
141
142	option.name = "Stem";
143	option.value = (args.getintarg("s")) ? "true" : "false";
144	request.filterOptions.push_back (option);
145
146	option.name = "AccentFold";
147	option.value = (args.getintarg("af")) ? "true" : "false";
148	request.filterOptions.push_back (option);
149
150	if (!args["h"].empty()) {
151	option.name = "Index";
152	option.value = args["h"];
153	request.filterOptions.push_back (option);
154	}
155
156	if (!args["j"].empty()) {
157	option.name = "Subcollection";
158	option.value = args["j"];
159	request.filterOptions.push_back (option);
160	}
161
162	if (!args["n"].empty()) {
163	option.name = "Language";
164	option.value = args["n"];
165	request.filterOptions.push_back (option);
166	}
167
168	if (!args["g"].empty()) { // granularity for mgpp
169	option.name = "Level";
170	option.value = args["g"];
171	request.filterOptions.push_back (option);
172	}
173
174	if (!args["fs"].empty()) { // filter string for lucene
175	option.name = "FilterString";
176	option.value = args["fs"];
177	request.filterOptions.push_back (option);
178	}
179
180	if (!args["sf"].empty()) { // sort field for lucene
181	option.name = "SortField";
182	option.value = args["sf"];
183	request.filterOptions.push_back (option);
184	}
185
186	if (!args["fuzziness"].empty() && args["fuzziness"] != "100") { // fuzziness value for lucene
187	option.name = "Fuzziness";
188	option.value = (text_t) "0." + args["fuzziness"];
189	request.filterOptions.push_back (option);
190	}
191
192	set_more_queryfilter_options (request, args);
193	}
194
195	void set_queryfilter_options (FilterRequest_t &request,
196	const text_t &querystring1,
197	const text_t &querystring2, cgiargsclass &args) {
198
199	set_queryfilter_options (request, querystring1, args);
200
201	// fill in the second query if needed
202	if (!args["cq2"].empty()) {
203	OptionValue_t option;
204
205	option.name = "CombineQuery";
206	option.value = args["cq2"];
207	request.filterOptions.push_back (option);
208
209	option.name = "Term";
210	option.value = querystring2;
211	request.filterOptions.push_back (option);
212
213	option.name = "QueryType";
214	option.value = (args.getintarg("t")) ? "ranked" : "boolean";
215	request.filterOptions.push_back (option);
216
217	option.name = "Casefold";
218	option.value = (args.getintarg("k")) ? "true" : "false";
219	request.filterOptions.push_back (option);
220
221	option.name = "Stem";
222	option.value = (args.getintarg("s")) ? "true" : "false";
223	request.filterOptions.push_back (option);
224
225	option.name = "AccentFold";
226	option.value = (args.getintarg("af")) ? "true" : "false";
227	request.filterOptions.push_back (option);
228
229	if (!args["h2"].empty()) {
230	option.name = "Index";
231	option.value = args["h2"];
232	request.filterOptions.push_back (option);
233	}
234
235	if (!args["j2"].empty()) {
236	option.name = "Subcollection";
237	option.value = args["j2"];
238	request.filterOptions.push_back (option);
239	}
240
241	if (!args["n2"].empty()) {
242	option.name = "Language";
243	option.value = args["n2"];
244	request.filterOptions.push_back (option);
245	}
246	}
247	set_more_queryfilter_options (request, args);
248	}
249
250	void set_more_queryfilter_options (FilterRequest_t &request,
251	cgiargsclass &args) {
252
253	OptionValue_t option;
254	int arg_m = args.getintarg("m");
255
256	option.name = "Maxdocs";
257	option.value = arg_m;
258	request.filterOptions.push_back (option);
259
260	// option.name = "StartResults";
261	// option.value = args["r"];
262	// request.filterOptions.push_back (option);
263
264	// option.name = "EndResults";
265	// int endresults = args.getintarg("o") + (args.getintarg("r") - 1);
266	// if ((endresults > arg_m) && (arg_m != -1)) endresults = arg_m;
267	// option.value = endresults;
268	// request.filterOptions.push_back (option);
269	}
270
// Returns true when `character` is one of the characters this file treats
// as special for the given indexer:
//   indexer_type 1 (mgpp)   : '#', '/', '*'
//   indexer_type 2 (lucene) : '?', '*', '~', '^'
// Any other indexer_type has no special characters.
bool is_special_character(int indexer_type, unsigned short character) {
  switch (indexer_type) {
  case 1: // mgpp
    return (character == '#' || character == '/' || character == '*');
  case 2: // lucene
    return (character == '?' || character == '*' || character == '~' ||
	    character == '^');
  default:
    return false;
  }
}
283
284	// This function removes boolean operators from simple searches, and segments
285	// chinese characters if segment=true
286	void format_querystring (text_t &querystring, int querymode, bool segment) {
287	text_t formattedstring;
288
289	// advanced search, no segmenting, don't need to do anything
290	if (querymode == 1 && !segment) return;
291
292	text_t::const_iterator here = querystring.begin();
293	text_t::const_iterator end = querystring.end();
294
295	// space is used to insert spaces between Chinese
296	// characters. No space is needed before the first
297	// Chinese character.
298	bool space = false;
299
300	// want to remove ()\|!& from querystring so boolean queries are just
301	// "all the words" queries (unless querymode is advanced)
302	while (here != end) {
303	if ((querymode == 0) && (here == '(' \|\| here == ')' \|\| *here == '\|' \|\|
304	here == '!' \|\| here == '&')) {
305	formattedstring.push_back(' ');
306	} else if (segment) {
307	if ((here >= 0x2e80 && here <= 0xd7a3) \|\|
308	( here >= 0xf900 && here <= 0xfa6a)) {
309	/* text_t not big enough to handle these. */
310	/* (here >= 0x20000 && here <= 0x2a6d6) \|\|
311	(here >= 0x2f800 && here <= 0x2fa1d)) { */
312
313	// CJK character
314	if (!space) formattedstring.push_back (0x200b); // zero width space
315	formattedstring.push_back (*here);
316	formattedstring.push_back (0x200b);
317	space = true;
318	} else {
319
320	// non-Chinese character
321	formattedstring.push_back (*here);
322	space = false;
323
324	}
325
326	} else {
327	formattedstring.push_back (*here);
328	}
329	++here;
330	}
331	querystring = formattedstring;
332	}
333
334	// turn query string into terms separated by spaces.
335	// still working on this...
336	text_t get_plain_query_terms(const text_t &querystring, const text_t &arg_ct) {
337	text_t::const_iterator here = querystring.begin();
338	text_t::const_iterator end = querystring.end();
339
340	// lets look for [] and () first - these are a pain.
341	text_t::const_iterator bracket;
342	text_t query_no_brackets = "";
343
344	// mgpp brackets: [xxx]:TI
345	if (findchar(here, end, '[') != end) {
346	while ((bracket = findchar(here, end, '[')) != end) {
347	// get the first bit
348	query_no_brackets += substr(here, bracket);
349	bracket++;
350	here = bracket;
351	// get the end bracket
352	bracket = findchar(here, end, ']');
353	query_no_brackets += substr(here, bracket);
354	// skip the :TI bits
355	while (*bracket != ' ' && bracket != end) { bracket++;}
356	here = bracket;
357	}
358	if (here != end) {
359	query_no_brackets += substr(here,end);
360	}
361	} else if (findchar(here, end, '(') != end) {
362	// lucene brackets TI:(xxx)
363	while ((bracket = findchar(here, end, '(')) != end) {
364	// back up the field name
365	text_t::const_iterator old_bracket = bracket;
366	while (*bracket != ' ' && bracket != here) {
367	--bracket;
368	}
369	if (bracket != here) {
370	// get the first bit
371	query_no_brackets += substr(here, bracket+1);
372	}
373	here = old_bracket +1;
374	// get the end bracket
375	bracket = findchar(here, end, ')');
376	query_no_brackets += substr(here, bracket);
377	if (bracket != end) {
378	here = bracket+1;
379	}
380	}
381	if (here != end) {
382	query_no_brackets += substr(here,end);
383	}
384	} else {
385	// was no brackets
386	query_no_brackets = querystring;
387	}
388
389
390	if (arg_ct == "2") { // lucene
391	// look for AND OR NOT and remove
392	here = query_no_brackets.begin();
393	end = query_no_brackets.end();
394	text_tlist terms;
395	splitword(here, end, "AND", terms);
396	joinchar(terms, ' ', query_no_brackets);
397	here = query_no_brackets.begin();
398	end = query_no_brackets.end();
399	splitword(here, end, "OR", terms);
400	joinchar(terms, ' ', query_no_brackets);
401	here = query_no_brackets.begin();
402	end = query_no_brackets.end();
403	splitword(here, end, "NOT", terms);
404	joinchar(terms, ' ', query_no_brackets);
405
406	}
407	text_t terms = "";
408	bool space = false;
409	here = query_no_brackets.begin();
410	end = query_no_brackets.end();
411
412	while (here != end) {
413	if (here == '#' \|\| here == '/') {
414	// skip over #is /10 etc
415	++here;
416	while (here != end && *here != ' ') {
417	++here;
418	}
419	if (here == end) break;
420	}
421	if (is_unicode_letdig(*here)) {
422	terms.push_back(*here);
423	space = false;
424	} else {
425	if (!space) {
426	terms.push_back(' ');
427	space = true;
428	}
429	}
430	++here;
431	}
432	return terms;
433
434	}
435
436	// search history tool
437	// also used for form query macros
438	text_t escape_quotes(const text_t &querystring) {
439
440	text_t::const_iterator here = querystring.begin();
441	text_t::const_iterator end = querystring.end();
442
443	text_t escquery = "";
444	while (here != end) {
445	if (here != '\'' && here != '\"' && here != '\n' && here != '\r') escquery.push_back(*here);
446	else if (here == '\n' \|\| here == '\r') {
447	escquery.push_back(' ');
448	} else {
449	escquery +="\\\\";
450	escquery.push_back(*here);
451	}
452
453	++here;
454	}
455	return escquery;
456
457	}
458
459	// Parses the terms into words, and adds #si if necessary
460	text_t addstemcase(const text_t &terms, const text_t &stem, const text_t &fold,
461	const int indexer_type) {
462
463	// the default stem and case are set to 0 if this is being used, so we are only adding on qualifiers if stem or fold is 1.
464	if (stem == "0" && fold == "0") {
465	return terms;
466	}
467	// this is only for mgpp collections, shouldn't be called for anything else
468	if (indexer_type != 1) {
469	return terms;
470	}
471
472	text_t outtext;
473	text_t word;
474
475	text_t::const_iterator here = terms.begin();
476	text_t::const_iterator end = terms.end();
477
478	while (here !=end) {
479
480	if (is_unicode_letdig(here) \|\| is_special_character(indexer_type, here)) {
481	// not word boundary
482	word.push_back(*here);
483	++here;
484	}
485	else {
486	// found word boundary
487	if (!word.empty() ) {
488	if (starts_with(word, "NEAR") \|\| starts_with(word, "WITHIN")) {
489	outtext += word;
490	word.clear();
491	}
492	else {
493	word += "#";
494	if (stem == "1") word += "s";
495	if (fold == "1") word += "i";
496	outtext += word;
497	word.clear();
498	}
499	}
500	// this only used in advanced form, so we leave in boolean operators
501	if (here == '\"' \|\| here == '&' \|\| here == '\|' \|\| here == '!' \|\|
502	here == '(' \|\| here == ')' \|\| is_unicode_space(*here)) {
503	outtext.push_back(*here);
504	}
505	++here;
506	}
507	}
508
509	// get last word
510	if (!word.empty()) {
511	word += "#";
512	if (stem == "1") word += "s";
513	if (fold == "1") word += "i";
514	word += " ";
515	outtext += word;
516	}
517	return outtext;
518	}
519
520
521	// some query form parsing functions for use with mgpp & lucene
522
523	void parse_reg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
524	{
525	querystring.clear();
526
527	int argct = args.getintarg("ct");
528	int argt = args.getintarg("t");// t=0 -and, t=1 - or
529	int argb = args.getintarg("b");
530
531	text_t combine;
532
533	// lucene uses global combine, so only need this for mgpp
534	if (argct==1) {
535	if (argt == 0) combine = "&";
536	else combine = "\|";
537	}
538
539	text_t field = args["fqf"];
540	if (field.empty()) return; // no query
541	text_tarray fields;
542	splitchar(field.begin(), field.end(), ',', fields);
543
544	text_t value = args["fqv"];
545	if (value.empty()) return; // somethings wrong
546	text_tarray values;
547	splitchar(value.begin(), value.end(), ',', values);
548
549
550	for (int i=0; i< values.size(); ++i) {
551	if (!values[i].empty()) {
552	text_t this_value = values[i];
553	// remove operators for simple search, segments text if necessary
554	format_querystring(this_value, argb, segment);
555	// add tag info for this field (and other processing)
556	format_field_info(this_value, fields[i], argct, argt, argb);
557	// add into query string
558	if (argct == 2) {
559	// lucene
560	// we don't worry about AND/OR, cos this is done by defaultcombineoperator
561	querystring += this_value+" ";
562	} else {
563	// mgpp
564	if (!querystring.empty()) {
565	querystring += " "+ combine+ " ";
566	}
567	querystring += this_value;
568	}
569	}
570	}
571	}
572
573
574	void parse_adv_query_form(text_t &querystring, cgiargsclass &args, bool segment){
575	querystring.clear();
576
577	const int argct = args.getintarg("ct");
578	int argt = 0;// arg t is either not used (lucene) or used for natural/ranked (mgpp), so we set it to 0 = AND, by default
579	int argb = args.getintarg("b");
580	text_t combine;
581	if (argct==1) {
582	combine = "&";
583	}
584	else { // lucene
585	combine = "AND";
586	}
587
588	text_t field = args["fqf"];
589	if (field.empty()) return; // no query
590	text_tarray fields;
591	splitchar(field.begin(), field.end(), ',', fields);
592
593	text_t value = args["fqv"];
594	if (value.empty()) return; // somethings wrong
595	text_tarray values;
596	splitchar(value.begin(), value.end(), ',', values);
597
598	text_t comb = args["fqc"];
599	if (comb.empty()) return; //somethings wrong
600	text_tarray combs;
601	splitchar(comb.begin(), comb.end(), ',', combs);
602
603	text_tarray stems;
604	text_tarray folds;
605	if (argct == 1) {// mgpp - lucene doesn't do stem/case
606	text_t stem = args["fqs"];
607	if (stem.empty()) return; // somethings wrong
608	splitchar(stem.begin(), stem.end(), ',', stems);
609
610	text_t fold = args["fqk"];
611	if (fold.empty()) return; // somethings wrong
612	splitchar(fold.begin(), fold.end(), ',', folds);
613	}
614
615	for(int i=0; i< values.size(); ++i) {
616	if (!values[i].empty()) {
617	if (i!=0) {
618	if (argct==1) {
619	if (combs[i-1]=="and") combine = "&";
620	else if (combs[i-1]=="or")combine = "\|";
621	else if (combs[i-1]=="not")combine = "!";
622	}
623	else { // lucene
624	if (combs[i-1]=="and") combine = "AND";
625	else if (combs[i-1]=="or")combine = "OR";
626	else if (combs[i-1]=="not")combine = "NOT";
627	}
628	}
629	text_t this_value = values[i];
630	// remove operators for simple search, segments text if necessary
631	format_querystring(this_value, argb, segment);
632	if (argct == 1) { // mgpp only
633	this_value = addstemcase(this_value, stems[i], folds[i], argct);
634	}
635	// add tag info for this field (and other processing)
636	format_field_info(this_value, fields[i], argct, argt, argb);
637	// add into query string
638	if (!querystring.empty()) {
639	querystring += " "+ combine+ " ";
640	}
641	querystring += this_value;
642
643	}
644	}
645	}
646
647	// Extended addqueryelem for Human Info project
648	void addqueryelem_ex(text_t &querystring, const text_t &tag,
649	const text_t &terms, const text_t &stem,
650	const text_t &fold,
651	const text_t& combine, const text_t& word_combine) {
652
653	if (!querystring.empty()) { // have to put and/or
654	querystring += " " + combine + " ";
655	}
656	text_t outtext; outtext.reserve(512);
657	text_t word; word.reserve(100);
658	//unsigned short c;
659	text_t::const_iterator here = terms.begin();
660	text_t::const_iterator end = terms.end();
661	bool inquote = false, firstword = true;
662
663	text_t word2; word2.reserve(256);
664
665	while (here !=end) {
666	if (is_unicode_space(*here)) {
667	if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
668	else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
669	else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
670	else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
671	else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
672	if (inquote) {
673	word2.push_back(*here);
674	}
675	word.append(word2); word2.clear();
676
677	if (!inquote && !word.empty() ) {
678	// found word boundary
679
680	if (stem == "1" \|\| fold =="1") {
681	word += "#";
682	if (stem == "1") word += "s";
683	//else word += "u";
684
685	if (fold == "1") word += "i";
686	//else word += "c";
687	}
688	if (firstword) {
689	firstword = false;
690	} else {
691	outtext += " " + word_combine + " ";
692	}
693	outtext += "[" + word + "]:"+tag;
694	word.clear();
695	}
696	++here;
697	} else if (*here == '\"') {
698	word2.push_back(*here);
699	inquote = !inquote;
700	++here;
701	} else {
702	// not word boundary
703	word2.push_back(*here);
704	++here;
705	}
706	}
707
708	// get last word
709	if (!word2.empty()) {
710	if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
711	else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
712	else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
713	else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
714	else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
715	word.append(word2); word2.clear();
716
717	if (stem == "1"\|\| fold == "1") {
718	word += "#";
719	if (stem == "1") word += "s";
720	//else word += "u";
721
722	if (fold == "1") word += "i";
723	//else word += "c";
724	}
725	if (!outtext.empty()) outtext += " " + word_combine + " ";
726	outtext += "[" + word + "]:"+tag;
727	}
728	querystring += "(" + outtext + ")";
729	}
730
731	void add_field_info(text_t &querystring, const text_t &tag, int type) {
732
733	if (tag == "") return; // do nothing
734	if (tag == "ZZ" && type == 1) return; // mgpp doesn't use ZZ tag internally
735	if (type == 1) { //mgpp
736	querystring = "["+querystring+"]:"+tag;
737	} else if (type == 2) { // lucene
738	querystring = tag+":("+querystring+")";
739	}
740
741	}
742
743
744	void format_field_info_lucene(text_t &querystring, text_t &tag, int argt, int argb) {
745
746	int type = 2; //lucene
747
748	if (argb==0) { // simple
749	// there will be no & or \| as they should have already been removed
750	// just tag the entire thing
751	if (tag != "") {
752	add_field_info(querystring, tag, type);
753	}
754	return;
755	}
756
757	// need to replace & with &&, \| with \|\|
758	text_t::const_iterator here = querystring.begin();
759	text_t::const_iterator end = querystring.end();
760
761	text_t finalquery = "";
762	while (here != end) {
763	if (*here == '&') {
764	finalquery.push_back('&');
765	finalquery.push_back('&');
766	while (*(here+1) == '&') {
767	++here;
768	}
769	}
770	else if (*here == '\|') {
771	finalquery.push_back('\|');
772	finalquery.push_back('\|');
773	while (*(here+1) == '\|') {
774	++here;
775	}
776	}
777	else {
778	finalquery.push_back(*here);
779	}
780	++here;
781	}
782	querystring = finalquery;
783	add_field_info(querystring, tag, type);
784	}
785
786
787	void format_field_info_mgpp(text_t &querystring, text_t tag, int argt, int argb) {
788
789	if (tag == "ZZ") tag = ""; // ZZ is a special tag meaning no tag (all fields)
790	if (tag == "" && argb == 1) {
791	return; // no field specifier, advanced mode, the query stays as written
792	}
793
794	int type = 1; // mgpp
795
796	bool simple_and = (argb==0 && argt==0);
797	text_t finalquery = "";
798	text_t fieldpart ="";
799	text_t queryelem = "";
800	bool in_phrase = false;
801	bool in_field = false;
802
803	text_t::const_iterator here = querystring.begin();
804	text_t::const_iterator end = querystring.end();
805	while (here != end) {
806	if (is_unicode_letdig(here) \|\| here == '&' \|\| is_special_character(type, *here)) {
807	queryelem.push_back(*here);
808	}
809	else if (*here == '\|') {
810	in_field = false;
811	}
812	else if (here == '!' \|\| here == '(' \|\| *here == ')') {
813	if (!in_phrase) { // ignore these if in_phrase
814	// output field, then output operator
815	in_field = false;
816	if (!queryelem.empty()) {
817	if (!simple_and && !fieldpart.empty()) {
818	add_field_info(fieldpart, tag, type);
819	finalquery += fieldpart;
820	finalquery.push_back(' ');
821	fieldpart.clear();
822	}
823	fieldpart += queryelem;
824	}
825	if (!fieldpart.empty()) {
826	add_field_info(fieldpart, tag, type);
827	finalquery += fieldpart;
828	finalquery.push_back(' ');
829	}
830	fieldpart.clear();
831	queryelem.clear();
832	finalquery.push_back(*here);
833	finalquery.push_back(' ');
834	}
835	}
836	else if (*here == '"') {
837	queryelem.push_back(*here);
838	if (in_phrase == false) in_phrase = true;
839	else {
840	in_phrase = false;
841	}
842	}
843
844	// Found word boundary, in a phrase
845	else if (in_phrase) {
846	queryelem.push_back(*here);
847	}
848	// Found a word boundary
849	else {
850	if (!queryelem.empty()) {
851	if (queryelem == "&") {
852	in_field = true;
853	queryelem.clear();
854	}
855	else if (starts_with(queryelem, "NEAR") \|\| starts_with(queryelem, "WITHIN")) {
856
857	if (argb==1) {
858	// simple search, these not allowed
859	in_field = true;
860	fieldpart += queryelem;
861	fieldpart.push_back(' ');
862	}
863	queryelem.clear();
864
865	}
866	else {
867	if (!simple_and && !in_field) {
868	if (!fieldpart.empty()) {
869	add_field_info(fieldpart, tag, type);
870	finalquery += fieldpart;
871	finalquery.push_back(' ');
872	fieldpart.clear();
873	}
874	}
875
876	fieldpart += queryelem;
877	fieldpart.push_back(' ');
878	queryelem.clear();
879	}
880	}
881	}
882	++here;
883	}
884	// at the end
885	if (!queryelem.empty()) {
886	if (!simple_and && !in_field && !fieldpart.empty()) {
887	add_field_info(fieldpart, tag, type);
888	finalquery += fieldpart;
889	finalquery.push_back(' ');
890	fieldpart.clear();
891	}
892	fieldpart += queryelem;
893	}
894	if (!fieldpart.empty()) {
895	add_field_info(fieldpart, tag, type);
896	finalquery += fieldpart;
897	fieldpart.clear();
898
899	// doesn't the following just leave a dangling space at the end ?? (used to make mgpp crash)
900	// consider cutting this line
901	finalquery.push_back(' ');
902	}
903
904	querystring = finalquery;
905	}
906
907
908	void format_field_info(text_t &querystring, text_t tag, int argct, int argt, int argb) {
909	if (argct == 1) {
910	format_field_info_mgpp(querystring, tag, argt, argb);
911	} else if (argct == 2) {
912	format_field_info_lucene(querystring, tag, argt, argb);
913	}
914	}
915
916	void mgpp_adddateelem(text_t& querystring, const int date)
917	{
918	querystring.appendcstr(" [");
919	if(date<0) {
920	querystring.appendcstr("bc");
921	querystring.appendint((date*-1));
922	}
923	else {
924	querystring.appendint(date);
925	}
926	querystring.appendcstr("]:CV");
927	}
928
929	void lucene_adddateelem(text_t& querystring, const int date)
930	{
931	querystring.appendcstr(" CV:(");
932	if(date<0) {
933	querystring.appendcstr("bc");
934	querystring.appendint((date*-1));
935	}
936	else {
937	querystring.appendint(date);
938	}
939	querystring.appendcstr(")");
940	}
941
942
943	void add_dates(text_t &querystring, int startdate, int enddate,
944	int startbc, int endbc, int ct)
945	{
946	if(startdate)
947	{
948	int querystringis = 0;
949	text_t::const_iterator here = querystring.begin();
950	text_t::const_iterator end = querystring.end();
951	while(here!=end)
952	{
953	if(!(isspace((*here)))){
954	here = end;
955	querystringis = 1;
956	}
957	else
958	++here;
959	}
960	//converting BCE dates
961	if(startbc && startdate > 0)
962	{
963	startdate *= -1;
964	}
965	if(endbc && enddate > 0)
966	{
967	enddate *= -1;
968	}
969	if(enddate != 0 && enddate<startdate)
970	{
971	cout<<"enddate too small"<<endl;
972	return;
973	}
974	if(querystringis)
975	querystring.appendcstr(" AND");
976	if(!enddate)
977	{
978	if (ct==1) {
979	mgpp_adddateelem(querystring,startdate);
980	}
981	else { // lucene
982	lucene_adddateelem(querystring,startdate);
983	}
984	}
985	else{
986	int nextdate = startdate;
987	querystring.appendcstr(" (");
988	while(nextdate<=enddate)
989	{
990	if(nextdate!=0) {
991	if (ct==1) {
992	mgpp_adddateelem(querystring,nextdate);
993	}
994	else { // lucene
995	lucene_adddateelem(querystring,nextdate);
996	}
997	}
998	++nextdate;
999	}
1000	querystring.appendcstr(" )");
1001	}
1002	}
1003
1004	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: