Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: documentation/trunk/packages/dokuwiki-2011-05-25a/inc/fulltext.php@ 25027

Last change on this file since 25027 was 25027, checked in by jmt12, 12 years ago
Adding the packages directory, and within it a configured version of dokuwiki all ready to run
File size: 24.9 KB

Line
1	<?php
2	/**
3	* DokuWiki fulltextsearch functions using the index
4	*
5	* @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
6	* @author Andreas Gohr <[email protected]>
7	*/
8
// This file must not be executed unless DOKU_INC has already been defined.
if (!defined('DOKU_INC')) {
    die('meh.');
}

/**
 * Snippets are created for the first few results only; this is that number
 * (only defined here when nothing else defined it before).
 */
if (!defined('FT_SNIPPET_NUMBER')) {
    define('FT_SNIPPET_NUMBER', 15);
}
15
/**
 * The fulltext search
 *
 * Returns a list of matching documents for the given query.
 *
 * The actual work is done by _ft_pageSearch(), which is registered as the
 * default action of the SEARCH_QUERY_FULLPAGE event triggered here.
 *
 * @param string $query      the search query
 * @param mixed  $highlight  passed by reference and linked into the event
 *                           data under the key 'highlight'
 * @return mixed whatever trigger_event() returns for the event
 */
function ft_pageSearch($query, &$highlight)
{
    // the caller's $highlight variable is bound into the event data by reference
    $eventData = array(
        'query'     => $query,
        'highlight' => &$highlight,
    );

    return trigger_event('SEARCH_QUERY_FULLPAGE', $eventData, '_ft_pageSearch');
}
31
/**
 * Returns a list of matching documents for the given query
 *
 * The query in $data['query'] is parsed by ft_queryParser() into a postfix
 * token list which is then evaluated with a stack of page lists.
 * $data['highlight'] is overwritten with the parser's highlight terms.
 *
 * @author Andreas Gohr
 * @author Kazutaka Miyasaka
 *
 * @param array $data event data with the keys 'query' and 'highlight'
 * @return array page id => count, sorted by count (highest first); empty
 *               array when the query is empty or nothing matches
 */
function _ft_pageSearch(&$data)
{
    $Indexer = idx_get_indexer();

    // parse the given query
    $q = ft_queryParser($Indexer, $data['query']);
    $data['highlight'] = $q['highlight'];

    if (empty($q['parsed_ary'])) {
        return array();
    }

    // lookup all words found in the query
    $lookup = $Indexer->lookup($q['words']);

    // get all pages in this dokuwiki site (!: includes nonexistent pages)
    $pages_all = array();
    foreach ($Indexer->getPages() as $page) {
        $pages_all[$page] = 0; // base: 0 hit
    }

    // process the query: operands push a page list, operators pop and combine
    $stack = array();
    foreach ($q['parsed_ary'] as $token) {
        $type = substr($token, 0, 3);

        if ($type === 'W+:' || $type === 'W-:' || $type === 'W_:') {
            // word
            $word    = substr($token, 3);
            $stack[] = (array) $lookup[$word];
        } elseif ($type === 'P+:' || $type === 'P-:') {
            // phrase
            $phrase = substr($token, 3);
            // since phrases are always parsed as ((W1)(W2)...(P)),
            // the end($stack) always points the pages that contain
            // all words in this phrase
            $candidates   = end($stack);
            $with_phrase  = array();
            foreach (array_keys($candidates) as $page) {
                $text = utf8_strtolower(rawWiki($page));
                if (strpos($text, $phrase) !== false) {
                    $with_phrase[$page] = 0; // phrase: always 0 hit
                }
            }
            $stack[] = $with_phrase;
        } elseif ($type === 'N+:' || $type === 'N-:') {
            // namespace
            $ns    = substr($token, 3);
            $in_ns = array();
            foreach (array_keys($pages_all) as $page) {
                if (strpos($page, $ns) === 0) {
                    $in_ns[$page] = 0; // namespace: always 0 hit
                }
            }
            $stack[] = $in_ns;
        } elseif ($type === 'AND') {
            // and operation
            list($left, $right) = array_splice($stack, -2);
            $stack[] = ft_resultCombine(array($left, $right));
        } elseif ($type === 'OR') {
            // or operation
            list($left, $right) = array_splice($stack, -2);
            $stack[] = ft_resultUnite(array($left, $right));
        } elseif ($type === 'NOT') {
            // not operation (unary)
            $operand = array_pop($stack);
            $stack[] = ft_resultComplement(array($pages_all, $operand));
        }
    }
    $docs = array_pop($stack);

    if (empty($docs)) {
        return array();
    }

    // check: settings, acls, existence
    foreach (array_keys($docs) as $page) {
        $usable = !isHiddenPage($page)
            && auth_quickaclcheck($page) >= AUTH_READ
            && page_exists($page, '', false);
        if (!$usable) {
            unset($docs[$page]);
        }
    }

    // sort docs by count
    arsort($docs);

    return $docs;
}
123
/**
 * Returns the backlinks for a given page
 *
 * Uses the metadata index: looks up the key 'relation_references' for $id
 * and drops every result the current user has no read permission for.
 *
 * @param string $id the page id to look up
 * @return array sorted list of readable results; the raw (empty) lookup
 *               result when nothing was found
 */
function ft_backlinks($id)
{
    $pages = idx_get_indexer()->lookupKey('relation_references', $id);

    if (!count($pages)) {
        return $pages;
    }

    // check ACL permissions
    foreach ($pages as $key => $page) {
        if (auth_quickaclcheck($page) < AUTH_READ) {
            unset($pages[$key]);
        }
    }

    sort($pages);
    return $pages;
}
146
/**
 * Returns the pages that use a given media file
 *
 * Does a quick lookup with the fulltext index, then scans the raw wiki text
 * of the found pages for media syntax ({{...}}) referencing the file.
 *
 * Aborts after $max found results.
 *
 * @param string $id  the media id to look for
 * @param int    $max maximum number of pages to report; a falsy value is
 *                    treated as 1
 * @return array sorted list of page ids that reference the media file
 */
function ft_mediause($id, $max)
{
    if (!$max) {
        $max = 1; // need to find at least one
    }

    $result = array();

    // quick lookup of the mediafile
    // FIXME use metadata key lookup
    $media = noNS($id);
    $hits  = idx_lookup(idx_tokenizer($media));
    $docs  = array_keys(ft_resultCombine(array_values($hits)));
    if (!count($docs)) {
        return $result;
    }

    // Media syntax: "{{" [anything but | and }] "|caption"? "}}".
    // The "*" quantifiers around the file name are required: without them
    // exactly one character would be demanded before and after the name,
    // so neither {{foo.png}} nor {{some:ns:foo.png|caption}} would match.
    $pcre  = preg_quote($media, '/');
    $regex = '/\{\{([^|}]*' . $pcre . '[^|}]*)(\|[^}]+)?\}\}/i';

    // go through all found pages
    $found = 0;
    foreach ($docs as $doc) {
        $ns = getNS($doc);
        preg_match_all($regex, rawWiki($doc), $links);
        foreach ($links[1] as $img) {
            $img = trim($img);
            if (preg_match('/^https?:\/\//i', $img)) {
                continue; // skip external images
            }
            list($img) = explode('?', $img);     // remove any parameters
            resolve_mediaid($ns, $img, $exists); // resolve the possibly relative img

            if ($img == $id) { // we have a match
                $result[] = $doc;
                $found++;
                break; // one hit per page is enough
            }
        }
        if ($found >= $max) {
            break;
        }
    }

    sort($result);
    return $result;
}
191
192
193
/**
 * Quicksearch for pagenames
 *
 * By default it only matches the pagename and ignores the
 * namespace. This can be changed with the second parameter.
 * The third parameter allows to search in titles as well.
 *
 * The function always returns titles as well
 *
 * @triggers SEARCH_QUERY_PAGELOOKUP
 * @author Andreas Gohr
 * @author Adrian Lang
 *
 * @param string $id       the search term
 * @param bool   $in_ns    match against the full id including namespaces
 * @param bool   $in_title additionally search in page titles
 * @return mixed whatever trigger_event() returns for the event
 */
function ft_pageLookup($id, $in_ns = false, $in_title = false)
{
    $eventData = array(
        'id'         => $id,
        'in_ns'      => $in_ns,
        'in_title'   => $in_title,
        'has_titles' => true, // for plugin backward compatibility check
    );

    return trigger_event('SEARCH_QUERY_PAGELOOKUP', $eventData, '_ft_pageLookup');
}
212
/**
 * Default action of the SEARCH_QUERY_PAGELOOKUP event
 *
 * Collects all indexed pages whose id (or, optionally, title) contains the
 * search term, optionally restricted to a namespace given as "@namespace"
 * inside the term, and removes hidden, nonexistent and unreadable pages.
 *
 * @param array $data event data with the keys 'id', 'in_ns' and 'in_title'
 * @return array page id => first heading, ordered by ft_pagesorter()
 */
function _ft_pageLookup(&$data)
{
    // split out original parameters
    $id = $data['id'];
    if (preg_match('/(?:^| )@(\w+)/', $id, $matches)) {
        // an "@namespace" part restricts the result and is cut from the term
        $ns = cleanID($matches[1]) . ':';
        $id = str_replace($matches[0], '', $id);
    }

    $in_ns    = $data['in_ns'];
    $in_title = $data['in_title'];
    $cleaned  = cleanID($id);

    $Indexer  = idx_get_indexer();
    $page_idx = $Indexer->getPages();

    $pages = array();
    if ($id !== '' && $cleaned !== '') {
        // match against page ids
        foreach ($page_idx as $p_id) {
            $haystack = $in_ns ? $p_id : noNSorNS($p_id);
            if (strpos($haystack, $cleaned) === false) {
                continue;
            }
            if (!isset($pages[$p_id])) {
                $pages[$p_id] = p_get_first_heading($p_id, METADATA_DONT_RENDER);
            }
        }

        // match against page titles
        if ($in_title) {
            $by_title = $Indexer->lookupKey('title', $id, '_ft_pageLookupTitleCompare');
            foreach ($by_title as $p_id) {
                if (!isset($pages[$p_id])) {
                    $pages[$p_id] = p_get_first_heading($p_id, METADATA_DONT_RENDER);
                }
            }
        }
    }

    // apply the namespace restriction, if one was given
    if (isset($ns)) {
        foreach (array_keys($pages) as $p_id) {
            if (strpos($p_id, $ns) !== 0) {
                unset($pages[$p_id]);
            }
        }
    }

    // discard hidden pages
    // discard nonexistent pages
    // check ACL permissions
    foreach (array_keys($pages) as $p_id) {
        $keep = isVisiblePage($p_id)
            && page_exists($p_id)
            && auth_quickaclcheck($p_id) >= AUTH_READ;
        if (!$keep) {
            unset($pages[$p_id]);
        }
    }

    uksort($pages, 'ft_pagesorter');
    return $pages;
}
265
/**
 * Tiny helper function for comparing the searched title with the title
 * from the search index.
 *
 * Both strings are lowercased with utf8_strtolower() before the substring
 * test. A plain stripos() folds case for ASCII letters only, which made the
 * comparison case-sensitive for titles containing non-ASCII letters.
 *
 * @param string $search the text to look for
 * @param string $title  the title taken from the index
 * @return bool true when $title contains $search, ignoring case
 */
function _ft_pageLookupTitleCompare($search, $title)
{
    // a byte-wise strpos() is sufficient here: only containment matters,
    // not the character position
    return strpos(utf8_strtolower($title), utf8_strtolower($search)) !== false;
}
274
/**
 * Sort pages based on their namespace level first, then on their string
 * values. This makes higher hierarchy pages rank higher than lower hierarchy
 * pages.
 *
 * @param string $a first page id
 * @param string $b second page id
 * @return int negative, zero or positive as expected from a sort callback
 */
function ft_pagesorter($a, $b)
{
    // the number of colons tells how deep a page sits in the namespace tree
    $depth_a = substr_count($a, ':');
    $depth_b = substr_count($b, ':');

    if ($depth_a !== $depth_b) {
        return ($depth_a < $depth_b) ? -1 : 1;
    }
    return strcmp($a, $b);
}
290
/**
 * Creates a snippet extract
 *
 * Reads the raw wiki text of the page, cuts out up to four stretches of text
 * around matches of the highlight terms and wraps the matches in
 * <strong class="search_hit">. The work is wrapped in the
 * FULLTEXT_SNIPPET_CREATE event; when the event's advise_before() returns
 * false the default processing is skipped and whatever is in the event
 * data's 'snippet' entry is returned.
 *
 * @author Andreas Gohr
 * @triggers FULLTEXT_SNIPPET_CREATE
 *
 * @param string       $id        page id
 * @param array|string $highlight term(s) to highlight
 * @return string the snippet (HTML)
 */
function ft_snippet($id, $highlight)
{
    $text = rawWiki($id);
    $text = str_replace("\xC2\xAD", '', $text); // remove soft-hyphens

    // text and highlight are placed into the event data by reference
    $evdata = array(
        'id'        => $id,
        'text'      => &$text,
        'highlight' => &$highlight,
        'snippet'   => '',
    );

    $evt = new Doku_Event('FULLTEXT_SNIPPET_CREATE', $evdata);
    if ($evt->advise_before()) {
        $match       = array();
        $snippets    = array();
        $utf8_offset = 0;
        $offset      = 0;
        $end         = 0;
        $len         = utf8_strlen($text);

        // build a regexp from the phrases to highlight
        $terms = array_filter((array) $highlight);
        $terms = array_map('preg_quote_cb', $terms);
        $terms = array_map('ft_snippet_re_preprocess', $terms);
        $re1   = '(' . join('|', $terms) . ')';
        $re2 = "$re1.{0,75}(?!\\1)$re1";
        $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";

        // at most four rounds; each round tries the three-term pattern
        // first, then the two-term pattern, then a single term
        for ($round = 0; $round < 4; $round++) {
            $found = preg_match('/' . $re3 . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)
                || preg_match('/' . $re2 . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)
                || preg_match('/' . $re1 . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset);
            if (!$found) {
                break;
            }

            list($str, $idx) = $match[0];

            // convert $idx (a byte offset) into a utf8 character offset
            $utf8_idx = utf8_strlen(substr($text, 0, $idx));
            $utf8_len = utf8_strlen($str);

            // establish context, 100 characters surrounding the match string
            // first look to see if we can go 100 either side,
            // then drop to 50 adding any excess if the other side can't go to 50,
            $pre  = min($utf8_idx - $utf8_offset, 100);
            $post = min($len - $utf8_idx - $utf8_len, 100);

            if ($pre <= 50 && $post <= 50) {
                // both are at most 50, means the context is the whole string
                // make it so and break out of this loop - there is no need for the
                // complex snippet calculations
                $snippets = array($text);
                break;
            }

            if ($pre > 50 && $post > 50) {
                $pre = $post = 50;
            } elseif ($pre > 50) {
                $pre = min($pre, 100 - $post);
            } else {
                $post = min($post, 100 - $pre);
            }

            // establish context start and end points, try to append to previous
            // context if possible
            $start = $utf8_idx - $pre;
            if ($start < $end) {
                $append = $end;   // still the end of the previous context snippet
            } else {
                $append = false;
            }
            $end = $utf8_idx + $utf8_len + $post; // now set it to the end of this context

            if ($append) {
                $snippets[count($snippets) - 1] .= utf8_substr($text, $append, $end - $append);
            } else {
                $snippets[] = utf8_substr($text, $start, $end - $start);
            }

            // set $offset for next match attempt
            // subtract strlen to avoid splitting a potential search success,
            // this is an approximation as the search pattern may match strings
            // of varying length and it will fail if the context snippet
            // boundary breaks a matching string longer than the current match
            $utf8_offset = $utf8_idx + $post;
            $offset = $idx + strlen(utf8_substr($text, $utf8_idx, $post));
            $offset = utf8_correctIdx($text, $offset);
        }

        // mark the hits with a control character first, escape the text,
        // then turn the marked stretches into the highlighting markup
        $m = "\1";
        $snippets = preg_replace('/' . $re1 . '/iu', $m . '$1' . $m, $snippets);
        $snippet = preg_replace(
            '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
            '<strong class="search_hit">$1</strong>',
            hsc(join('... ', $snippets))
        );

        $evdata['snippet'] = $snippet;
    }
    $evt->advise_after();
    unset($evt);

    return $evdata['snippet'];
}
387
/**
 * Wraps a search term in regex boundary checks.
 *
 * A leading or trailing escaped asterisk ("\*") is removed and suppresses the
 * word boundary on that side; otherwise "\b" is added. Terms matching
 * IDX_ASIAN are returned untouched.
 *
 * @param string $term the already regex-quoted search term
 * @return string the term ready for use inside a pattern
 */
function ft_snippet_re_preprocess($term)
{
    // do not process asian terms where word boundaries are not explicit
    if (preg_match('/' . IDX_ASIAN . '/u', $term)) {
        return $term;
    }

    // left side first; the right side is checked on the already adjusted term
    $term = (substr($term, 0, 2) == '\\*') ? substr($term, 2) : '\b' . $term;
    $term = (substr($term, -2, 2) == '\\*') ? substr($term, 0, -2) : $term . '\b';

    return $term;
}
410
/**
 * Combine found documents and sum up their scores
 *
 * This function is used to combine searched words with a logical
 * AND. Only documents available in all arrays are returned.
 *
 * based upon PEAR's PHP_Compat function for array_intersect_key()
 *
 * @param array $args An array of page arrays
 * @return array page id => summed score for pages present in every array
 */
function ft_resultCombine($args)
{
    $total = count($args);
    if ($total == 1) {
        return $args[0];
    }
    if ($total < 1) {
        return array();
    }

    $combined = array();
    foreach ($args[0] as $page => $score) {
        $in_all = true;
        for ($n = 1; $n < $total; $n++) {
            if (!isset($args[$n][$page])) {
                $in_all = false;
                break;
            }
            $score += $args[$n][$page];
        }
        if ($in_all) {
            $combined[$page] = $score;
        }
    }
    return $combined;
}
442
/**
 * Unites found documents and sum up their scores
 *
 * This is the logical OR: every page found in any of the arrays is returned;
 * the scores of pages found in several arrays are added up.
 *
 * based upon ft_resultCombine() function
 *
 * @param array $args An array of page arrays
 * @return array page id => summed score; an empty array for empty input
 * @author Kazutaka Miyasaka
 */
function ft_resultUnite($args)
{
    $array_count = count($args);
    if ($array_count === 0) {
        // nothing to unite; without this guard $args[0] would be read and
        // the loop below could never reach its end condition
        return array();
    }
    if ($array_count === 1) {
        return $args[0];
    }

    // a non-array operand (e.g. null from an incomplete query) counts as empty
    $result = is_array($args[0]) ? $args[0] : array();
    for ($i = 1; $i < $array_count; $i++) {
        if (!is_array($args[$i])) {
            continue;
        }
        foreach ($args[$i] as $id => $score) {
            // pages seen for the first time start with this score instead of
            // being added to a not yet existing entry
            $result[$id] = isset($result[$id]) ? $result[$id] + $score : $score;
        }
    }
    return $result;
}
465
/**
 * Computes the difference of documents using page id for comparison
 *
 * Returns the first array without every page that is set in any of the
 * following arrays.
 *
 * nearly identical to PHP5's array_diff_key()
 *
 * @param array $args An array of page arrays
 * @return array the remaining entries of the first array
 * @author Kazutaka Miyasaka
 */
function ft_resultComplement($args)
{
    $total = count($args);
    if ($total === 1) {
        return $args[0];
    }

    $remaining = $args[0];
    $page_ids  = array_keys($remaining);
    foreach ($page_ids as $page) {
        for ($n = 1; $n !== $total; $n++) {
            if (isset($args[$n][$page])) {
                unset($remaining[$page]);
            }
        }
    }
    return $remaining;
}
488
/**
 * Parses a search query and builds an array of search formulas
 *
 * @author Andreas Gohr
 * @author Kazutaka Miyasaka
 *
 * @param object $Indexer the indexer, handed through to ft_termParser()
 * @param string $query   the search query as entered
 * @return array with the keys 'query' (original query), 'parsed_str'
 *               (intermediate representation), 'parsed_ary' (the same in
 *               postfix order) and the lists 'words', 'phrases', 'highlight',
 *               'ns', 'notns', 'and' and 'not'
 */
function ft_queryParser($Indexer, $query)
{
    /**
     * parse a search query and transform it into intermediate representation
     *
     * in a search query, you can use the following expressions:
     *
     *   words:
     *     include
     *     -exclude
     *   phrases:
     *     "phrase to be included"
     *     -"phrase you want to exclude"
     *   namespaces:
     *     @include:namespace (or ns:include:namespace)
     *     ^exclude:namespace (or -ns:exclude:namespace)
     *   groups:
     *     ()
     *     -()
     *   operators:
     *     and ('and' is the default operator: you can always omit this)
     *     or  (or pipe symbol '|', lower precedence than 'and')
     *
     * e.g. a query [ aa "bb cc" @dd:ee ] means "search pages which contain
     *      a word 'aa', a phrase 'bb cc' and are within a namespace 'dd:ee'".
     *      this query is equivalent to [ -(-aa or -"bb cc" or -ns:dd:ee) ]
     *      as long as you don't mind hit counts.
     *
     * intermediate representation consists of the following parts:
     *
     *   ( )           - group
     *   AND           - logical and
     *   OR            - logical or
     *   NOT           - logical not
     *   W+:, W-:, W_: - word      (underscore: no need to highlight)
     *   P+:, P-:      - phrase    (minus sign: logically in NOT group)
     *   N+:, N-:      - namespace
     */
    $parsed_query = '';
    $parens_level = 0;
    $terms = preg_split(
        '/(-?".*?")/u',
        utf8_strtolower($query),
        -1,
        PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
    );

    foreach ($terms as $term) {
        $parsed = '';
        if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) {
            // phrase-include and phrase-exclude
            $not = $matches[1] ? 'NOT' : '';
            $parsed = $not . ft_termParser($Indexer, $matches[2], false, true);
        } else {
            // fix incomplete phrase
            $term = str_replace('"', ' ', $term);

            // fix parentheses
            $term = str_replace(')', ' ) ', $term);
            $term = str_replace('(', ' ( ', $term);
            $term = str_replace('- (', ' -(', $term);

            // treat pipe symbols as 'OR' operators
            $term = str_replace('|', ' or ', $term);

            // treat ideographic spaces (U+3000) as search term separators
            // FIXME: some more separators?
            $term = preg_replace('/[ \x{3000}]+/u', ' ', $term);
            $term = trim($term);
            if ($term === '') {
                continue;
            }

            $tokens = explode(' ', $term);
            foreach ($tokens as $token) {
                if ($token === '(') {
                    // parenthesis-include-open
                    $parsed .= '(';
                    ++$parens_level;
                } elseif ($token === '-(') {
                    // parenthesis-exclude-open
                    $parsed .= 'NOT(';
                    ++$parens_level;
                } elseif ($token === ')') {
                    // parenthesis-any-close (unbalanced ones are dropped)
                    if ($parens_level === 0) {
                        continue;
                    }
                    $parsed .= ')';
                    $parens_level--;
                } elseif ($token === 'and') {
                    // logical-and (do nothing)
                } elseif ($token === 'or') {
                    // logical-or
                    $parsed .= 'OR';
                } elseif (preg_match('/^(?:\^|-ns:)(.+)$/u', $token, $matches)) {
                    // namespace-exclude
                    $parsed .= 'NOT(N+:' . $matches[1] . ')';
                } elseif (preg_match('/^(?:@|ns:)(.+)$/u', $token, $matches)) {
                    // namespace-include
                    $parsed .= '(N+:' . $matches[1] . ')';
                } elseif (preg_match('/^-(.+)$/', $token, $matches)) {
                    // word-exclude
                    $parsed .= 'NOT(' . ft_termParser($Indexer, $matches[1]) . ')';
                } else {
                    // word-include
                    $parsed .= ft_termParser($Indexer, $token);
                }
            }
        }
        $parsed_query .= $parsed;
    }

    // cleanup (very sensitive)
    $parsed_query .= str_repeat(')', $parens_level); // close what was left open
    do {
        $parsed_query_old = $parsed_query;
        $parsed_query = preg_replace('/(NOT)?\(\)/u', '', $parsed_query);
    } while ($parsed_query !== $parsed_query_old);
    $parsed_query = preg_replace('/(NOT|OR)+\)/u', ')', $parsed_query);
    $parsed_query = preg_replace('/(OR)+/u', 'OR', $parsed_query);
    $parsed_query = preg_replace('/\(OR/u', '(', $parsed_query);
    $parsed_query = preg_replace('/^OR|OR$/u', '', $parsed_query);
    $parsed_query = preg_replace('/\)(NOT)?\(/u', ')AND$1(', $parsed_query);

    // adjustment: make highlightings right
    $parens_level     = 0;
    $notgrp_levels    = array();
    $parsed_query_new = '';
    $tokens = preg_split('/(NOT\(|[()])/u', $parsed_query, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
    foreach ($tokens as $token) {
        if ($token === 'NOT(') {
            $notgrp_levels[] = ++$parens_level;
        } elseif ($token === '(') {
            ++$parens_level;
        } elseif ($token === ')') {
            if ($parens_level-- === end($notgrp_levels)) {
                array_pop($notgrp_levels);
            }
        } elseif (count($notgrp_levels) % 2 === 1) {
            // turn highlight-flag off if terms are logically in "NOT" group
            $token = preg_replace('/([WPN])\+\:/u', '$1-:', $token);
        }
        $parsed_query_new .= $token;
    }
    $parsed_query = $parsed_query_new;

    /**
     * convert infix notation string into postfix (Reverse Polish notation) array
     * by Shunting-yard algorithm
     *
     * see: http://en.wikipedia.org/wiki/Reverse_Polish_notation
     * see: http://en.wikipedia.org/wiki/Shunting-yard_algorithm
     */
    $parsed_ary     = array();
    $ope_stack      = array();
    $ope_precedence = array(')' => 1, 'OR' => 2, 'AND' => 3, 'NOT' => 4, '(' => 5);
    $ope_regex      = '/([()]|OR|AND|NOT)/u';

    $tokens = preg_split($ope_regex, $parsed_query, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
    foreach ($tokens as $token) {
        if (preg_match($ope_regex, $token)) {
            // operator
            $last_ope = end($ope_stack);
            // end() yields false on an empty stack; check that first instead
            // of using false as an index into the precedence table
            while (
                $last_ope !== false
                && $ope_precedence[$token] <= $ope_precedence[$last_ope]
                && $last_ope != '('
            ) {
                $parsed_ary[] = array_pop($ope_stack);
                $last_ope = end($ope_stack);
            }
            if ($token == ')') {
                array_pop($ope_stack); // this array_pop always deletes '('
            } else {
                $ope_stack[] = $token;
            }
        } else {
            // operand
            $token_decoded = str_replace(array('OP', 'CP'), array('(', ')'), $token);
            $parsed_ary[] = $token_decoded;
        }
    }
    $parsed_ary = array_values(array_merge($parsed_ary, array_reverse($ope_stack)));

    // cleanup: each double "NOT" in RPN array actually does nothing.
    // A "NOT" directly following a kept "NOT" cancels it, so no element that
    // was already removed is ever looked at again.
    $compacted = array();
    foreach ($parsed_ary as $token) {
        if ($token === 'NOT' && end($compacted) === 'NOT') {
            array_pop($compacted);
        } else {
            $compacted[] = $token;
        }
    }
    $parsed_ary = $compacted;

    // build return value
    $q = array();
    $q['query']      = $query;
    $q['parsed_str'] = $parsed_query;
    $q['parsed_ary'] = $parsed_ary;

    foreach ($q['parsed_ary'] as $token) {
        // operators carry no "X?:" prefix; "OR" is even too short to have a
        // third character, so check the length before reading it
        if (strlen($token) < 3 || $token[2] !== ':') {
            continue;
        }
        $body = substr($token, 3);

        switch (substr($token, 0, 3)) {
            case 'N+:':
                $q['ns'][] = $body; // for backward compatibility
                break;
            case 'N-:':
                $q['notns'][] = $body; // for backward compatibility
                break;
            case 'W_:':
                $q['words'][] = $body;
                break;
            case 'W-:':
                $q['words'][] = $body;
                $q['not'][]   = $body; // for backward compatibility
                break;
            case 'W+:':
                $q['words'][]     = $body;
                $q['highlight'][] = $body;
                $q['and'][]       = $body; // for backward compatibility
                break;
            case 'P-:':
                $q['phrases'][] = $body;
                break;
            case 'P+:':
                $q['phrases'][]   = $body;
                $q['highlight'][] = $body;
                break;
        }
    }

    // make sure every list exists and contains each entry only once
    foreach (array('words', 'phrases', 'highlight', 'ns', 'notns', 'and', 'not') as $key) {
        $q[$key] = empty($q[$key]) ? array() : array_values(array_unique($q[$key]));
    }

    return $q;
}
717
/**
 * Transforms given search term into intermediate representation
 *
 * This function is used in ft_queryParser() and not for general purpose use.
 *
 * @author Kazutaka Miyasaka
 *
 * @param object $Indexer        the indexer; its tokenizer() method is used
 * @param string $term           the term to transform
 * @param bool   $consider_asian split the term at runs of IDX_ASIAN
 *                               characters and handle every part on its own
 * @param bool   $phrase_mode    treat a multi-word term as a phrase
 * @return string the intermediate representation of the term
 */
function ft_termParser($Indexer, $term, $consider_asian = true, $phrase_mode = false)
{
    if ($consider_asian) {
        // successive asian characters need to be searched as a phrase
        $pieces = preg_split(
            '/(' . IDX_ASIAN . '+)/u',
            $term,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );
        $parsed = '';
        foreach ($pieces as $piece) {
            // once switched on, phrase mode stays on for the remaining pieces
            if (!$phrase_mode) {
                $phrase_mode = preg_match('/' . IDX_ASIAN . '/u', $piece);
            }
            $parsed .= ft_termParser($Indexer, $piece, false, $phrase_mode);
        }
        return $parsed;
    }

    $term_noparen = str_replace(array('(', ')'), ' ', $term);
    $words        = $Indexer->tokenizer($term_noparen, true);

    if (empty($words)) {
        return '()'; // important: do not remove
    }
    if ($words[0] === $term) {
        return '(W+:' . $words[0] . ')';
    }
    if ($phrase_mode) {
        // W_: no need to highlight, the phrase itself is highlighted;
        // parentheses inside the phrase are encoded as OP / CP
        $term_encoded = str_replace(array('(', ')'), array('OP', 'CP'), $term);
        return '((W_:' . implode(')(W_:', $words) . ')(P+:' . $term_encoded . '))';
    }
    return '((W+:' . implode(')(W+:', $words) . '))';
}
752
753	//Setup VIM: ex: et ts=4 :

Note: See TracBrowser for help on using the repository browser.

Download in other formats: