1 | <?php
|
---|
/**
 * DokuWiki fulltextsearch functions using the index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author  Andreas Gohr <[email protected]>
 */
8 |
|
---|
// Abort immediately unless the including code has defined DOKU_INC.
if (!defined('DOKU_INC')) {
    die('meh.');
}
10 |
|
---|
/**
 * Snippets are created for the first few results only; this constant holds
 * that number. It defaults to 15 and is left alone when already defined.
 */
if (!defined('FT_SNIPPET_NUMBER')) {
    define('FT_SNIPPET_NUMBER', 15);
}
15 |
|
---|
/**
 * The fulltext search
 *
 * Returns a list of matching documents for the given query.
 *
 * The work is split between this wrapper, _ft_pageSearch() and
 * trigger_event(): the wrapper packs its arguments into an event data array
 * and raises SEARCH_QUERY_FULLPAGE with _ft_pageSearch() as the handler
 * passed to trigger_event().
 *
 * @triggers SEARCH_QUERY_FULLPAGE
 *
 * @param mixed $query      the search query, handed on unchanged
 * @param mixed &$highlight bound by reference into the event data, so
 *                          whatever a handler stores under 'highlight' is
 *                          visible to the caller afterwards
 * @return mixed whatever trigger_event() returns
 */
function ft_pageSearch($query,&$highlight){
    $data = array(
        'query'     => $query,
        'highlight' => &$highlight,
    );

    return trigger_event('SEARCH_QUERY_FULLPAGE', $data, '_ft_pageSearch');
}
31 |
|
---|
/**
 * Returns a list of matching documents for the given query
 *
 * The query is parsed by ft_queryParser() into a postfix (RPN) token list,
 * which is then evaluated on a stack of page arrays (page id => hit count).
 * The surviving pages are filtered by visibility, ACL and existence and
 * returned sorted by hit count, highest first.
 *
 * @author Andreas Gohr <[email protected]>
 * @author Kazutaka Miyasaka <[email protected]>
 *
 * @param array &$data event data; 'query' is read and 'highlight' is
 *                     overwritten with the terms to highlight
 * @return array page id => hit count (empty when nothing matches)
 */
function _ft_pageSearch(&$data) {
    $Indexer = idx_get_indexer();

    // parse the given query
    $q = ft_queryParser($Indexer, $data['query']);
    $data['highlight'] = $q['highlight'];

    if (empty($q['parsed_ary'])) return array();

    // lookup all words found in the query
    $lookup = $Indexer->lookup($q['words']);

    // get all pages in this dokuwiki site (!: includes nonexistent pages)
    $pages_all = array();
    foreach ($Indexer->getPages() as $id) {
        $pages_all[$id] = 0; // base: 0 hit
    }

    // process the query
    $stack = array();
    foreach ($q['parsed_ary'] as $token) {
        switch (substr($token, 0, 3)) {
            case 'W+:':
            case 'W-:':
            case 'W_:': // word
                $word = substr($token, 3);
                // a word without an entry in the lookup result simply has no hits;
                // checking first avoids an undefined-key warning
                $stack[] = isset($lookup[$word]) ? (array) $lookup[$word] : array();
                break;
            case 'P+:':
            case 'P-:': // phrase
                $phrase = substr($token, 3);
                // since phrases are always parsed as ((W1)(W2)...(P)),
                // the end($stack) always points the pages that contain
                // all words in this phrase
                $pages = end($stack);
                $pages_matched = array();
                foreach (array_keys($pages) as $id) {
                    $text = utf8_strtolower(rawWiki($id));
                    if (strpos($text, $phrase) !== false) {
                        $pages_matched[$id] = 0; // phrase: always 0 hit
                    }
                }
                $stack[] = $pages_matched;
                break;
            case 'N+:':
            case 'N-:': // namespace
                // compare against "namespace:" so that a namespace only matches
                // pages inside it, not ids that merely share the prefix
                // (same approach as _ft_pageLookup())
                $ns = cleanID(substr($token, 3)) . ':';
                $pages_matched = array();
                foreach (array_keys($pages_all) as $id) {
                    if (strpos($id, $ns) === 0) {
                        $pages_matched[$id] = 0; // namespace: always 0 hit
                    }
                }
                $stack[] = $pages_matched;
                break;
            case 'AND': // and operation
                list($pages1, $pages2) = array_splice($stack, -2);
                $stack[] = ft_resultCombine(array($pages1, $pages2));
                break;
            case 'OR':  // or operation
                list($pages1, $pages2) = array_splice($stack, -2);
                $stack[] = ft_resultUnite(array($pages1, $pages2));
                break;
            case 'NOT': // not operation (unary)
                $pages = array_pop($stack);
                $stack[] = ft_resultComplement(array($pages_all, $pages));
                break;
        }
    }
    $docs = array_pop($stack);

    if (empty($docs)) return array();

    // check: settings, acls, existence
    foreach (array_keys($docs) as $id) {
        if (isHiddenPage($id) || auth_quickaclcheck($id) < AUTH_READ || !page_exists($id, '', false)) {
            unset($docs[$id]);
        }
    }

    // sort docs by count
    arsort($docs);

    return $docs;
}
123 |
|
---|
/**
 * Returns the backlinks for a given page
 *
 * Uses the metadata index: looks up $id under the 'relation_references'
 * key, drops every hit the current user may not read and returns the
 * remaining entries sorted.
 *
 * @param string $id id of the page whose backlinks are wanted
 * @return array sorted list of referencing entries (the lookup result is
 *               returned as is when it is empty)
 */
function ft_backlinks($id){
    $backlinks = idx_get_indexer()->lookupKey('relation_references', $id);

    if(!count($backlinks)) return $backlinks;

    // check ACL permissions
    foreach($backlinks as $key => $page){
        if(auth_quickaclcheck($page) < AUTH_READ){
            unset($backlinks[$key]);
        }
    }

    sort($backlinks);
    return $backlinks;
}
146 |
|
---|
/**
 * Returns the pages that use a given media file
 *
 * Does a quick lookup with the fulltext index, then scans the raw text of
 * every candidate page for {{...}} embeds that resolve to the media id.
 *
 * Aborts after $max found results
 *
 * @param string $id  the media id to look for
 * @param int    $max stop after this many pages; a falsy value means 1
 * @return array sorted list of page ids that embed the media file
 */
function ft_mediause($id,$max){
    if(!$max) $max = 1; // need to find at least one

    $users = array();

    // quick lookup of the mediafile
    // FIXME use metadata key lookup
    $media      = noNS($id);
    $hits       = idx_lookup(idx_tokenizer($media));
    $candidates = array_keys(ft_resultCombine(array_values($hits)));
    if(!count($candidates)) return $users;

    // go through all found pages
    $pattern = '/\{\{([^|}]*'.preg_quote($media,'/').'[^|}]*)(|[^}]+)?\}\}/i';
    foreach($candidates as $page){
        $ns = getNS($page);
        preg_match_all($pattern,rawWiki($page),$embeds);
        foreach($embeds[1] as $img){
            $img = trim($img);
            if(preg_match('/^https?:\/\//i',$img)) continue; // skip external images
            list($img) = explode('?',$img);                  // remove any parameters
            resolve_mediaid($ns,$img,$exists);               // resolve the possibly relative img

            if($img == $id){ // we have a match
                $users[] = $page;
                break;       // one hit per page is enough
            }
        }
        if(count($users) >= $max) break;
    }

    sort($users);
    return $users;
}
191 |
|
---|
192 |
|
---|
193 |
|
---|
/**
 * Quicksearch for pagenames
 *
 * By default it only matches the pagename and ignores the
 * namespace. This can be changed with the second parameter.
 * The third parameter allows to search in titles as well.
 *
 * The function always returns titles as well
 *
 * @triggers SEARCH_QUERY_PAGELOOKUP
 * @author Andreas Gohr <[email protected]>
 * @author Adrian Lang <[email protected]>
 *
 * @param string $id       the term to look for
 * @param bool   $in_ns    match against the namespace part as well
 * @param bool   $in_title match against titles as well
 * @return mixed whatever trigger_event() returns
 */
function ft_pageLookup($id, $in_ns=false, $in_title=false){
    $data = array(
        'id'         => $id,
        'in_ns'      => $in_ns,
        'in_title'   => $in_title,
        'has_titles' => true, // for plugin backward compatibility check
    );
    return trigger_event('SEARCH_QUERY_PAGELOOKUP', $data, '_ft_pageLookup');
}
212 |
|
---|
/**
 * Looks up pages by name (and optionally title) for ft_pageLookup()
 *
 * An "@word" part in the search term restricts the result to ids starting
 * with that namespace and is removed from the term. Hidden, nonexistent
 * and unreadable pages are dropped before the result is sorted with
 * ft_pagesorter().
 *
 * @param array &$data event data; 'id', 'in_ns' and 'in_title' are read
 * @return array page id => first heading
 */
function _ft_pageLookup(&$data){
    // split out original parameters
    $in_ns    = $data['in_ns'];
    $in_title = $data['in_title'];
    $id       = $data['id'];

    // optional namespace restriction, given as "@namespace"
    $ns = null;
    if (preg_match('/(?:^| )@(\w+)/', $id, $matches)) {
        $ns = cleanID($matches[1]) . ':';
        $id = str_replace($matches[0], '', $id);
    }

    $cleaned  = cleanID($id);
    $Indexer  = idx_get_indexer();
    $page_idx = $Indexer->getPages();

    $pages = array();
    if ($id !== '' && $cleaned !== '') {
        // match on the page id (with or without its namespace)
        foreach ($page_idx as $p_id) {
            $haystack = $in_ns ? $p_id : noNSorNS($p_id);
            if (strpos($haystack, $cleaned) === false) continue;
            if (isset($pages[$p_id])) continue;
            $pages[$p_id] = p_get_first_heading($p_id, METADATA_DONT_RENDER);
        }
        // match on the indexed title
        if ($in_title) {
            foreach ($Indexer->lookupKey('title', $id, '_ft_pageLookupTitleCompare') as $p_id) {
                if (isset($pages[$p_id])) continue;
                $pages[$p_id] = p_get_first_heading($p_id, METADATA_DONT_RENDER);
            }
        }
    }

    // keep only pages inside the requested namespace
    if ($ns !== null) {
        foreach (array_keys($pages) as $p_id) {
            if (strpos($p_id, $ns) !== 0) {
                unset($pages[$p_id]);
            }
        }
    }

    // discard hidden pages
    // discard nonexistent pages
    // check ACL permissions
    foreach (array_keys($pages) as $idx) {
        $keep = isVisiblePage($idx)
            && page_exists($idx)
            && auth_quickaclcheck($idx) >= AUTH_READ;
        if (!$keep) {
            unset($pages[$idx]);
        }
    }

    uksort($pages,'ft_pagesorter');
    return $pages;
}
265 |
|
---|
/**
 * Tiny helper function for comparing the searched title with the title
 * from the search index. It wraps stripos() with the argument order
 * swapped and the result turned into a boolean.
 *
 * @param string $search the term that was searched for
 * @param string $title  the title to check
 * @return bool true when $title contains $search (case-insensitively)
 */
function _ft_pageLookupTitleCompare($search, $title) {
    $position = stripos($title, $search);
    return $position !== false;
}
274 |
|
---|
/**
 * Sort pages based on their namespace level first, then on their string
 * values. This makes higher hierarchy pages rank higher than lower hierarchy
 * pages.
 *
 * @param string $a first page id
 * @param string $b second page id
 * @return int negative, zero or positive, as a sort callback expects
 */
function ft_pagesorter($a, $b){
    // the number of colons is the namespace depth of an id
    $depth_a = substr_count($a, ':');
    $depth_b = substr_count($b, ':');

    if ($depth_a !== $depth_b) {
        return ($depth_a < $depth_b) ? -1 : 1;
    }
    return strcmp($a, $b);
}
290 |
|
---|
/**
 * Creates a snippet extract
 *
 * Reads the raw page text, removes soft hyphens and, unless a
 * FULLTEXT_SNIPPET_CREATE handler prevents the default, collects up to four
 * text passages around matches of the highlight terms. The passages are
 * joined with '... ', escaped with hsc() and every hit is wrapped in
 * <strong class="search_hit">.
 *
 * @author Andreas Gohr <[email protected]>
 * @triggers FULLTEXT_SNIPPET_CREATE
 *
 * @param string $id        id of the page to create the snippet for
 * @param mixed  $highlight the terms to highlight (cast to an array)
 * @return string the 'snippet' entry of the event data
 */
function ft_snippet($id,$highlight){
    $text = str_replace("\xC2\xAD",'',rawWiki($id)); // remove soft-hyphens

    // text and highlight are passed by reference into the event data
    $evdata = array(
        'id'        => $id,
        'text'      => &$text,
        'highlight' => &$highlight,
        'snippet'   => '',
    );

    $evt = new Doku_Event('FULLTEXT_SNIPPET_CREATE',$evdata);
    if ($evt->advise_before()) {
        $match       = array();
        $snippets    = array();
        $utf8_offset = 0;
        $offset      = 0;
        $end         = 0;
        $len         = utf8_strlen($text);

        // build a regexp from the phrases to highlight
        $quoted = array_map('preg_quote_cb',array_filter((array) $highlight));
        $re1 = '('.join('|',array_map('ft_snippet_re_preprocess',$quoted)).')';
        $re2 = $re1.'.{0,75}(?!\\1)'.$re1;
        $re3 = $re1.'.{0,45}(?!\\1)'.$re1.'.{0,45}(?!\\1)(?!\\2)'.$re1;

        for ($round = 0; $round < 4; $round++) {
            // try the three-term pattern first, then two terms, then a single one
            $hit = preg_match('/'.$re3.'/iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)
                || preg_match('/'.$re2.'/iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)
                || preg_match('/'.$re1.'/iu',$text,$match,PREG_OFFSET_CAPTURE,$offset);
            if (!$hit) break;

            list($str,$idx) = $match[0];

            // convert $idx (a byte offset) into a utf8 character offset
            $utf8_idx = utf8_strlen(substr($text,0,$idx));
            $utf8_len = utf8_strlen($str);

            // establish context, 100 characters surrounding the match string
            // first look to see if we can go 100 either side,
            // then drop to 50 adding any excess if the other side can't go to 50,
            $pre  = min($utf8_idx-$utf8_offset,100);
            $post = min($len-$utf8_idx-$utf8_len,100);

            if ($pre<=50 && $post<=50) {
                // neither side exceeds 50, means the context is the whole string
                // make it so and break out of this loop - there is no need for the
                // complex snippet calculations
                $snippets = array($text);
                break;
            }

            if ($pre>50 && $post>50) {
                $pre = $post = 50;
            } elseif ($pre>50) {
                $pre = min($pre,100-$post);
            } else {
                $post = min($post,100-$pre);
            }

            // establish context start and end points, try to append to previous
            // context if possible
            $start  = $utf8_idx - $pre;
            $append = ($start < $end) ? $end : false; // still the end of the previous context snippet
            $end    = $utf8_idx + $utf8_len + $post;  // now set it to the end of this context

            if ($append) {
                $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
            } else {
                $snippets[] = utf8_substr($text,$start,$end-$start);
            }

            // set $offset for next match attempt
            // subtract strlen to avoid splitting a potential search success,
            // this is an approximation as the search pattern may match strings
            // of varying length and it will fail if the context snippet
            // boundary breaks a matching string longer than the current match
            $utf8_offset = $utf8_idx + $post;
            $offset      = $idx + strlen(utf8_substr($text,$utf8_idx,$post));
            $offset      = utf8_correctIdx($text,$offset);
        }

        // mark every hit with a control character pair, escape the text and
        // only then turn the markers into HTML
        $m = "\1";
        $snippets = preg_replace('/'.$re1.'/iu',$m.'$1'.$m,$snippets);
        $joined   = hsc(join('... ',$snippets));
        $snippet  = preg_replace('/'.$m.'([^'.$m.']*?)'.$m.'/iu','<strong class="search_hit">$1</strong>',$joined);

        $evdata['snippet'] = $snippet;
    }
    $evt->advise_after();
    unset($evt);

    return $evdata['snippet'];
}
387 |
|
---|
/**
 * Wraps a search term in regex boundary checks.
 *
 * A quoted asterisk (backslash followed by *) at the start or end of the
 * term is removed and no boundary is added on that side; every other side
 * gets a \b. Terms matching IDX_ASIAN are returned untouched.
 *
 * @param string $term the term to wrap
 * @return string the term as a regular expression fragment
 */
function ft_snippet_re_preprocess($term) {
    // do not process asian terms where word boundaries are not explicit
    if(preg_match('/'.IDX_ASIAN.'/u',$term)){
        return $term;
    }

    $wildcard = '\\*';

    // leading side: strip the wildcard or anchor at a word boundary
    $term = (substr($term,0,2) == $wildcard) ? substr($term,2) : '\b'.$term;

    // trailing side: same treatment
    $term = (substr($term,-2,2) == $wildcard) ? substr($term,0,-2) : $term.'\b';

    return $term;
}
410 |
|
---|
/**
 * Combine found documents and sum up their scores
 *
 * This function is used to combine searched words with a logical
 * AND. Only documents available in all arrays are returned.
 *
 * based upon PEAR's PHP_Compat function for array_intersect_key()
 *
 * @param array $args An array of page arrays
 * @return array the documents present in every page array with their
 *               scores added up; the single page array itself when only
 *               one is given
 */
function ft_resultCombine($args){
    $total = count($args);
    if($total == 1){
        return $args[0];
    }

    $combined = array();
    if ($total < 2) {
        return $combined;
    }

    foreach ($args[0] as $page => $score) {
        $in_all = true;
        for ($i = 1; $i < $total; $i++) {
            if (!isset($args[$i][$page])) {
                $in_all = false;
                break;
            }
            $score += $args[$i][$page];
        }
        if ($in_all) {
            $combined[$page] = $score;
        }
    }
    return $combined;
}
442 |
|
---|
/**
 * Unites found documents and sum up their scores
 *
 * This is the logical OR counterpart of ft_resultCombine(): every document
 * found in any of the page arrays is returned, and the scores of documents
 * found in several arrays are added up.
 *
 * @param array $args An array of page arrays
 * @return array page id => summed score; an empty array when $args is
 *               empty, the single page array itself when only one is given
 * @author Kazutaka Miyasaka <[email protected]>
 */
function ft_resultUnite($args) {
    $array_count = count($args);
    if ($array_count === 0) {
        // nothing to unite; the "$i !== $array_count" loop used before
        // would never have terminated for this input
        return array();
    }
    if ($array_count === 1) {
        return $args[0];
    }

    $result = (array) $args[0];
    for ($i = 1; $i < $array_count; $i++) {
        foreach ((array) $args[$i] as $id => $score) {
            // a page seen for the first time starts with this score instead
            // of being added onto a missing key
            if (isset($result[$id])) {
                $result[$id] += $score;
            } else {
                $result[$id] = $score;
            }
        }
    }
    return $result;
}
465 |
|
---|
/**
 * Computes the difference of documents using page id for comparison
 *
 * nearly identical to PHP5's array_diff_key()
 *
 * @param array $args An array of page arrays
 * @return array the first page array without the documents that are set
 *               in any of the other page arrays
 * @author Kazutaka Miyasaka <[email protected]>
 */
function ft_resultComplement($args) {
    $total = count($args);
    if ($total === 1) {
        return $args[0];
    }

    $remaining = $args[0];
    foreach (array_keys($remaining) as $page) {
        for ($i = 1; $i < $total; $i++) {
            if (isset($args[$i][$page])) {
                unset($remaining[$page]);
                break; // already removed, no need to check further arrays
            }
        }
    }
    return $remaining;
}
488 |
|
---|
/**
 * Parses a search query and builds an array of search formulas
 *
 * @author Andreas Gohr <[email protected]>
 * @author Kazutaka Miyasaka <[email protected]>
 *
 * @param object $Indexer the indexer, handed on to ft_termParser()
 * @param string $query   the search query as entered
 * @return array with the keys 'query' (the input), 'parsed_str' (infix
 *               intermediate representation), 'parsed_ary' (the same in
 *               postfix order) and the term lists 'words', 'phrases',
 *               'highlight', 'ns', 'notns', 'and' and 'not'
 */
function ft_queryParser($Indexer, $query){
    /**
     * parse a search query and transform it into intermediate representation
     *
     * in a search query, you can use the following expressions:
     *
     *   words:
     *     include
     *     -exclude
     *   phrases:
     *     "phrase to be included"
     *     -"phrase you want to exclude"
     *   namespaces:
     *     @include:namespace (or ns:include:namespace)
     *     ^exclude:namespace (or -ns:exclude:namespace)
     *   groups:
     *     ()
     *     -()
     *   operators:
     *     and ('and' is the default operator: you can always omit this)
     *     or  (or pipe symbol '|', lower precedence than 'and')
     *
     * e.g. a query [ aa "bb cc" @dd:ee ] means "search pages which contain
     *      a word 'aa', a phrase 'bb cc' and are within a namespace 'dd:ee'".
     *      this query is equivalent to [ -(-aa or -"bb cc" or -ns:dd:ee) ]
     *      as long as you don't mind hit counts.
     *
     * intermediate representation consists of the following parts:
     *
     *   ( )           - group
     *   AND           - logical and
     *   OR            - logical or
     *   NOT           - logical not
     *   W+:, W-:, W_: - word      (underscore: no need to highlight)
     *   P+:, P-:      - phrase    (minus sign: logically in NOT group)
     *   N+:, N-:      - namespace
     */
    $parsed_query = '';
    $parens_level = 0;
    $terms = preg_split('/(-?".*?")/u', utf8_strtolower($query), -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

    foreach ($terms as $term) {
        $parsed = '';
        if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) {
            // phrase-include and phrase-exclude
            $not = $matches[1] ? 'NOT' : '';
            $parsed = $not.ft_termParser($Indexer, $matches[2], false, true);
        } else {
            // fix incomplete phrase
            $term = str_replace('"', ' ', $term);

            // fix parentheses
            $term = str_replace(')'  , ' ) ', $term);
            $term = str_replace('('  , ' ( ', $term);
            $term = str_replace('- (', ' -(', $term);

            // treat pipe symbols as 'OR' operators
            $term = str_replace('|', ' or ', $term);

            // treat ideographic spaces (U+3000) as search term separators
            // FIXME: some more separators?
            $term = preg_replace('/[ \x{3000}]+/u', ' ',  $term);
            $term = trim($term);
            if ($term === '') continue;

            $tokens = explode(' ', $term);
            foreach ($tokens as $token) {
                if ($token === '(') {
                    // parenthesis-include-open
                    $parsed .= '(';
                    ++$parens_level;
                } elseif ($token === '-(') {
                    // parenthesis-exclude-open
                    $parsed .= 'NOT(';
                    ++$parens_level;
                } elseif ($token === ')') {
                    // parenthesis-any-close
                    if ($parens_level === 0) continue;
                    $parsed .= ')';
                    $parens_level--;
                } elseif ($token === 'and') {
                    // logical-and (do nothing)
                } elseif ($token === 'or') {
                    // logical-or
                    $parsed .= 'OR';
                } elseif (preg_match('/^(?:\^|-ns:)(.+)$/u', $token, $matches)) {
                    // namespace-exclude
                    $parsed .= 'NOT(N+:'.$matches[1].')';
                } elseif (preg_match('/^(?:@|ns:)(.+)$/u', $token, $matches)) {
                    // namespace-include
                    $parsed .= '(N+:'.$matches[1].')';
                } elseif (preg_match('/^-(.+)$/', $token, $matches)) {
                    // word-exclude
                    $parsed .= 'NOT('.ft_termParser($Indexer, $matches[1]).')';
                } else {
                    // word-include
                    $parsed .= ft_termParser($Indexer, $token);
                }
            }
        }
        $parsed_query .= $parsed;
    }

    // cleanup (very sensitive)
    $parsed_query .= str_repeat(')', $parens_level);
    do {
        $parsed_query_old = $parsed_query;
        $parsed_query = preg_replace('/(NOT)?\(\)/u', '', $parsed_query);
    } while ($parsed_query !== $parsed_query_old);
    $parsed_query = preg_replace('/(NOT|OR)+\)/u', ')'      , $parsed_query);
    $parsed_query = preg_replace('/(OR)+/u'      , 'OR'     , $parsed_query);
    $parsed_query = preg_replace('/\(OR/u'       , '('      , $parsed_query);
    $parsed_query = preg_replace('/^OR|OR$/u'    , ''       , $parsed_query);
    $parsed_query = preg_replace('/\)(NOT)?\(/u' , ')AND$1(', $parsed_query);

    // adjustment: make highlightings right
    $parens_level     = 0;
    $notgrp_levels    = array();
    $parsed_query_new = '';
    $tokens = preg_split('/(NOT\(|[()])/u', $parsed_query, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
    foreach ($tokens as $token) {
        if ($token === 'NOT(') {
            $notgrp_levels[] = ++$parens_level;
        } elseif ($token === '(') {
            ++$parens_level;
        } elseif ($token === ')') {
            if ($parens_level-- === end($notgrp_levels)) array_pop($notgrp_levels);
        } elseif (count($notgrp_levels) % 2 === 1) {
            // turn highlight-flag off if terms are logically in "NOT" group
            $token = preg_replace('/([WPN])\+\:/u', '$1-:', $token);
        }
        $parsed_query_new .= $token;
    }
    $parsed_query = $parsed_query_new;

    /**
     * convert infix notation string into postfix (Reverse Polish notation) array
     * by Shunting-yard algorithm
     *
     * see: http://en.wikipedia.org/wiki/Reverse_Polish_notation
     * see: http://en.wikipedia.org/wiki/Shunting-yard_algorithm
     */
    $parsed_ary     = array();
    $ope_stack      = array();
    $ope_precedence = array(')' => 1, 'OR' => 2, 'AND' => 3, 'NOT' => 4, '(' => 5);
    $ope_regex      = '/([()]|OR|AND|NOT)/u';

    $tokens = preg_split($ope_regex, $parsed_query, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
    foreach ($tokens as $token) {
        if (preg_match($ope_regex, $token)) {
            // operator
            $last_ope = end($ope_stack);
            // end() yields false on an empty stack; stop there instead of
            // using false as a key of the precedence table
            while ($last_ope !== false && $ope_precedence[$token] <= $ope_precedence[$last_ope] && $last_ope != '(') {
                $parsed_ary[] = array_pop($ope_stack);
                $last_ope = end($ope_stack);
            }
            if ($token == ')') {
                array_pop($ope_stack); // this array_pop always deletes '('
            } else {
                $ope_stack[] = $token;
            }
        } else {
            // operand
            $token_decoded = str_replace(array('OP', 'CP'), array('(', ')'), $token);
            $parsed_ary[] = $token_decoded;
        }
    }
    $parsed_ary = array_values(array_merge($parsed_ary, array_reverse($ope_stack)));

    // cleanup: each double "NOT" in RPN array actually does nothing
    $parsed_ary_count = count($parsed_ary);
    for ($i = 1; $i < $parsed_ary_count; ++$i) {
        // the previous element may already have been removed together with
        // its own predecessor, hence the isset() check
        if ($parsed_ary[$i] === 'NOT' && isset($parsed_ary[$i - 1]) && $parsed_ary[$i - 1] === 'NOT') {
            unset($parsed_ary[$i], $parsed_ary[$i - 1]);
        }
    }
    $parsed_ary = array_values($parsed_ary);

    // build return value
    $q = array();
    $q['query']      = $query;
    $q['parsed_str'] = $parsed_query;
    $q['parsed_ary'] = $parsed_ary;

    foreach ($q['parsed_ary'] as $token) {
        // operators such as 'OR' are shorter than a "X?:" prefix; check the
        // length before looking at the third character
        if (strlen($token) < 3 || $token[2] !== ':') continue;
        $body = substr($token, 3);

        switch (substr($token, 0, 3)) {
            case 'N+:':
                $q['ns'][]        = $body; // for backward compatibility
                break;
            case 'N-:':
                $q['notns'][]     = $body; // for backward compatibility
                break;
            case 'W_:':
                $q['words'][]     = $body;
                break;
            case 'W-:':
                $q['words'][]     = $body;
                $q['not'][]       = $body; // for backward compatibility
                break;
            case 'W+:':
                $q['words'][]     = $body;
                $q['highlight'][] = $body;
                $q['and'][]       = $body; // for backward compatibility
                break;
            case 'P-:':
                $q['phrases'][]   = $body;
                break;
            case 'P+:':
                $q['phrases'][]   = $body;
                $q['highlight'][] = $body;
                break;
        }
    }
    // make sure every term list exists, without duplicates and reindexed
    foreach (array('words', 'phrases', 'highlight', 'ns', 'notns', 'and', 'not') as $key) {
        $q[$key] = empty($q[$key]) ? array() : array_values(array_unique($q[$key]));
    }

    return $q;
}
717 |
|
---|
/**
 * Transforms given search term into intermediate representation
 *
 * This function is used in ft_queryParser() and not for general purpose use.
 *
 * @author Kazutaka Miyasaka <[email protected]>
 *
 * @param object $Indexer        the indexer; its tokenizer() splits the term
 * @param string $term           the search term
 * @param bool   $consider_asian split the term at runs of IDX_ASIAN
 *                               characters and parse each part on its own
 * @param bool   $phrase_mode    emit a phrase (P+:) when the term is split
 *                               into several words
 * @return string the term in intermediate representation
 */
function ft_termParser($Indexer, $term, $consider_asian = true, $phrase_mode = false) {
    if ($consider_asian) {
        // successive asian characters need to be searched as a phrase
        $parsed = '';
        $chunks = preg_split('/('.IDX_ASIAN.'+)/u', $term, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
        foreach ($chunks as $chunk) {
            // once switched on, phrase mode stays on for the remaining chunks
            if (!$phrase_mode) {
                $phrase_mode = preg_match('/'.IDX_ASIAN.'/u', $chunk);
            }
            $parsed .= ft_termParser($Indexer, $chunk, false, $phrase_mode);
        }
        return $parsed;
    }

    // parentheses would clash with the grouping syntax, so tokenize without them
    $words = $Indexer->tokenizer(str_replace(array('(', ')'), ' ', $term), true);

    if (empty($words)) {
        return '()'; // important: do not remove
    }
    if ($words[0] === $term) {
        return '(W+:'.$words[0].')';
    }
    if ($phrase_mode) {
        // W_: no need to highlight, the phrase itself gets highlighted
        $term_encoded = str_replace(array('(', ')'), array('OP', 'CP'), $term);
        return '((W_:'.implode(')(W_:', $words).')(P+:'.$term_encoded.'))';
    }
    return '((W+:'.implode(')(W+:', $words).'))';
}
752 |
|
---|
//Setup VIM: ex: et ts=4 :