1 | /**************************************************************************
|
---|
2 | *
|
---|
3 | * mgquery.c -- The M G Q U E R Y program
|
---|
4 | * Copyright (C) 1994 Neil Sharman
|
---|
5 | *
|
---|
6 | * This program is free software; you can redistribute it and/or modify
|
---|
7 | * it under the terms of the GNU General Public License as published by
|
---|
8 | * the Free Software Foundation; either version 2 of the License, or
|
---|
9 | * (at your option) any later version.
|
---|
10 | *
|
---|
11 | * This program is distributed in the hope that it will be useful,
|
---|
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
14 | * GNU General Public License for more details.
|
---|
15 | *
|
---|
16 | * You should have received a copy of the GNU General Public License
|
---|
17 | * along with this program; if not, write to the Free Software
|
---|
18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
19 | *
|
---|
20 | * $Id: mgquery.c 7628 2004-06-22 04:17:51Z kjdon $
|
---|
21 | *
|
---|
22 | **************************************************************************/
|
---|
23 |
|
---|
24 | /*
|
---|
25 | $Log$
|
---|
26 | Revision 1.2 2004/06/22 04:17:51 kjdon
|
---|
27 | fixed a couple of compiler warnings
|
---|
28 |
|
---|
29 | Revision 1.1 2003/02/20 21:18:24 mdewsnip
|
---|
30 | Addition of MG package for search and retrieval
|
---|
31 |
|
---|
32 | Revision 1.1 1999/08/10 21:18:18 sjboddie
|
---|
33 | renamed mg-1.3d directory mg
|
---|
34 |
|
---|
35 | Revision 1.3 1999/01/08 00:33:46 rjmcnab
|
---|
36 |
|
---|
37 | Enabled mg and the library software to read in more than one index
|
---|
38 | at a time.
|
---|
39 |
|
---|
40 | Revision 1.2 1998/11/25 07:55:49 rjmcnab
|
---|
41 |
|
---|
42 | Modified mg to that you can specify the stemmer you want
|
---|
43 | to use via a command line option. You specify it to
|
---|
44 | mg_passes during the build process. The number of the
|
---|
45 | stemmer that you used is stored within the inverted
|
---|
46 | dictionary header and the stemmed dictionary header so
|
---|
47 | the correct stemmer is used in later stages of building
|
---|
48 | and querying.
|
---|
49 |
|
---|
50 | Revision 1.1 1998/11/17 09:35:29 rjmcnab
|
---|
51 | *** empty log message ***
|
---|
52 |
|
---|
53 | * Revision 1.3 1994/10/20 03:57:02 tes
|
---|
54 | * I have rewritten the boolean query optimiser and abstracted out the
|
---|
55 | * components of the boolean query.
|
---|
56 | *
|
---|
57 | * Revision 1.2 1994/09/20 04:41:58 tes
|
---|
58 | * For version 1.1
|
---|
59 | *
|
---|
60 | */
|
---|
61 |
|
---|
/* RCS identification keyword string for this source file. */
static char *RCSID =
  "$Id: mgquery.c 7628 2004-06-22 04:17:51Z kjdon $";
63 |
|
---|
64 | #include "sysfuncs.h"
|
---|
65 |
|
---|
66 | #if defined(HAVE_SYS_PROCFS_H) && defined(HAVE_PR_BRKSIZE) && \
|
---|
67 | (__STDC__ == 0)
|
---|
68 | /* STDC test was included to allow cc -Xc on SunOS 5 to work */
|
---|
69 | #define USE_PROCESS_MEM
|
---|
70 | #endif
|
---|
71 |
|
---|
72 | #ifdef USE_PROCESS_MEM
|
---|
73 | # include <sys/procfs.h>
|
---|
74 | #endif
|
---|
75 |
|
---|
76 | #ifdef HAVE_GETRUSAGE
|
---|
77 | # ifdef HAVE_SYS_TIME_H
|
---|
78 | # include <sys/time.h>
|
---|
79 | # endif
|
---|
80 | # include <sys/resource.h>
|
---|
81 | #endif
|
---|
82 |
|
---|
83 | #ifndef HAVE_GETPAGESIZE
|
---|
84 | # include "getpagesize.h"
|
---|
85 | #endif
|
---|
86 |
|
---|
87 | #if WITH_REGEX
|
---|
88 | # include <regex.h>
|
---|
89 | #else
|
---|
90 | # include <rx.h>
|
---|
91 | #endif
|
---|
92 |
|
---|
93 |
|
---|
94 | #include <stdarg.h>
|
---|
95 | #include <signal.h>
|
---|
96 |
|
---|
97 | #include "messages.h"
|
---|
98 | #include "timing.h"
|
---|
99 | #include "memlib.h"
|
---|
100 | #include "local_strings.h" /* [RPAP - Feb 97: Term Frequency] */
|
---|
101 |
|
---|
102 | #include "filestats.h"
|
---|
103 | #include "invf.h"
|
---|
104 | #include "text.h"
|
---|
105 | #include "mg.h"
|
---|
106 | #include "lists.h"
|
---|
107 | #include "backend.h"
|
---|
108 | #include "environment.h"
|
---|
109 | #include "globals.h"
|
---|
110 | #include "read_line.h"
|
---|
111 | #include "mg_errors.h"
|
---|
112 | #include "commands.h"
|
---|
113 | #include "text_get.h"
|
---|
114 | #include "term_lists.h"
|
---|
115 | #include "query_term_list.h"
|
---|
116 |
|
---|
117 |
|
---|
/* Current output and input streams; both start out unset. */
FILE *OutFile = NULL;
FILE *InFile = NULL;

/* Non-zero while InFile was opened as a pipe: get_query() then closes it
   with pclose() instead of fclose().  OutPipe is presumably the matching
   flag for OutFile -- confirm against the code that opens OutFile. */
int OutPipe = 0;
int InPipe = 0;

/* Tested by get_query(): once non-zero, the command loop stops waiting
   for a non-empty query. */
int Quitting = 0;
121 |
|
---|
122 | /* [RPAP - Feb 97: NZDL Additions] */
|
---|
123 | #if defined(PARADOCNUM) || defined(NZDL)
|
---|
124 | int GetDocNumFromParaNum(query_data *qd, int paranum) {
|
---|
125 | int Documents = qd->td->cth.num_of_docs;
|
---|
126 | int *Paragraph = qd->paragraph;
|
---|
127 | int low = 1, high = Documents;
|
---|
128 | int mid = (low+high)/2;
|
---|
129 |
|
---|
130 | while ((mid = (low+high)/2) >=1 && mid <= Documents)
|
---|
131 | {
|
---|
132 | if (paranum > Paragraph[mid])
|
---|
133 | low = mid+1;
|
---|
134 | else if (paranum <= Paragraph[mid-1])
|
---|
135 | high = mid-1;
|
---|
136 | else
|
---|
137 | return mid;
|
---|
138 | }
|
---|
139 | FatalError(1, "Bad paragraph number.\n");
|
---|
140 | }
|
---|
141 | #endif
|
---|
142 |
|
---|
143 | #ifdef TREC_MODE
|
---|
/* Table of TREC identifiers; ProcessDocs() reads it as fixed-width
   records of 14 characters each.  NULL until it has been set up. */
char *trec_ids = NULL;

/* Optional table indexed by (document number - 1) giving the record index
   into trec_ids; when NULL, ProcessDocs() uses the index unchanged. */
long *trec_paras = NULL;
146 | #endif
|
---|
147 |
|
---|
/* Flags shared between the signal handlers and the main program.  They
   are written from inside the handlers, so they use sig_atomic_t, the
   only object type the C standard guarantees a handler may store to. */

/* Cleared by SIGPIPE_handler() when the output pipe goes away. */
static volatile sig_atomic_t PagerRunning = 0;

/* Set by SIGINT_handler() when the user interrupts the program. */
static volatile sig_atomic_t Ctrl_C = 0;
150 |
|
---|
151 |
|
---|
152 | /*****************************************************************************/
|
---|
153 |
|
---|
/* Category of a statistics line.  Display_Stats() uses the value as an
   index into its table of section headings, so the order matters. */
typedef enum
{
  S_Time,
  S_Mem,
  S_Size,
  S_File
}
S_Type;

/* One recorded statistic: its category, a label and the formatted value.
   name and text are heap copies released by Clear_Stats(). */
struct Stat
{
  S_Type typ;
  char *name;
  char *text;
};

/* Growable array of recorded statistics and the number of entries in it. */
static struct Stat *Stats = NULL;
static int NumStats = 0;
168 |
|
---|
169 | static void
|
---|
170 | Clear_Stats (void)
|
---|
171 | {
|
---|
172 | if (Stats)
|
---|
173 | {
|
---|
174 | int i;
|
---|
175 | for (i = 0; i < NumStats; i++)
|
---|
176 | {
|
---|
177 | if (Stats[i].name)
|
---|
178 | Xfree (Stats[i].name);
|
---|
179 | if (Stats[i].text)
|
---|
180 | Xfree (Stats[i].text);
|
---|
181 | }
|
---|
182 | Xfree (Stats);
|
---|
183 | Stats = NULL;
|
---|
184 | NumStats = 0;
|
---|
185 | }
|
---|
186 | }
|
---|
187 |
|
---|
188 | static void
|
---|
189 | Add_Stats (S_Type typ, char *name, char *fmt,...)
|
---|
190 | {
|
---|
191 | char buf[1024];
|
---|
192 | va_list args;
|
---|
193 | va_start (args, fmt);
|
---|
194 | vsprintf (buf, fmt, args);
|
---|
195 | if (Stats)
|
---|
196 | Stats = Xrealloc (Stats, (++NumStats) * sizeof (*Stats));
|
---|
197 | else
|
---|
198 | Stats = Xmalloc ((++NumStats) * sizeof (*Stats));
|
---|
199 | Stats[NumStats - 1].typ = typ;
|
---|
200 | Stats[NumStats - 1].name = Xstrdup (name);
|
---|
201 | Stats[NumStats - 1].text = Xstrdup (buf);
|
---|
202 | }
|
---|
203 |
|
---|
204 | static void
|
---|
205 | Display_Stats (FILE * f)
|
---|
206 | {
|
---|
207 | static char *sep = "-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
|
---|
208 | "-=-=-=-=-=-=-=-=-=-=-";
|
---|
209 | char *names[] =
|
---|
210 | {"Time: ", "Memory:", "Sizes: ", "Disk: ", " "};
|
---|
211 | int i, last_typ = -1;
|
---|
212 | size_t len = 0;
|
---|
213 | if (NumStats == 0)
|
---|
214 | return;
|
---|
215 | fprintf (f, "%s\n", sep);
|
---|
216 | for (i = 0; i < NumStats; i++)
|
---|
217 | if (strlen (Stats[i].name) > len)
|
---|
218 | len = strlen (Stats[i].name);
|
---|
219 | for (i = 0; i < NumStats; i++)
|
---|
220 | {
|
---|
221 | int typ = 4;
|
---|
222 | if (Stats[i].typ != last_typ)
|
---|
223 | typ = last_typ = Stats[i].typ;
|
---|
224 | fprintf (f, "%s %-*s %s\n", names[typ], (int) len, Stats[i].name, Stats[i].text);
|
---|
225 | }
|
---|
226 | fprintf (f, "%s\n", sep);
|
---|
227 | }
|
---|
228 |
|
---|
229 | /*****************************************************************************/
|
---|
230 |
|
---|
231 |
|
---|
232 | static void
|
---|
233 | QueryTimeStats (ProgTime * Start, ProgTime * invf, ProgTime * text)
|
---|
234 | {
|
---|
235 | if (!BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
236 | {
|
---|
237 | Add_Stats (S_Time, "invf", ElapsedTime (Start, invf));
|
---|
238 | Add_Stats (S_Time, "text", ElapsedTime (invf, text));
|
---|
239 | }
|
---|
240 | Add_Stats (S_Time, "total", ElapsedTime (Start, text));
|
---|
241 | }
|
---|
242 |
|
---|
243 | static void
|
---|
244 | StartUpTimeStats (InitQueryTimes * iqt)
|
---|
245 | {
|
---|
246 | if (!BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
247 | {
|
---|
248 | Add_Stats (S_Time, "dict [stem]", ElapsedTime (&iqt->Start,
|
---|
249 | &iqt->StemDict));
|
---|
250 | Add_Stats (S_Time, "weights", ElapsedTime (&iqt->StemDict,
|
---|
251 | &iqt->ApproxWeights));
|
---|
252 | Add_Stats (S_Time, "dict [text]", ElapsedTime (&iqt->ApproxWeights,
|
---|
253 | &iqt->CompDict));
|
---|
254 | Add_Stats (S_Time, "Inverted", ElapsedTime (&iqt->CompDict,
|
---|
255 | &iqt->Invf));
|
---|
256 | Add_Stats (S_Time, "Compressed", ElapsedTime (&iqt->Invf,
|
---|
257 | &iqt->Text));
|
---|
258 | }
|
---|
259 | Add_Stats (S_Time, "total", ElapsedTime (&iqt->Start, &iqt->Text));
|
---|
260 | }
|
---|
261 |
|
---|
262 |
|
---|
263 |
|
---|
264 |
|
---|
265 | #ifdef USE_PROCESS_MEM
|
---|
266 | static u_long
|
---|
267 | process_mem (void)
|
---|
268 | {
|
---|
269 | prstatus_t pr;
|
---|
270 | static int fd = -1;
|
---|
271 | if (fd == -1)
|
---|
272 | {
|
---|
273 | char buf[128];
|
---|
274 | sprintf (buf, "/proc/%ld", (long) getpid ());
|
---|
275 | fd = open (buf, O_RDONLY);
|
---|
276 | }
|
---|
277 | if (fd == -1 || ioctl (fd, PIOCSTATUS, &pr) == -1)
|
---|
278 | return 0;
|
---|
279 | return pr.pr_brksize;
|
---|
280 | }
|
---|
281 | #endif
|
---|
282 |
|
---|
283 |
|
---|
284 |
|
---|
285 |
|
---|
286 | static void
|
---|
287 | MemStats (query_data * qd)
|
---|
288 | {
|
---|
289 | if (!BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
290 | {
|
---|
291 | #ifdef HAVE_GETRUSAGE
|
---|
292 | struct rusage rusage;
|
---|
293 | getrusage (RUSAGE_SELF, &rusage);
|
---|
294 |
|
---|
295 | Add_Stats (S_Mem, "process mem", "%7.3f Mb",
|
---|
296 | (double) (rusage.ru_maxrss * getpagesize () / 1024.0 / 1024.0));
|
---|
297 | #endif
|
---|
298 | #ifdef USE_PROCESS_MEM
|
---|
299 | Add_Stats (S_Mem, "process mem", "%7.3f Mb",
|
---|
300 | (double) (process_mem () / 1024.0 / 1024.0));
|
---|
301 | #endif
|
---|
302 | Add_Stats (S_Mem, "dict [stem]", "%7.1f kB",
|
---|
303 | (double) qd->sd->MemForStemDict / 1024);
|
---|
304 | Add_Stats (S_Mem, "dict [text]", "%7.1f kB",
|
---|
305 | (double) qd->cd->MemForCompDict / 1024);
|
---|
306 | if (qd->awd)
|
---|
307 | Add_Stats (S_Mem, "weights", "%7.1f kB",
|
---|
308 | (double) qd->awd->MemForWeights / 1024);
|
---|
309 | }
|
---|
310 | if (qd->awd)
|
---|
311 | Add_Stats (S_Mem, "total [peak]", "%7.1f kB",
|
---|
312 | (double) (qd->max_mem_in_use + qd->sd->MemForStemDict +
|
---|
313 | qd->cd->MemForCompDict + qd->awd->MemForWeights) / 1024);
|
---|
314 | else
|
---|
315 | Add_Stats (S_Mem, "total [peak]", "%7.1f kB",
|
---|
316 | (double) (qd->max_mem_in_use + qd->sd->MemForStemDict +
|
---|
317 | qd->cd->MemForCompDict) / 1024);
|
---|
318 |
|
---|
319 | }
|
---|
320 |
|
---|
321 |
|
---|
322 |
|
---|
323 | static void
|
---|
324 | SizeStats (query_data * qd)
|
---|
325 | {
|
---|
326 | Add_Stats (S_Size, "skips", "%7d", qd->hops_taken);
|
---|
327 | Add_Stats (S_Size, "pointers", "%7d", qd->num_of_ptrs);
|
---|
328 | Add_Stats (S_Size, "accumulators", "%7d", qd->num_of_accum);
|
---|
329 | Add_Stats (S_Size, "terms", "%7d", qd->num_of_terms);
|
---|
330 | Add_Stats (S_Size, "answers", "%7d", qd->num_of_ans);
|
---|
331 | Add_Stats (S_Size, "index lookups", "%7d", qd->text_idx_lookups);
|
---|
332 | }
|
---|
333 |
|
---|
334 | static void
|
---|
335 | TotalSizeStats (query_data * qd)
|
---|
336 | {
|
---|
337 | Add_Stats (S_Size, "skips", "%7d", qd->tot_hops_taken);
|
---|
338 | Add_Stats (S_Size, "pointers", "%7d", qd->tot_num_of_ptrs);
|
---|
339 | Add_Stats (S_Size, "accumulators", "%7d", qd->tot_num_of_accum);
|
---|
340 | Add_Stats (S_Size, "terms", "%7d", qd->tot_num_of_terms);
|
---|
341 | Add_Stats (S_Size, "answers", "%7d", qd->tot_num_of_ans);
|
---|
342 | Add_Stats (S_Size, "index lookups", "%7d", qd->tot_text_idx_lookups);
|
---|
343 | }
|
---|
344 |
|
---|
345 |
|
---|
346 | static void
|
---|
347 | StatFile (File * F)
|
---|
348 | {
|
---|
349 | static unsigned long NumBytes = 0, NumSeeks = 0, NumReads = 0;
|
---|
350 | if (F)
|
---|
351 | {
|
---|
352 | if ((int) F != -1)
|
---|
353 | {
|
---|
354 | if (!BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
355 | Add_Stats (S_File, F->name, "%7.1f kB (%3d seeks, %7d reads)",
|
---|
356 | (double) F->Current.NumBytes / 1024, F->Current.NumSeeks,
|
---|
357 | F->Current.NumReads);
|
---|
358 | NumBytes += F->Current.NumBytes;
|
---|
359 | NumSeeks += F->Current.NumSeeks;
|
---|
360 | NumReads += F->Current.NumReads;
|
---|
361 | }
|
---|
362 | else
|
---|
363 | {
|
---|
364 | Add_Stats (S_File, "total", "%7.1f kB (%3d seeks, %7d reads)",
|
---|
365 | (double) NumBytes / 1024, NumSeeks, NumReads);
|
---|
366 | NumSeeks = NumReads = NumBytes = 0;
|
---|
367 | }
|
---|
368 | }
|
---|
369 | }
|
---|
370 |
|
---|
371 |
|
---|
372 | static void
|
---|
373 | File_Stats (query_data * qd)
|
---|
374 | {
|
---|
375 | StatFile (qd->File_comp_dict);
|
---|
376 | StatFile (qd->File_fast_comp_dict);
|
---|
377 | StatFile (qd->File_text_idx_wgt);
|
---|
378 | StatFile (qd->File_text);
|
---|
379 | StatFile (qd->File_stem);
|
---|
380 |
|
---|
381 | /* [RPAP - Jan 97: Stem Index Change] */
|
---|
382 | if (qd->sd->sdh.indexed)
|
---|
383 | {
|
---|
384 | StatFile (qd->File_stem1);
|
---|
385 | StatFile (qd->File_stem2);
|
---|
386 | StatFile (qd->File_stem3);
|
---|
387 | }
|
---|
388 |
|
---|
389 | StatFile (qd->File_invf);
|
---|
390 | StatFile (qd->File_weight_approx);
|
---|
391 | StatFile (qd->File_text_idx);
|
---|
392 | StatFile ((File *) (-1));
|
---|
393 | }
|
---|
394 |
|
---|
395 |
|
---|
396 | char *
|
---|
397 | get_query (query_data * qd)
|
---|
398 | {
|
---|
399 | char *line, *LinePtr;
|
---|
400 | WritePrompt ();
|
---|
401 | do
|
---|
402 | {
|
---|
403 | do
|
---|
404 | {
|
---|
405 | line = GetMultiLine ();
|
---|
406 | if (line == NULL)
|
---|
407 | {
|
---|
408 | if (stdin == InFile)
|
---|
409 | return (NULL); /* EOF */
|
---|
410 | if (InPipe)
|
---|
411 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
412 | #ifdef __WIN32__
|
---|
413 | _pclose (InFile);
|
---|
414 | #else
|
---|
415 | pclose (InFile);
|
---|
416 | #endif
|
---|
417 | else
|
---|
418 | fclose (InFile);
|
---|
419 | InPipe = 0;
|
---|
420 | InFile = stdin;
|
---|
421 | }
|
---|
422 | }
|
---|
423 | while (line == NULL);
|
---|
424 | LinePtr = ProcessCommands (line, qd);
|
---|
425 | if (CommandsErrorStr)
|
---|
426 | fprintf (stderr, "%s\n", CommandsErrorStr);
|
---|
427 | }
|
---|
428 | while (*LinePtr == '\0' && !Quitting);
|
---|
429 | return (LinePtr);
|
---|
430 | }
|
---|
431 |
|
---|
432 |
|
---|
433 | /* This is executed when a SIGPIPE is detected
|
---|
434 | i.e. If some one quits out of the PAGER, this is executed */
|
---|
435 | #ifdef HAVE_SIGCONTEXT
|
---|
436 | static RETSIGTYPE
|
---|
437 | SIGPIPE_handler (int sig, int code,
|
---|
438 | struct sigcontext *scp, char *addr)
|
---|
439 | #else
|
---|
440 | static RETSIGTYPE
|
---|
441 | SIGPIPE_handler (int sig)
|
---|
442 | #endif
|
---|
443 | {
|
---|
444 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
445 | #ifdef __WIN32__
|
---|
446 | signal (sig, SIG_IGN);
|
---|
447 | #else
|
---|
448 | signal (SIGPIPE, SIG_IGN);
|
---|
449 | #endif
|
---|
450 | PagerRunning = 0;
|
---|
451 | }
|
---|
452 |
|
---|
453 | /* This is executed when a SIGINT (i.e. CTRL-C) is detected */
|
---|
454 | #ifdef HAVE_SIGCONTEXT
|
---|
455 | static RETSIGTYPE
|
---|
456 | SIGINT_handler (int sig, int code,
|
---|
457 | struct sigcontext *scp, char *addr)
|
---|
458 | #else
|
---|
459 | static RETSIGTYPE
|
---|
460 | SIGINT_handler (int sig)
|
---|
461 | #endif
|
---|
462 | {
|
---|
463 | Ctrl_C = 1;
|
---|
464 | }
|
---|
465 |
|
---|
466 |
|
---|
467 |
|
---|
/* Post-retrieval filter pattern taken from the most recent query line, or
   NULL when no filter is active.  Set and released by GetPostProc(). */
static char *post_proc = NULL;


/*
 * Extract a double-quoted post-processing pattern from a query line.
 *
 * Any previous pattern is discarded first.  If line contains at least two
 * double quotes, the text between the first and the last one becomes the
 * new pattern and the quoted section is cut out of line in place.  A
 * single stray quote is just deleted from line.
 *
 * Unless the "verbatim" environment setting is true, the pattern is also
 * compiled with re_comp().  re_comp() returns NULL on success and an
 * error message on failure, so the pattern is dropped (and the message
 * printed to stderr) only when a message comes back.  The previous test
 * was inverted and threw away every pattern that compiled successfully.
 *
 * The in-place deletions use memmove(), because source and destination
 * overlap and strcpy() is undefined for overlapping strings.
 */
void
GetPostProc (char *line)
{
  char *start, *finish;

  if (post_proc)
    {
      Xfree (post_proc);
      post_proc = NULL;
    }

  start = strchr (line, '\"');
  finish = strrchr (line, '\"');
  if (start == NULL)
    return;

  if (start == finish)
    {
      /* found a single speech mark. Delete It. */
      memmove (start, start + 1, strlen (start + 1) + 1);
      return;
    }

  /* found a pattern */
  *finish = '\0';
  post_proc = Xstrdup (start + 1);
  memmove (start, finish + 1, strlen (finish + 1) + 1);

  if (post_proc != NULL && BooleanEnv (GetEnv ("verbatim"), 1) == 0)
    {
      char *err = re_comp (post_proc);

      if (err)
        {
          fprintf (stderr, "%s\n", err);
          Xfree (post_proc);
          post_proc = NULL;
        }
    }
}
506 |
|
---|
507 | int
|
---|
508 | PostProc (char *UDoc, int verbatim)
|
---|
509 | {
|
---|
510 | if (!post_proc)
|
---|
511 | return 1;
|
---|
512 |
|
---|
513 | if (verbatim) {
|
---|
514 | return (strstr (UDoc, post_proc) != NULL);
|
---|
515 | }
|
---|
516 | return re_exec ((char *) UDoc);
|
---|
517 | }
|
---|
518 |
|
---|
519 |
|
---|
520 |
|
---|
521 | static DocEntry *
|
---|
522 | in_chain (int para, int ip, DocEntry * dc)
|
---|
523 | {
|
---|
524 | while (dc)
|
---|
525 | {
|
---|
526 | if (dc->DocNum - ip == para)
|
---|
527 | return dc;
|
---|
528 | dc = dc->Next;
|
---|
529 | }
|
---|
530 | return NULL;
|
---|
531 | }
|
---|
532 |
|
---|
533 | /* num should be greater than or equal to 1 */
|
---|
534 | int
|
---|
535 | RawDocOutput (query_data * qd, u_long num, FILE * Output)
|
---|
536 | {
|
---|
537 | static u_long last_pos = 0;
|
---|
538 | static u_char *c_buffer = 0;
|
---|
539 | static int buf_len = -1;
|
---|
540 | static u_char *uc_buffer = 0;
|
---|
541 | u_long pos, len;
|
---|
542 | int ULen;
|
---|
543 |
|
---|
544 | FetchDocStart (qd, num, &pos, &len);
|
---|
545 |
|
---|
546 | if ((int) len > buf_len)
|
---|
547 | {
|
---|
548 | if (c_buffer)
|
---|
549 | {
|
---|
550 | Xfree (c_buffer);
|
---|
551 | Xfree (uc_buffer);
|
---|
552 | }
|
---|
553 | if (!(c_buffer = Xmalloc (len)))
|
---|
554 | return -1;
|
---|
555 | if (!(uc_buffer = Xmalloc ((int) (qd->td->cth.ratio * 1.01 *
|
---|
556 | len) + 100)))
|
---|
557 | return -1;
|
---|
558 | buf_len = len;
|
---|
559 | }
|
---|
560 | if (last_pos != pos)
|
---|
561 | Fseek (qd->td->TextFile, pos, 0);
|
---|
562 | Fread (c_buffer, 1, len, qd->td->TextFile);
|
---|
563 | last_pos = pos + len;
|
---|
564 | DecodeText (qd->cd, c_buffer, len, uc_buffer, &ULen);
|
---|
565 | fwrite (uc_buffer, ULen, sizeof (u_char), Output);
|
---|
566 | return 0;
|
---|
567 | }
|
---|
568 |
|
---|
569 |
|
---|
/*
 * Write string to Output, expanding three two-character directives:
 *
 *   %n  intval as an unsigned decimal, when intvalid is non-zero
 *   %w  floatval in %f notation, when floatvalid is non-zero
 *   %%  a single '%'
 *
 * A %n or %w whose value is not valid is written out literally.  Every
 * other character, including a '%' followed by anything else, is copied
 * unchanged.
 */
void
StringOut (FILE * Output, char *string,
           int intvalid, unsigned long intval,
           int floatvalid, double floatval)
{
  char *p = string;

  while (*p != '\0')
    {
      /* Character following a '%', or NUL when *p is not a '%'. */
      char directive = (*p == '%') ? p[1] : '\0';

      switch (directive)
        {
        case 'n':
          if (intvalid)
            fprintf (Output, "%lu", intval);
          else
            fprintf (Output, "%%n");
          p += 2;
          break;
        case 'w':
          if (floatvalid)
            fprintf (Output, "%f", floatval);
          else
            fprintf (Output, "%%w");
          p += 2;
          break;
        case '%':
          fputc ('%', Output);
          p += 2;
          break;
        default:
          fputc (*p, Output);
          p++;
          break;
        }
    }
}
602 |
|
---|
603 |
|
---|
/*
 * Write a one-line header extracted from the start of a document.
 *
 *   Output        stream to write to
 *   UDoc, ULen    document text and its length in bytes
 *   heads_length  maximum number of characters to write
 *
 * Runs of white space and of the control bytes '\01' and '\03' are
 * collapsed into a single blank, and leading ones are dropped.  Output
 * stops at the first '\02' byte, at the end of the text, or once
 * heads_length characters have been written.
 *
 * Each byte is handled as unsigned char: passing a negative plain char to
 * isspace() is undefined.  The index has the same type as ULen so the
 * loop test does not mix signed and unsigned values.
 */
void
HeaderOut (FILE * Output, u_char * UDoc, unsigned long ULen, int heads_length)
{
  unsigned long i;
  int space = 1, num = 0;

  for (i = 0; i < ULen && num < heads_length; i++)
    {
      unsigned char c = UDoc[i];

      if (c == '\02')
        break;

      if (isspace (c) || c == '\01' || c == '\03')
        {
          if (!space)
            {
              fputc (' ', Output);
              num++;
            }
          space = 1;
        }
      else
        {
          space = 0;
          fputc (c, Output);
          num++;
        }
    }
}
631 |
|
---|
632 | /* [RPAP - Feb 97: NZDL Additions] */
|
---|
633 | #if defined(PARADOCNUM) || defined(NZDL)
|
---|
/* Print one "docnum.indexnum" line to output.  For query types 'R' and
   'A' the weight is appended; for every other type it is omitted. */
void PrintDocNum(FILE *output, char query_type,
                 int docnum, int indexnum, float weight)
{
  switch (query_type)
    {
    case 'R':
    case 'A':
      fprintf(output, "%7d.%-7d %6.4f\n", docnum, indexnum, weight);
      break;
    default:
      fprintf(output, "%7d.%-7d\n", docnum, indexnum);
      break;
    }
}
642 | #endif
|
---|
643 |
|
---|
644 | static int
|
---|
645 | ProcessDocs (query_data * qd, int num, int verbatim,
|
---|
646 | char OutputType, FILE * Output)
|
---|
647 | {
|
---|
648 | int max_buf = 0;
|
---|
649 | int DocCount = 0;
|
---|
650 | char *doc_sepstr = NULL;
|
---|
651 | char *para_sepstr = NULL;
|
---|
652 | char *para_start = NULL;
|
---|
653 | int heads_length = atoi (GetDefEnv ("heads_length", "50"));
|
---|
654 | char QueryType = get_query_type ();
|
---|
655 | int need_text = (OutputType == OUTPUT_TEXT || OutputType == OUTPUT_HILITE ||
|
---|
656 | OutputType == OUTPUT_HEADERS || OutputType == OUTPUT_SILENT ||
|
---|
657 | post_proc); /* [RJM June 1997 -- fixing post retrieval scan] */
|
---|
658 |
|
---|
659 | if (OutputType == OUTPUT_TEXT || OutputType == OUTPUT_HILITE)
|
---|
660 | {
|
---|
661 | if (QueryType == QUERY_APPROX || QueryType == QUERY_RANKED)
|
---|
662 | {
|
---|
663 | doc_sepstr = de_escape_string (
|
---|
664 | Xstrdup (GetDefEnv ("ranked_doc_sepstr",
|
---|
665 | "---------------------------------- %n %w\\n")));
|
---|
666 | }
|
---|
667 | else
|
---|
668 | {
|
---|
669 | doc_sepstr = de_escape_string (
|
---|
670 | Xstrdup (GetDefEnv ("doc_sepstr",
|
---|
671 | "---------------------------------- %n\\n")));
|
---|
672 | }
|
---|
673 | para_sepstr = de_escape_string (
|
---|
674 | Xstrdup (GetDefEnv ("para_sepstr",
|
---|
675 | "\\n######## PARAGRAPH %n ########\\n")));
|
---|
676 |
|
---|
677 | para_start = de_escape_string (
|
---|
678 | Xstrdup (GetDefEnv ("para_start",
|
---|
679 | "***** Weight = %w *****\\n")));
|
---|
680 | }
|
---|
681 |
|
---|
682 | if (need_text)
|
---|
683 | {
|
---|
684 | max_buf = atoi (GetDefEnv ("buffer", "1048576"));
|
---|
685 | }
|
---|
686 |
|
---|
687 | do
|
---|
688 | {
|
---|
689 | u_char *UDoc = NULL;
|
---|
690 | unsigned long ULen;
|
---|
691 |
|
---|
692 | if (need_text)
|
---|
693 | {
|
---|
694 | /* load the compressed text */
|
---|
695 | if (LoadCompressedText (qd, max_buf))
|
---|
696 | {
|
---|
697 | Message ("Unable to load compressed text.");
|
---|
698 | FatalError (1, "This is probably due to lack of memory.");
|
---|
699 | }
|
---|
700 |
|
---|
701 | /* uncompress the loaded text */
|
---|
702 | UDoc = GetDocText (qd, &ULen);
|
---|
703 | if (UDoc == NULL)
|
---|
704 | FatalError (1, "UDoc is unexpectedly NULL");
|
---|
705 | }
|
---|
706 |
|
---|
707 | if (!UDoc || PostProc ((char *) UDoc, verbatim))
|
---|
708 | {
|
---|
709 | switch (OutputType)
|
---|
710 | {
|
---|
711 | case OUTPUT_COUNT:
|
---|
712 | case OUTPUT_SILENT:
|
---|
713 | break;
|
---|
714 | case OUTPUT_DOCNUMS: /* This prints out the docnums string */
|
---|
715 | if (PagerRunning)
|
---|
716 | {
|
---|
717 |
|
---|
718 | /* [RPAP - Feb 97: NZDL Additions] */
|
---|
719 | #if defined(PARADOCNUM) || defined(NZDL)
|
---|
720 | int doc_num = GetDocNum(qd);
|
---|
721 |
|
---|
722 | if (qd->paragraph)
|
---|
723 | {
|
---|
724 | if (qd->id->ifh.InvfLevel == 3 &&
|
---|
725 | (QueryType == 'R' || QueryType == 'A'))
|
---|
726 | {
|
---|
727 | /* Print weights for each paragraph in document */
|
---|
728 |
|
---|
729 |
|
---|
730 | int true_doc_num = GetDocNumFromParaNum(qd, doc_num);
|
---|
731 |
|
---|
732 | /* Get number of paragraphs in this document */
|
---|
733 |
|
---|
734 | int num_paragraphs =
|
---|
735 | qd->paragraph[true_doc_num]-qd->paragraph[true_doc_num-1];
|
---|
736 |
|
---|
737 | int init_para = FetchInitialParagraph(qd->td,
|
---|
738 | doc_num);
|
---|
739 | DocEntry *de, *doc_chain = GetDocChain(qd);
|
---|
740 | int i;
|
---|
741 |
|
---|
742 | for (i = 0; i < num_paragraphs; i++)
|
---|
743 | {
|
---|
744 | if ((de = in_chain(i, init_para, doc_chain)))
|
---|
745 | PrintDocNum(Output, QueryType,
|
---|
746 | true_doc_num, init_para+i,
|
---|
747 | de->Weight);
|
---|
748 | }
|
---|
749 | }
|
---|
750 | else
|
---|
751 | PrintDocNum(Output, QueryType,
|
---|
752 | GetDocNumFromParaNum(qd, GetDocNum(qd)),
|
---|
753 | GetDocNum(qd),
|
---|
754 | GetDocWeight(qd));
|
---|
755 | }
|
---|
756 | else
|
---|
757 | {
|
---|
758 | PrintDocNum(Output, QueryType,
|
---|
759 | doc_num, doc_num, GetDocWeight(qd));
|
---|
760 | }
|
---|
761 | #else
|
---|
762 | fprintf (Output, "%7d %6.4f %7lu\n", GetDocNum (qd),
|
---|
763 | GetDocWeight (qd), GetDocCompLength (qd));
|
---|
764 | #endif
|
---|
765 | }
|
---|
766 | break;
|
---|
767 | case OUTPUT_HEADERS: /* This prints out the headers of the documents */
|
---|
768 | if (PagerRunning)
|
---|
769 | fprintf (Output, "%d ", GetDocNum (qd));
|
---|
770 | HeaderOut (Output, UDoc, ULen, heads_length);
|
---|
771 | if (PagerRunning)
|
---|
772 | fputc ('\n', Output);
|
---|
773 | break;
|
---|
774 | #if TREC_MODE
|
---|
775 | case OUTPUT_EXTRAS: /* This prints out the docnums string */
|
---|
776 | if (PagerRunning && trec_ids)
|
---|
777 | {
|
---|
778 | long DN, PN = GetDocNum (qd) - 1;
|
---|
779 | if (trec_paras)
|
---|
780 | DN = trec_paras[PN];
|
---|
781 | else
|
---|
782 | DN = PN;
|
---|
783 | fprintf (Output, "%-14.14s %8ld %10.5f\n",
|
---|
784 | &trec_ids[DN * 14], PN + 1, GetDocWeight (qd));
|
---|
785 | }
|
---|
786 | break;
|
---|
787 | #endif
|
---|
788 | case OUTPUT_TEXT:
|
---|
789 | case OUTPUT_HILITE:
|
---|
790 | {
|
---|
791 | int j, para = -1, curr_para = 0;
|
---|
792 | int init_para = -1;
|
---|
793 | DocEntry *de, *doc_chain = NULL;
|
---|
794 | register char ch = ' ';
|
---|
795 | register char lch = '\n';
|
---|
796 |
|
---|
797 | /* [RPAP - Feb 97: NZDL Additions] */
|
---|
798 | #if defined(PARADOCNUM) || defined(NZDL)
|
---|
799 | if (qd->id->ifh.InvfLevel == 3)
|
---|
800 | {
|
---|
801 | init_para = FetchInitialParagraph(qd->td, GetDocNum(qd));
|
---|
802 |
|
---|
803 | StringOut(Output, para_sepstr,
|
---|
804 | 1, init_para+curr_para,
|
---|
805 | 0, 0);
|
---|
806 |
|
---|
807 | }
|
---|
808 | else
|
---|
809 | StringOut(Output, doc_sepstr,
|
---|
810 | 1, GetDocNum(qd),
|
---|
811 | QueryType == 'A' || QueryType == 'R',
|
---|
812 | GetDocWeight(qd));
|
---|
813 |
|
---|
814 | #else
|
---|
815 | int p_on = 0;
|
---|
816 |
|
---|
817 | if (PagerRunning)
|
---|
818 | {
|
---|
819 | StringOut (Output, doc_sepstr,
|
---|
820 | 1, GetDocNum (qd),
|
---|
821 | QueryType == 'A' || QueryType == 'R',
|
---|
822 | GetDocWeight (qd));
|
---|
823 | }
|
---|
824 | if (qd->id->ifh.InvfLevel == 3)
|
---|
825 | {
|
---|
826 | init_para = FetchInitialParagraph (qd->td, GetDocNum (qd));
|
---|
827 | doc_chain = GetDocChain (qd);
|
---|
828 | para = GetDocNum (qd) - init_para;
|
---|
829 |
|
---|
830 | StringOut (Output, para_sepstr,
|
---|
831 | 1, curr_para + 1,
|
---|
832 | 0, 0);
|
---|
833 |
|
---|
834 | if ((de = in_chain (0, init_para, doc_chain)))
|
---|
835 | StringOut (Output, para_start,
|
---|
836 | 0, 0,
|
---|
837 | 1, de->Weight);
|
---|
838 |
|
---|
839 | if (doc_chain->DocNum - init_para == 0)
|
---|
840 | p_on = 1;
|
---|
841 | }
|
---|
842 | #endif
|
---|
843 | for (j = 0; j < ULen; j++)
|
---|
844 | {
|
---|
845 | ch = UDoc[j];
|
---|
846 | switch (ch)
|
---|
847 | {
|
---|
848 | case '\02':
|
---|
849 | break;
|
---|
850 | case '\01':
|
---|
851 | ch = '\n';
|
---|
852 | case '\03':
|
---|
853 | /* [RPAP - Feb 97: NZDL Additions] */
|
---|
854 | #if defined(PARADOCNUM) || defined(NZDL)
|
---|
855 | /* print paragraph numbers only if this is
|
---|
856 | a level 3 index */
|
---|
857 | if (qd->id->ifh.InvfLevel == 3)
|
---|
858 | {
|
---|
859 | curr_para++;
|
---|
860 | StringOut(Output, para_sepstr,
|
---|
861 | 1, init_para+curr_para,
|
---|
862 | 0, 0);
|
---|
863 | }
|
---|
864 | #else
|
---|
865 | p_on = 0;
|
---|
866 | curr_para++;
|
---|
867 | StringOut (Output, para_sepstr,
|
---|
868 | 1, curr_para + 1,
|
---|
869 | 0, 0);
|
---|
870 | lch = *(strchr (para_sepstr, '\0') - 1);
|
---|
871 | if ((de = in_chain (curr_para, init_para, doc_chain)))
|
---|
872 | StringOut (Output, para_start,
|
---|
873 | 0, 0,
|
---|
874 | 1, de->Weight);
|
---|
875 | if (doc_chain &&
|
---|
876 | doc_chain->DocNum - init_para == curr_para)
|
---|
877 | p_on = 1;
|
---|
878 | #endif
|
---|
879 | break;
|
---|
880 | default:
|
---|
881 | {
|
---|
882 | if (PagerRunning)
|
---|
883 | {
|
---|
884 | fputc (ch, Output);
|
---|
885 | /* [RPAP - Feb 97: NZDL Additions] */
|
---|
886 | #if !defined(PARADOCNUM) && !defined(NZDL)
|
---|
887 | if (p_on && isprint (ch))
|
---|
888 | {
|
---|
889 | fputc ('\b', Output);
|
---|
890 | fputc ('_', Output);
|
---|
891 | }
|
---|
892 | #endif
|
---|
893 | }
|
---|
894 |
|
---|
895 | lch = ch;
|
---|
896 | }
|
---|
897 | }
|
---|
898 | }
|
---|
899 | if (PagerRunning && lch != '\n')
|
---|
900 | fputc ('\n', Output);
|
---|
901 | /* [RPAP - Feb 97: NZDL Additions] */
|
---|
902 | #if !defined(PARADOCNUM) && !defined(NZDL)
|
---|
903 | p_on = 0;
|
---|
904 | #endif
|
---|
905 | }
|
---|
906 | }
|
---|
907 | if (PagerRunning)
|
---|
908 | fflush (Output);
|
---|
909 |
|
---|
910 | DocCount++; /* moved within if statement [RJM June 1997 -- fixing post retrieval scan] */
|
---|
911 | }
|
---|
912 | }
|
---|
913 | while (NextDoc (qd) && PagerRunning && (!Ctrl_C));
|
---|
914 |
|
---|
915 | if (need_text)
|
---|
916 | {
|
---|
917 | FreeTextBuffer (qd);
|
---|
918 | }
|
---|
919 |
|
---|
920 | if (OutputType == OUTPUT_TEXT || OutputType == OUTPUT_HILITE)
|
---|
921 | {
|
---|
922 | Xfree (doc_sepstr);
|
---|
923 | Xfree (para_sepstr);
|
---|
924 | Xfree (para_start);
|
---|
925 | }
|
---|
926 |
|
---|
927 | return (DocCount);
|
---|
928 | }
|
---|
929 |
|
---|
930 |
|
---|
/* Write the "terminator" environment string (empty by default) to out,
   after running it through de_escape_string().  A private copy is used
   so the stored setting itself is not modified. */
void
output_terminator (FILE * out)
{
  char *term_copy;

  term_copy = Xstrdup (GetDefEnv ("terminator", ""));
  de_escape_string (term_copy);
  fputs (term_copy, out);
  Xfree (term_copy);
}
939 |
|
---|
940 |
|
---|
941 |
|
---|
942 |
|
---|
943 | /* MoreDocs () */
|
---|
944 | /* Displays all documents in list DocList. */
|
---|
945 | /* Documents are fetched, then decompressed and displayed according to the */
|
---|
946 | /* format implied in FormString(). */
|
---|
947 |
|
---|
948 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
949 | #ifdef __WIN32__
|
---|
950 | # define HILITE_PAGER "mg_hilite_words.exe"
|
---|
951 | #else
|
---|
952 | # define HILITE_PAGER "mg_hilite_words"
|
---|
953 | #endif
|
---|
954 |
|
---|
955 | #define MAX_HILITE_PAGER_STR 80 /* for command & its options */
|
---|
956 |
|
---|
957 | static void
|
---|
958 | MoreDocs (query_data * qd, char *Query, char OutputType)
|
---|
959 | {
|
---|
960 | static char terms_str[MAXTERMSTRLEN + 1];
|
---|
961 | int DocCount = 0; /* number of actual matches */
|
---|
962 | FILE *Output = NULL;
|
---|
963 | int using_pipe = 0;
|
---|
964 | char *pager = NULL;
|
---|
965 |
|
---|
966 | Ctrl_C = 0;
|
---|
967 |
|
---|
968 | qd->num_of_ans = qd->DL->num;
|
---|
969 |
|
---|
970 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
971 | #ifndef __WIN32__
|
---|
972 | signal (SIGPIPE, SIGPIPE_handler);
|
---|
973 | #endif
|
---|
974 | signal (SIGINT, SIGINT_handler);
|
---|
975 |
|
---|
976 | PagerRunning = 1;
|
---|
977 | if (isatty (fileno (OutFile)) && GetEnv ("pager") &&
|
---|
978 | OutputType != OUTPUT_HILITE &&
|
---|
979 | OutputType != OUTPUT_SILENT && OutputType != OUTPUT_COUNT)
|
---|
980 | {
|
---|
981 | pager = GetEnv ("pager");
|
---|
982 | }
|
---|
983 | else if (isatty (fileno (OutFile)) && OutputType == OUTPUT_HILITE)
|
---|
984 | {
|
---|
985 | /* concat the pager and its word argument strings */
|
---|
986 | ConvertTermsToString (qd->TL, terms_str);
|
---|
987 | pager = Xmalloc (MAX_HILITE_PAGER_STR + strlen (terms_str) + 1);
|
---|
988 | if (!pager)
|
---|
989 | {
|
---|
990 | fprintf (stderr, "Unable to allocate memory for highlighting\n");
|
---|
991 | return;
|
---|
992 | }
|
---|
993 | sprintf (pager, "%s --style=%s --pager=%s --stem_method=%ld --stemmer=%ld %s",
|
---|
994 | HILITE_PAGER,
|
---|
995 | GetEnv ("hilite_style"),
|
---|
996 | GetEnv ("pager"),
|
---|
997 | qd->sd->sdh.stem_method,
|
---|
998 | qd->sd->sdh.stemmer_num,
|
---|
999 | terms_str);
|
---|
1000 |
|
---|
1001 | }
|
---|
1002 | else
|
---|
1003 | {
|
---|
1004 | Output = OutFile;
|
---|
1005 | }
|
---|
1006 |
|
---|
1007 | /* [RPAP - Feb 97: NZDL Additions] */
|
---|
1008 | #if defined(OUTPUTSTEMMEDWORDS) || defined(NZDL)
|
---|
1009 | if (!isatty(fileno(OutFile)) && get_query_type() != QUERY_DOCNUMS)
|
---|
1010 | {
|
---|
1011 | ConvertTermsToString(qd->TL, terms_str);
|
---|
1012 | fprintf(Output, "%s\n", terms_str);
|
---|
1013 | }
|
---|
1014 | #endif
|
---|
1015 | if (pager)
|
---|
1016 | {
|
---|
1017 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
1018 | #ifdef __WIN32__
|
---|
1019 | Output = _popen (pager, "w");
|
---|
1020 | #else
|
---|
1021 | Output = popen (pager, "w");
|
---|
1022 | #endif
|
---|
1023 | using_pipe = (Output != NULL);
|
---|
1024 | if (!using_pipe)
|
---|
1025 | {
|
---|
1026 | fprintf (stderr, "Unable to run \"%s\"\n", pager);
|
---|
1027 | return;
|
---|
1028 | }
|
---|
1029 | }
|
---|
1030 |
|
---|
1031 |
|
---|
1032 | if (qd->DL->num > 0)
|
---|
1033 | {
|
---|
1034 | if (OutputType == OUTPUT_COUNT && !post_proc)
|
---|
1035 | DocCount = qd->DL->num;
|
---|
1036 | else {
|
---|
1037 | DocCount = ProcessDocs (qd, qd->DL->num,
|
---|
1038 | BooleanEnv (GetEnv ("verbatim"), 1),
|
---|
1039 | OutputType, Output);
|
---|
1040 | }
|
---|
1041 | }
|
---|
1042 |
|
---|
1043 | if (PagerRunning)
|
---|
1044 | {
|
---|
1045 | output_terminator (Output);
|
---|
1046 | fflush (Output);
|
---|
1047 | }
|
---|
1048 |
|
---|
1049 | if (OutputType == OUTPUT_HILITE && pager)
|
---|
1050 | free (pager); /* as needed to malloc to create the pager string */
|
---|
1051 |
|
---|
1052 | if (using_pipe)
|
---|
1053 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
1054 | #ifdef __WIN32__
|
---|
1055 | _pclose (Output);
|
---|
1056 | #else
|
---|
1057 | pclose (Output);
|
---|
1058 | #endif
|
---|
1059 |
|
---|
1060 | if (qd->DL->num == 0)
|
---|
1061 | fprintf (stderr, "No entries correspond to that query.\n");
|
---|
1062 | else
|
---|
1063 | {
|
---|
1064 | if (OutputType == OUTPUT_COUNT)
|
---|
1065 | fprintf (stderr, "%d documents match.\n", DocCount);
|
---|
1066 | else
|
---|
1067 | fprintf (stderr, "%d documents retrieved.\n", DocCount);
|
---|
1068 | }
|
---|
1069 |
|
---|
1070 | signal (SIGINT, SIG_DFL);
|
---|
1071 | }
|
---|
1072 |
|
---|
1073 |
|
---|
1074 | void
|
---|
1075 | start_up_stats (query_data * qd, InitQueryTimes iqt)
|
---|
1076 | {
|
---|
1077 | Clear_Stats ();
|
---|
1078 | if (BooleanEnv (GetEnv ("timestats"), 0) ||
|
---|
1079 | BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
1080 | StartUpTimeStats (&iqt);
|
---|
1081 |
|
---|
1082 | if (BooleanEnv (GetEnv ("diskstats"), 0) ||
|
---|
1083 | BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
1084 | File_Stats (qd);
|
---|
1085 |
|
---|
1086 | if (BooleanEnv (GetEnv ("memstats"), 0) ||
|
---|
1087 | BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
1088 | MemStats (qd);
|
---|
1089 |
|
---|
1090 | }
|
---|
1091 |
|
---|
1092 |
|
---|
1093 | void
|
---|
1094 | shut_down_stats (query_data * qd, ProgTime * start,
|
---|
1095 | ProgTime * invf, ProgTime * text)
|
---|
1096 | {
|
---|
1097 | Clear_Stats ();
|
---|
1098 | if (BooleanEnv (GetEnv ("timestats"), 0) ||
|
---|
1099 | BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
1100 | QueryTimeStats (start, invf, text);
|
---|
1101 |
|
---|
1102 | if (BooleanEnv (GetEnv ("diskstats"), 0) ||
|
---|
1103 | BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
1104 | {
|
---|
1105 | TransFileStats (qd);
|
---|
1106 | File_Stats (qd);
|
---|
1107 | }
|
---|
1108 |
|
---|
1109 | if (BooleanEnv (GetEnv ("sizestats"), 0))
|
---|
1110 | TotalSizeStats (qd);
|
---|
1111 | }
|
---|
1112 |
|
---|
1113 |
|
---|
1114 |
|
---|
/*
 * wordfreqword2str ()
 *
 * Converts a length-prefixed word (first byte = number of bytes that
 * follow) into a NUL-terminated C string.
 *
 * The result lives in a static buffer that is overwritten by the next
 * call, so it must be used or copied before calling again.
 */
char *
wordfreqword2str (u_char * s)
{
  static char buf[1024];
  int remaining = (int) s[0];	/* the length byte */
  u_char *src = s + 1;		/* first byte of the word itself */
  char *dst = buf;

  while (remaining > 0)
    {
      *dst++ = (char) *src++;
      remaining--;
    }
  *dst = '\0';

  return buf;
}
|
---|
1128 |
|
---|
1129 |
|
---|
1130 | /* [RPAP - Feb 97: Term Frequency] */
|
---|
1131 | /*********************************
|
---|
1132 | * PrintQueryTermFreq
|
---|
1133 | *
|
---|
1134 | * Prints the query terms and their respective frequencies within the collection
|
---|
1135 | *********************************/
|
---|
1136 | void
|
---|
1137 | PrintQueryTermFreqs (QueryTermList *qtl)
|
---|
1138 | {
|
---|
1139 | int i;
|
---|
1140 |
|
---|
1141 | /* Print the number of terms */
|
---|
1142 | fprintf (OutFile, "%d\n", qtl->num);
|
---|
1143 |
|
---|
1144 | /* Print the terms and their respective frequency within the collection */
|
---|
1145 | for (i = 0; i < qtl->num; i++)
|
---|
1146 | if (qtl->QTE[i].stem_method == -1)
|
---|
1147 | /* Using default stem method - don't print stem method beside term */
|
---|
1148 | fprintf (OutFile, "%s %d\n", wordfreqword2str (qtl->QTE[i].Term), qtl->QTE[i].Count);
|
---|
1149 | else
|
---|
1150 | /* Term was forced with a stem, print stem method with term */
|
---|
1151 | fprintf (OutFile, "%s#%d %d\n", wordfreqword2str (qtl->QTE[i].Term), qtl->QTE[i].stem_method, qtl->QTE[i].Count);
|
---|
1152 | }
|
---|
1153 |
|
---|
1154 |
|
---|
1155 | void
|
---|
1156 | query (void)
|
---|
1157 | {
|
---|
1158 | ProgTime TotalStartTime, TotalInvfTime, TotalTextTime;
|
---|
1159 | InitQueryTimes iqt;
|
---|
1160 | query_data *qd;
|
---|
1161 |
|
---|
1162 | TotalStartTime.RealTime = TotalStartTime.CPUTime = 0;
|
---|
1163 | TotalInvfTime.RealTime = TotalInvfTime.CPUTime = 0;
|
---|
1164 | TotalTextTime.RealTime = TotalTextTime.CPUTime = 0;
|
---|
1165 |
|
---|
1166 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
1167 | #ifdef __WIN32__
|
---|
1168 | qd = InitQuerySystem (GetDefEnv ("mgdir", ".\\"),
|
---|
1169 | GetDefEnv ("mgname", ""),
|
---|
1170 | GetDefEnv ("textname", NULL), /* [RJM 06/97: text filename] */
|
---|
1171 | &iqt);
|
---|
1172 | #else
|
---|
1173 | qd = InitQuerySystem (GetDefEnv ("mgdir", "./"),
|
---|
1174 | GetDefEnv ("mgname", ""),
|
---|
1175 | GetDefEnv ("textname", NULL), /* [RJM 06/97: text filename] */
|
---|
1176 | &iqt);
|
---|
1177 | #endif
|
---|
1178 |
|
---|
1179 | if (!qd)
|
---|
1180 | FatalError (1, mg_errorstrs[mg_errno], mg_error_data);
|
---|
1181 | start_up_stats (qd, iqt);
|
---|
1182 |
|
---|
1183 |
|
---|
1184 | while (1)
|
---|
1185 | {
|
---|
1186 | ProgTime StartTime, InvfTime, TextTime;
|
---|
1187 | char QueryType;
|
---|
1188 | char OutputType;
|
---|
1189 | char *line;
|
---|
1190 | ResetFileStats (qd);
|
---|
1191 | qd->max_mem_in_use = qd->mem_in_use = 0;
|
---|
1192 |
|
---|
1193 | qd->tot_hops_taken += qd->hops_taken;
|
---|
1194 | qd->tot_num_of_ptrs += qd->num_of_ptrs;
|
---|
1195 | qd->tot_num_of_accum += qd->num_of_accum;
|
---|
1196 | qd->tot_num_of_terms += qd->num_of_terms;
|
---|
1197 | qd->tot_num_of_ans += qd->num_of_ans;
|
---|
1198 | qd->tot_text_idx_lookups += qd->text_idx_lookups;
|
---|
1199 | qd->hops_taken = qd->num_of_ptrs = 0;
|
---|
1200 | qd->num_of_accum = qd->num_of_ans = qd->num_of_terms = 0;
|
---|
1201 | qd->text_idx_lookups = 0;
|
---|
1202 |
|
---|
1203 | Display_Stats (stderr);
|
---|
1204 | Clear_Stats ();
|
---|
1205 | line = get_query (qd);
|
---|
1206 | if (!line || Quitting)
|
---|
1207 | break;
|
---|
1208 |
|
---|
1209 | GetPostProc (line);
|
---|
1210 |
|
---|
1211 | GetTime (&StartTime);
|
---|
1212 |
|
---|
1213 | FreeQueryDocs (qd);
|
---|
1214 |
|
---|
1215 | QueryType = get_query_type ();
|
---|
1216 | OutputType = get_output_type ();
|
---|
1217 | /* No point in hiliting words on a docnum query */
|
---|
1218 | if (OutputType == OUTPUT_HILITE && QueryType == QUERY_DOCNUMS)
|
---|
1219 | OutputType = OUTPUT_TEXT;
|
---|
1220 |
|
---|
1221 | switch (QueryType)
|
---|
1222 | {
|
---|
1223 | case QUERY_BOOLEAN:
|
---|
1224 | {
|
---|
1225 | char *maxdocs;
|
---|
1226 | BooleanQueryInfo bqi;
|
---|
1227 | maxdocs = GetDefEnv ("maxdocs", "all");
|
---|
1228 | bqi.MaxDocsToRetrieve = strcmp (maxdocs, "all") ? atoi (maxdocs) : -1;
|
---|
1229 | /* [RPAP - Jan 97: Stem Index Change] */
|
---|
1230 | if (qd->sd->sdh.indexed)
|
---|
1231 | BooleanQuery (qd, line, &bqi, (BooleanEnv (GetEnv ("casefold"), 0) |
|
---|
1232 | (BooleanEnv (GetEnv ("stem"), 0) << 1)));
|
---|
1233 | else
|
---|
1234 | BooleanQuery (qd, line, &bqi, qd->sd->sdh.stem_method);
|
---|
1235 |
|
---|
1236 | break;
|
---|
1237 | }
|
---|
1238 | case QUERY_APPROX:
|
---|
1239 | case QUERY_RANKED:
|
---|
1240 | {
|
---|
1241 | char *maxdocs;
|
---|
1242 | char *maxterms;
|
---|
1243 | char *maxaccum;
|
---|
1244 | RankedQueryInfo rqi;
|
---|
1245 | maxdocs = GetDefEnv ("maxdocs", "all");
|
---|
1246 | maxterms = GetDefEnv ("max_terms", "all");
|
---|
1247 | maxaccum = GetDefEnv ("max_accumulators", "all");
|
---|
1248 | rqi.Sort = BooleanEnv (GetEnv ("sorted_terms"), 0);
|
---|
1249 | rqi.QueryFreqs = BooleanEnv (GetEnv ("qfreq"), 1);
|
---|
1250 | rqi.Exact = QueryType == QUERY_RANKED;
|
---|
1251 | rqi.MaxDocsToRetrieve = strcmp (maxdocs, "all") ? atoi (maxdocs) : -1;
|
---|
1252 | rqi.MaxTerms = strcmp (maxterms, "all") ? atoi (maxterms) : -1;
|
---|
1253 | rqi.MaxParasToRetrieve = rqi.MaxDocsToRetrieve;
|
---|
1254 | if (qd->id->ifh.InvfLevel == 3 && GetEnv ("maxparas"))
|
---|
1255 | rqi.MaxParasToRetrieve = atoi (GetEnv ("maxparas"));
|
---|
1256 | rqi.AccumMethod = toupper (*GetDefEnv ("accumulator_method", "A"));
|
---|
1257 | rqi.MaxAccums = strcmp (maxaccum, "all") ? atoi (maxaccum) : -1;
|
---|
1258 | rqi.HashTblSize = IntEnv (GetEnv ("hash_tbl_size"), 1000);
|
---|
1259 | rqi.StopAtMaxAccum = BooleanEnv (GetEnv ("stop_at_max_accum"), 0);
|
---|
1260 | rqi.skip_dump = GetEnv ("skip_dump");
|
---|
1261 | RankedQuery (qd, line, &rqi);
|
---|
1262 | break;
|
---|
1263 | }
|
---|
1264 | case QUERY_DOCNUMS:
|
---|
1265 | {
|
---|
1266 | DocnumsQuery (qd, line);
|
---|
1267 | break;
|
---|
1268 | }
|
---|
1269 | }
|
---|
1270 |
|
---|
1271 | GetTime (&InvfTime);
|
---|
1272 |
|
---|
1273 | /* [RPAP - Feb 97: Term Frequency] */
|
---|
1274 | if (qd->QTL && BooleanEnv (GetEnv ("term_freq"), 0))
|
---|
1275 | PrintQueryTermFreqs (qd->QTL);
|
---|
1276 |
|
---|
1277 | if (qd->DL)
|
---|
1278 | MoreDocs (qd, line, OutputType);
|
---|
1279 |
|
---|
1280 | GetTime (&TextTime);
|
---|
1281 |
|
---|
1282 | if (BooleanEnv (GetEnv ("timestats"), 0) ||
|
---|
1283 | BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
1284 | QueryTimeStats (&StartTime, &InvfTime, &TextTime);
|
---|
1285 |
|
---|
1286 | if (BooleanEnv (GetEnv ("diskstats"), 0) ||
|
---|
1287 | BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
1288 | File_Stats (qd);
|
---|
1289 |
|
---|
1290 | if (BooleanEnv (GetEnv ("memstats"), 0) ||
|
---|
1291 | BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
1292 | MemStats (qd);
|
---|
1293 |
|
---|
1294 | if (BooleanEnv (GetEnv ("sizestats"), 0))
|
---|
1295 | SizeStats (qd);
|
---|
1296 |
|
---|
1297 | TotalInvfTime.RealTime += InvfTime.RealTime - StartTime.RealTime;
|
---|
1298 | TotalInvfTime.CPUTime += InvfTime.CPUTime - StartTime.CPUTime;
|
---|
1299 | TotalTextTime.RealTime += TextTime.RealTime - StartTime.RealTime;
|
---|
1300 | TotalTextTime.CPUTime += TextTime.CPUTime - StartTime.CPUTime;
|
---|
1301 | }
|
---|
1302 |
|
---|
1303 | if (isatty (fileno (InFile)) && !Quitting)
|
---|
1304 | fprintf (stderr, "\n");
|
---|
1305 |
|
---|
1306 | shut_down_stats (qd, &TotalStartTime, &TotalInvfTime, &TotalTextTime);
|
---|
1307 |
|
---|
1308 | Display_Stats (stderr);
|
---|
1309 |
|
---|
1310 | }
|
---|
1311 |
|
---|
1312 |
|
---|
1313 | void
|
---|
1314 | search_for_collection (char *name)
|
---|
1315 | {
|
---|
1316 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
1317 | #ifdef __WIN32__
|
---|
1318 | char *dir = GetDefEnv ("mgdir", ".\\");
|
---|
1319 | #else
|
---|
1320 | char *dir = GetDefEnv ("mgdir", "./");
|
---|
1321 | #endif
|
---|
1322 | char buffer[512];
|
---|
1323 | struct stat stat_buf;
|
---|
1324 | if (strrchr (dir, '/') && *(strrchr (dir, '/') + 1) != '\0')
|
---|
1325 | {
|
---|
1326 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
1327 | #ifdef __WIN32__
|
---|
1328 | sprintf (buffer, "%s", dir);
|
---|
1329 | #else
|
---|
1330 | sprintf (buffer, "%s/", dir);
|
---|
1331 | #endif
|
---|
1332 | SetEnv ("mgdir", buffer, NULL);
|
---|
1333 | dir = GetEnv ("mgdir");
|
---|
1334 | }
|
---|
1335 |
|
---|
1336 | sprintf (buffer, "%s.text", name);
|
---|
1337 | if (stat (buffer, &stat_buf) != -1)
|
---|
1338 | {
|
---|
1339 | if ((stat_buf.st_mode & S_IFREG) != 0)
|
---|
1340 | {
|
---|
1341 | /* The name is a directory */
|
---|
1342 | SetEnv ("mgname", name, NULL);
|
---|
1343 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
1344 | #ifdef __WIN32__
|
---|
1345 | SetEnv ("mgdir", ".\\", NULL);
|
---|
1346 | #else
|
---|
1347 | SetEnv ("mgdir", "./", NULL);
|
---|
1348 | #endif
|
---|
1349 | return;
|
---|
1350 | }
|
---|
1351 | }
|
---|
1352 |
|
---|
1353 | sprintf (buffer, "%s%s", dir, name);
|
---|
1354 | if (stat (buffer, &stat_buf) != -1)
|
---|
1355 | {
|
---|
1356 | if ((stat_buf.st_mode & S_IFDIR) != 0)
|
---|
1357 | {
|
---|
1358 | /* The name is a directory */
|
---|
1359 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
1360 | #ifdef __WIN32__
|
---|
1361 | sprintf (buffer, "%s%s", name, name);
|
---|
1362 | #else
|
---|
1363 | sprintf (buffer, "%s/%s", name, name);
|
---|
1364 | #endif
|
---|
1365 | SetEnv ("mgname", buffer, NULL);
|
---|
1366 | return;
|
---|
1367 | }
|
---|
1368 | }
|
---|
1369 |
|
---|
1370 | /* Look in the current directory last */
|
---|
1371 | if (stat (name, &stat_buf) != -1)
|
---|
1372 | {
|
---|
1373 | if ((stat_buf.st_mode & S_IFDIR) != 0)
|
---|
1374 | {
|
---|
1375 | /* The name is a directory */
|
---|
1376 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
1377 | #ifdef __WIN32__
|
---|
1378 | sprintf (buffer, "%s%s", name, name);
|
---|
1379 | SetEnv ("mgdir", ".\\", NULL);
|
---|
1380 | #else
|
---|
1381 | sprintf (buffer, "%s/%s", name, name);
|
---|
1382 | SetEnv ("mgdir", "./", NULL);
|
---|
1383 | #endif
|
---|
1384 | SetEnv ("mgname", buffer, NULL);
|
---|
1385 | return;
|
---|
1386 | }
|
---|
1387 | }
|
---|
1388 |
|
---|
1389 | SetEnv ("mgname", name, NULL);
|
---|
1390 | }
|
---|
1391 |
|
---|
1392 | /* main () */
|
---|
1393 | /* Initialises global variables based on command line switches, and opens */
|
---|
1394 | /* files. Then calls query () to perform the querying. */
|
---|
1395 | int main (int argc, char **argv)
|
---|
1396 | {
|
---|
1397 | ProgTime StartTime;
|
---|
1398 | int decomp = 0;
|
---|
1399 | int ch;
|
---|
1400 |
|
---|
1401 | msg_prefix = argv[0];
|
---|
1402 | GetTime (&StartTime);
|
---|
1403 |
|
---|
1404 | /* Initialise the environment with default values */
|
---|
1405 |
|
---|
1406 | InitEnv ();
|
---|
1407 |
|
---|
1408 | read_mgrc_file ();
|
---|
1409 |
|
---|
1410 | OutFile = stdout;
|
---|
1411 | InFile = stdin;
|
---|
1412 |
|
---|
1413 | opterr = 0;
|
---|
1414 | /* [RJM 06/97: text filename] */
|
---|
1415 | while ((ch = getopt (argc, argv, "Df:d:t:h")) != -1) {
|
---|
1416 | switch (ch) {
|
---|
1417 | case 'f':
|
---|
1418 | SetEnv ("mgname", optarg, NULL);
|
---|
1419 | break;
|
---|
1420 | case 'd':
|
---|
1421 | SetEnv ("mgdir", optarg, NULL);
|
---|
1422 | break;
|
---|
1423 | case 't': /* [RJM 06/97: text filename] */
|
---|
1424 | SetEnv ("textname", optarg, NULL);
|
---|
1425 | break;
|
---|
1426 | case 'D':
|
---|
1427 | decomp = 1;
|
---|
1428 | break;
|
---|
1429 | case 'h':
|
---|
1430 | case '?':
|
---|
1431 | fprintf (stderr, "usage: %s [-D] [-f base name of collection] "
|
---|
1432 | "[-t base name of files for text] " /* [RJM 06/97: text filename] */
|
---|
1433 | "[-d data directory] [collection]\n", argv[0]);
|
---|
1434 | exit (1);
|
---|
1435 | }
|
---|
1436 | }
|
---|
1437 |
|
---|
1438 | PushEnv ();
|
---|
1439 |
|
---|
1440 | if (decomp == 0)
|
---|
1441 | {
|
---|
1442 |
|
---|
1443 | Init_ReadLine ();
|
---|
1444 |
|
---|
1445 | /* write a first prompt, let the user start thinking */
|
---|
1446 | if (!BooleanEnv (GetEnv ("expert"), 0) && isatty (fileno (InFile)))
|
---|
1447 | {
|
---|
1448 | fprintf (stderr, "\n\n\t FULL TEXT RETRIEVAL QUERY PROGRAM\n");
|
---|
1449 | fprintf (stderr, "%24s%s\n\n", "", *"21 Mar 1994" == '%' ? __DATE__ : "21 Mar 1994");
|
---|
1450 | fprintf (stderr, "\n");
|
---|
1451 | fprintf (stderr, " mgquery version " VERSION ", Copyright (C) 1994 Neil Sharman\n");
|
---|
1452 | fprintf (stderr, " mgquery comes with ABSOLUTELY NO WARRANTY; for details type `.warranty'\n");
|
---|
1453 | fprintf (stderr, " This is free software, and you are welcome to redistribute it\n");
|
---|
1454 | fprintf (stderr, " under certain conditions; type `.conditions' for details.\n");
|
---|
1455 | fprintf (stderr, "\n");
|
---|
1456 | }
|
---|
1457 | }
|
---|
1458 | if (optind < argc)
|
---|
1459 | search_for_collection (argv[optind]);
|
---|
1460 |
|
---|
1461 | if (decomp == 0)
|
---|
1462 | {
|
---|
1463 | query ();
|
---|
1464 | }
|
---|
1465 | else
|
---|
1466 | {
|
---|
1467 | int i;
|
---|
1468 | InitQueryTimes iqt;
|
---|
1469 | query_data *qd;
|
---|
1470 |
|
---|
1471 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
1472 | #ifdef __WIN32__
|
---|
1473 | qd = InitQuerySystem (GetDefEnv ("mgdir", ".\\"),
|
---|
1474 | GetDefEnv ("mgname", ""),
|
---|
1475 | GetDefEnv ("textname", NULL), /* [RJM 06/97: text filename] */
|
---|
1476 | &iqt);
|
---|
1477 | #else
|
---|
1478 | qd = InitQuerySystem (GetDefEnv ("mgdir", "./"),
|
---|
1479 | GetDefEnv ("mgname", ""),
|
---|
1480 | GetDefEnv ("textname", NULL), /* [RJM 06/97: text filename] */
|
---|
1481 | &iqt);
|
---|
1482 | #endif
|
---|
1483 | if (!qd)
|
---|
1484 | FatalError (1, mg_errorstrs[mg_errno], mg_error_data);
|
---|
1485 |
|
---|
1486 |
|
---|
1487 | start_up_stats (qd, iqt);
|
---|
1488 |
|
---|
1489 | Display_Stats (stderr);
|
---|
1490 | for (i = 0; i < qd->td->cth.num_of_docs; i++)
|
---|
1491 | {
|
---|
1492 | RawDocOutput (qd, i + 1, stdout);
|
---|
1493 | putc ('\2', stdout);
|
---|
1494 | }
|
---|
1495 | Message ("%s", ElapsedTime (&StartTime, NULL));
|
---|
1496 |
|
---|
1497 | FinishQuerySystem (qd);
|
---|
1498 | }
|
---|
1499 |
|
---|
1500 | UninitEnv ();
|
---|
1501 | return 0;
|
---|
1502 | }
|
---|