1 | /**************************************************************************
|
---|
2 | *
|
---|
3 | * mgquery.c -- The M G Q U E R Y program
|
---|
4 | * Copyright (C) 1994 Neil Sharman
|
---|
5 | *
|
---|
6 | * This program is free software; you can redistribute it and/or modify
|
---|
7 | * it under the terms of the GNU General Public License as published by
|
---|
8 | * the Free Software Foundation; either version 2 of the License, or
|
---|
9 | * (at your option) any later version.
|
---|
10 | *
|
---|
11 | * This program is distributed in the hope that it will be useful,
|
---|
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
14 | * GNU General Public License for more details.
|
---|
15 | *
|
---|
16 | * You should have received a copy of the GNU General Public License
|
---|
17 | * along with this program; if not, write to the Free Software
|
---|
18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
19 | *
|
---|
20 | * $Id: mgquery.c 3745 2003-02-20 21:20:24Z mdewsnip $
|
---|
21 | *
|
---|
22 | **************************************************************************/
|
---|
23 |
|
---|
24 | /*
|
---|
25 | $Log$
|
---|
26 | Revision 1.1 2003/02/20 21:18:24 mdewsnip
|
---|
27 | Addition of MG package for search and retrieval
|
---|
28 |
|
---|
29 | Revision 1.1 1999/08/10 21:18:18 sjboddie
|
---|
30 | renamed mg-1.3d directory mg
|
---|
31 |
|
---|
32 | Revision 1.3 1999/01/08 00:33:46 rjmcnab
|
---|
33 |
|
---|
34 | Enabled mg and the library software to read in more than one index
|
---|
35 | at a time.
|
---|
36 |
|
---|
37 | Revision 1.2 1998/11/25 07:55:49 rjmcnab
|
---|
38 |
|
---|
39 | Modified mg to that you can specify the stemmer you want
|
---|
40 | to use via a command line option. You specify it to
|
---|
41 | mg_passes during the build process. The number of the
|
---|
42 | stemmer that you used is stored within the inverted
|
---|
43 | dictionary header and the stemmed dictionary header so
|
---|
44 | the correct stemmer is used in later stages of building
|
---|
45 | and querying.
|
---|
46 |
|
---|
47 | Revision 1.1 1998/11/17 09:35:29 rjmcnab
|
---|
48 | *** empty log message ***
|
---|
49 |
|
---|
50 | * Revision 1.3 1994/10/20 03:57:02 tes
|
---|
51 | * I have rewritten the boolean query optimiser and abstracted out the
|
---|
52 | * components of the boolean query.
|
---|
53 | *
|
---|
54 | * Revision 1.2 1994/09/20 04:41:58 tes
|
---|
55 | * For version 1.1
|
---|
56 | *
|
---|
57 | */
|
---|
58 |
|
---|
59 | static char *RCSID = "$Id: mgquery.c 3745 2003-02-20 21:20:24Z mdewsnip $";
|
---|
60 |
|
---|
61 | #include "sysfuncs.h"
|
---|
62 |
|
---|
63 | #if defined(HAVE_SYS_PROCFS_H) && defined(HAVE_PR_BRKSIZE) && \
|
---|
64 | (__STDC__ == 0)
|
---|
65 | /* STDC test was included to allow cc -Xc on SunOS 5 to work */
|
---|
66 | #define USE_PROCESS_MEM
|
---|
67 | #endif
|
---|
68 |
|
---|
69 | #ifdef USE_PROCESS_MEM
|
---|
70 | # include <sys/procfs.h>
|
---|
71 | #endif
|
---|
72 |
|
---|
73 | #ifdef HAVE_GETRUSAGE
|
---|
74 | # ifdef HAVE_SYS_TIME_H
|
---|
75 | # include <sys/time.h>
|
---|
76 | # endif
|
---|
77 | # include <sys/resource.h>
|
---|
78 | #endif
|
---|
79 |
|
---|
80 | #ifndef HAVE_GETPAGESIZE
|
---|
81 | # include "getpagesize.h"
|
---|
82 | #endif
|
---|
83 |
|
---|
84 | #if WITH_REGEX
|
---|
85 | # include <regex.h>
|
---|
86 | #else
|
---|
87 | # include <rx.h>
|
---|
88 | #endif
|
---|
89 |
|
---|
90 |
|
---|
91 | #include <stdarg.h>
|
---|
92 | #include <signal.h>
|
---|
93 |
|
---|
94 | #include "messages.h"
|
---|
95 | #include "timing.h"
|
---|
96 | #include "memlib.h"
|
---|
97 | #include "local_strings.h" /* [RPAP - Feb 97: Term Frequency] */
|
---|
98 |
|
---|
99 | #include "filestats.h"
|
---|
100 | #include "invf.h"
|
---|
101 | #include "text.h"
|
---|
102 | #include "mg.h"
|
---|
103 | #include "lists.h"
|
---|
104 | #include "backend.h"
|
---|
105 | #include "environment.h"
|
---|
106 | #include "globals.h"
|
---|
107 | #include "read_line.h"
|
---|
108 | #include "mg_errors.h"
|
---|
109 | #include "commands.h"
|
---|
110 | #include "text_get.h"
|
---|
111 | #include "term_lists.h"
|
---|
112 | #include "query_term_list.h"
|
---|
113 |
|
---|
114 |
|
---|
/* Streams used by the query loop.  get_query () resets InFile to stdin once
   the current input is exhausted. */
FILE *OutFile = NULL;
FILE *InFile = NULL;

/* InPipe makes get_query () close InFile with pclose () instead of
   fclose ().  OutPipe is presumably the matching flag for OutFile. */
int OutPipe = 0;
int InPipe = 0;

/* When non-zero, get_query () stops waiting for a non-empty query. */
int Quitting = 0;
|
---|
118 |
|
---|
119 | /* [RPAP - Feb 97: NZDL Additions] */
|
---|
120 | #if defined(PARADOCNUM) || defined(NZDL)
|
---|
121 | int GetDocNumFromParaNum(query_data *qd, int paranum) {
|
---|
122 | int Documents = qd->td->cth.num_of_docs;
|
---|
123 | int *Paragraph = qd->paragraph;
|
---|
124 | int low = 1, high = Documents;
|
---|
125 | int mid = (low+high)/2;
|
---|
126 |
|
---|
127 | while ((mid = (low+high)/2) >=1 && mid <= Documents)
|
---|
128 | {
|
---|
129 | if (paranum > Paragraph[mid])
|
---|
130 | low = mid+1;
|
---|
131 | else if (paranum <= Paragraph[mid-1])
|
---|
132 | high = mid-1;
|
---|
133 | else
|
---|
134 | return mid;
|
---|
135 | }
|
---|
136 | FatalError(1, "Bad paragraph number.\n");
|
---|
137 | }
|
---|
138 | #endif
|
---|
139 |
|
---|
140 | #ifdef TREC_MODE
|
---|
/* TREC identifier table; ProcessDocs () prints 14-character entries from it. */
char *trec_ids = NULL;

/* Optional map consulted by ProcessDocs () to turn a paragraph index into
   the index used with trec_ids. */
long *trec_paras = NULL;
|
---|
143 | #endif
|
---|
144 |
|
---|
/* Set by SIGINT_handler (); ProcessDocs () stops iterating once it is set. */
static volatile int Ctrl_C = 0;

/* Cleared by SIGPIPE_handler (); output in ProcessDocs () is gated on it. */
static volatile int PagerRunning = 0;
|
---|
147 |
|
---|
148 |
|
---|
149 | /*****************************************************************************/
|
---|
150 |
|
---|
/* Category of a recorded statistic.  Display_Stats () uses the numeric
   value as an index into its table of headings, so the order matters. */
typedef enum
  {
    S_Time,
    S_Mem,
    S_Size,
    S_File
  }
S_Type;

/* One recorded statistic: its category, a label, and the formatted value.
   Both strings are heap copies made by Add_Stats (). */
struct Stat
  {
    S_Type typ;
    char *name;
    char *text;
  };

/* Growable array of statistics and the number of entries in it. */
static struct Stat *Stats = NULL;
static int NumStats = 0;
|
---|
165 |
|
---|
166 | static void
|
---|
167 | Clear_Stats (void)
|
---|
168 | {
|
---|
169 | if (Stats)
|
---|
170 | {
|
---|
171 | int i;
|
---|
172 | for (i = 0; i < NumStats; i++)
|
---|
173 | {
|
---|
174 | if (Stats[i].name)
|
---|
175 | Xfree (Stats[i].name);
|
---|
176 | if (Stats[i].text)
|
---|
177 | Xfree (Stats[i].text);
|
---|
178 | }
|
---|
179 | Xfree (Stats);
|
---|
180 | Stats = NULL;
|
---|
181 | NumStats = 0;
|
---|
182 | }
|
---|
183 | }
|
---|
184 |
|
---|
185 | static void
|
---|
186 | Add_Stats (S_Type typ, char *name, char *fmt,...)
|
---|
187 | {
|
---|
188 | char buf[1024];
|
---|
189 | va_list args;
|
---|
190 | va_start (args, fmt);
|
---|
191 | vsprintf (buf, fmt, args);
|
---|
192 | if (Stats)
|
---|
193 | Stats = Xrealloc (Stats, (++NumStats) * sizeof (*Stats));
|
---|
194 | else
|
---|
195 | Stats = Xmalloc ((++NumStats) * sizeof (*Stats));
|
---|
196 | Stats[NumStats - 1].typ = typ;
|
---|
197 | Stats[NumStats - 1].name = Xstrdup (name);
|
---|
198 | Stats[NumStats - 1].text = Xstrdup (buf);
|
---|
199 | }
|
---|
200 |
|
---|
201 | static void
|
---|
202 | Display_Stats (FILE * f)
|
---|
203 | {
|
---|
204 | static char *sep = "-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
|
---|
205 | "-=-=-=-=-=-=-=-=-=-=-";
|
---|
206 | char *names[] =
|
---|
207 | {"Time: ", "Memory:", "Sizes: ", "Disk: ", " "};
|
---|
208 | int i, last_typ = -1;
|
---|
209 | size_t len = 0;
|
---|
210 | if (NumStats == 0)
|
---|
211 | return;
|
---|
212 | fprintf (f, "%s\n", sep);
|
---|
213 | for (i = 0; i < NumStats; i++)
|
---|
214 | if (strlen (Stats[i].name) > len)
|
---|
215 | len = strlen (Stats[i].name);
|
---|
216 | for (i = 0; i < NumStats; i++)
|
---|
217 | {
|
---|
218 | int typ = 4;
|
---|
219 | if (Stats[i].typ != last_typ)
|
---|
220 | typ = last_typ = Stats[i].typ;
|
---|
221 | fprintf (f, "%s %-*s %s\n", names[typ], (int) len, Stats[i].name, Stats[i].text);
|
---|
222 | }
|
---|
223 | fprintf (f, "%s\n", sep);
|
---|
224 | }
|
---|
225 |
|
---|
226 | /*****************************************************************************/
|
---|
227 |
|
---|
228 |
|
---|
229 | static void
|
---|
230 | QueryTimeStats (ProgTime * Start, ProgTime * invf, ProgTime * text)
|
---|
231 | {
|
---|
232 | if (!BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
233 | {
|
---|
234 | Add_Stats (S_Time, "invf", ElapsedTime (Start, invf));
|
---|
235 | Add_Stats (S_Time, "text", ElapsedTime (invf, text));
|
---|
236 | }
|
---|
237 | Add_Stats (S_Time, "total", ElapsedTime (Start, text));
|
---|
238 | }
|
---|
239 |
|
---|
240 | static void
|
---|
241 | StartUpTimeStats (InitQueryTimes * iqt)
|
---|
242 | {
|
---|
243 | if (!BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
244 | {
|
---|
245 | Add_Stats (S_Time, "dict [stem]", ElapsedTime (&iqt->Start,
|
---|
246 | &iqt->StemDict));
|
---|
247 | Add_Stats (S_Time, "weights", ElapsedTime (&iqt->StemDict,
|
---|
248 | &iqt->ApproxWeights));
|
---|
249 | Add_Stats (S_Time, "dict [text]", ElapsedTime (&iqt->ApproxWeights,
|
---|
250 | &iqt->CompDict));
|
---|
251 | Add_Stats (S_Time, "Inverted", ElapsedTime (&iqt->CompDict,
|
---|
252 | &iqt->Invf));
|
---|
253 | Add_Stats (S_Time, "Compressed", ElapsedTime (&iqt->Invf,
|
---|
254 | &iqt->Text));
|
---|
255 | }
|
---|
256 | Add_Stats (S_Time, "total", ElapsedTime (&iqt->Start, &iqt->Text));
|
---|
257 | }
|
---|
258 |
|
---|
259 |
|
---|
260 |
|
---|
261 |
|
---|
262 | #ifdef USE_PROCESS_MEM
|
---|
263 | static u_long
|
---|
264 | process_mem (void)
|
---|
265 | {
|
---|
266 | prstatus_t pr;
|
---|
267 | static int fd = -1;
|
---|
268 | if (fd == -1)
|
---|
269 | {
|
---|
270 | char buf[128];
|
---|
271 | sprintf (buf, "/proc/%ld", (long) getpid ());
|
---|
272 | fd = open (buf, O_RDONLY);
|
---|
273 | }
|
---|
274 | if (fd == -1 || ioctl (fd, PIOCSTATUS, &pr) == -1)
|
---|
275 | return 0;
|
---|
276 | return pr.pr_brksize;
|
---|
277 | }
|
---|
278 | #endif
|
---|
279 |
|
---|
280 |
|
---|
281 |
|
---|
282 |
|
---|
283 | static void
|
---|
284 | MemStats (query_data * qd)
|
---|
285 | {
|
---|
286 | if (!BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
287 | {
|
---|
288 | #ifdef HAVE_GETRUSAGE
|
---|
289 | struct rusage rusage;
|
---|
290 | getrusage (RUSAGE_SELF, &rusage);
|
---|
291 |
|
---|
292 | Add_Stats (S_Mem, "process mem", "%7.3f Mb",
|
---|
293 | (double) (rusage.ru_maxrss * getpagesize () / 1024.0 / 1024.0));
|
---|
294 | #endif
|
---|
295 | #ifdef USE_PROCESS_MEM
|
---|
296 | Add_Stats (S_Mem, "process mem", "%7.3f Mb",
|
---|
297 | (double) (process_mem () / 1024.0 / 1024.0));
|
---|
298 | #endif
|
---|
299 | Add_Stats (S_Mem, "dict [stem]", "%7.1f kB",
|
---|
300 | (double) qd->sd->MemForStemDict / 1024);
|
---|
301 | Add_Stats (S_Mem, "dict [text]", "%7.1f kB",
|
---|
302 | (double) qd->cd->MemForCompDict / 1024);
|
---|
303 | if (qd->awd)
|
---|
304 | Add_Stats (S_Mem, "weights", "%7.1f kB",
|
---|
305 | (double) qd->awd->MemForWeights / 1024);
|
---|
306 | }
|
---|
307 | if (qd->awd)
|
---|
308 | Add_Stats (S_Mem, "total [peak]", "%7.1f kB",
|
---|
309 | (double) (qd->max_mem_in_use + qd->sd->MemForStemDict +
|
---|
310 | qd->cd->MemForCompDict + qd->awd->MemForWeights) / 1024);
|
---|
311 | else
|
---|
312 | Add_Stats (S_Mem, "total [peak]", "%7.1f kB",
|
---|
313 | (double) (qd->max_mem_in_use + qd->sd->MemForStemDict +
|
---|
314 | qd->cd->MemForCompDict) / 1024);
|
---|
315 |
|
---|
316 | }
|
---|
317 |
|
---|
318 |
|
---|
319 |
|
---|
320 | static void
|
---|
321 | SizeStats (query_data * qd)
|
---|
322 | {
|
---|
323 | Add_Stats (S_Size, "skips", "%7d", qd->hops_taken);
|
---|
324 | Add_Stats (S_Size, "pointers", "%7d", qd->num_of_ptrs);
|
---|
325 | Add_Stats (S_Size, "accumulators", "%7d", qd->num_of_accum);
|
---|
326 | Add_Stats (S_Size, "terms", "%7d", qd->num_of_terms);
|
---|
327 | Add_Stats (S_Size, "answers", "%7d", qd->num_of_ans);
|
---|
328 | Add_Stats (S_Size, "index lookups", "%7d", qd->text_idx_lookups);
|
---|
329 | }
|
---|
330 |
|
---|
331 | static void
|
---|
332 | TotalSizeStats (query_data * qd)
|
---|
333 | {
|
---|
334 | Add_Stats (S_Size, "skips", "%7d", qd->tot_hops_taken);
|
---|
335 | Add_Stats (S_Size, "pointers", "%7d", qd->tot_num_of_ptrs);
|
---|
336 | Add_Stats (S_Size, "accumulators", "%7d", qd->tot_num_of_accum);
|
---|
337 | Add_Stats (S_Size, "terms", "%7d", qd->tot_num_of_terms);
|
---|
338 | Add_Stats (S_Size, "answers", "%7d", qd->tot_num_of_ans);
|
---|
339 | Add_Stats (S_Size, "index lookups", "%7d", qd->tot_text_idx_lookups);
|
---|
340 | }
|
---|
341 |
|
---|
342 |
|
---|
343 | static void
|
---|
344 | StatFile (File * F)
|
---|
345 | {
|
---|
346 | static unsigned long NumBytes = 0, NumSeeks = 0, NumReads = 0;
|
---|
347 | if (F)
|
---|
348 | if ((int) F != -1)
|
---|
349 | {
|
---|
350 | if (!BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
351 | Add_Stats (S_File, F->name, "%7.1f kB (%3d seeks, %7d reads)",
|
---|
352 | (double) F->Current.NumBytes / 1024, F->Current.NumSeeks,
|
---|
353 | F->Current.NumReads);
|
---|
354 | NumBytes += F->Current.NumBytes;
|
---|
355 | NumSeeks += F->Current.NumSeeks;
|
---|
356 | NumReads += F->Current.NumReads;
|
---|
357 | }
|
---|
358 | else
|
---|
359 | {
|
---|
360 | Add_Stats (S_File, "total", "%7.1f kB (%3d seeks, %7d reads)",
|
---|
361 | (double) NumBytes / 1024, NumSeeks, NumReads);
|
---|
362 | NumSeeks = NumReads = NumBytes = 0;
|
---|
363 | }
|
---|
364 |
|
---|
365 | }
|
---|
366 |
|
---|
367 |
|
---|
368 | static void
|
---|
369 | File_Stats (query_data * qd)
|
---|
370 | {
|
---|
371 | StatFile (qd->File_comp_dict);
|
---|
372 | StatFile (qd->File_fast_comp_dict);
|
---|
373 | StatFile (qd->File_text_idx_wgt);
|
---|
374 | StatFile (qd->File_text);
|
---|
375 | StatFile (qd->File_stem);
|
---|
376 |
|
---|
377 | /* [RPAP - Jan 97: Stem Index Change] */
|
---|
378 | if (qd->sd->sdh.indexed)
|
---|
379 | {
|
---|
380 | StatFile (qd->File_stem1);
|
---|
381 | StatFile (qd->File_stem2);
|
---|
382 | StatFile (qd->File_stem3);
|
---|
383 | }
|
---|
384 |
|
---|
385 | StatFile (qd->File_invf);
|
---|
386 | StatFile (qd->File_weight_approx);
|
---|
387 | StatFile (qd->File_text_idx);
|
---|
388 | StatFile ((File *) (-1));
|
---|
389 | }
|
---|
390 |
|
---|
391 |
|
---|
392 | char *
|
---|
393 | get_query (query_data * qd)
|
---|
394 | {
|
---|
395 | char *line, *LinePtr;
|
---|
396 | WritePrompt ();
|
---|
397 | do
|
---|
398 | {
|
---|
399 | do
|
---|
400 | {
|
---|
401 | line = GetMultiLine ();
|
---|
402 | if (line == NULL)
|
---|
403 | {
|
---|
404 | if (stdin == InFile)
|
---|
405 | return (NULL); /* EOF */
|
---|
406 | if (InPipe)
|
---|
407 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
408 | #ifdef __WIN32__
|
---|
409 | _pclose (InFile);
|
---|
410 | #else
|
---|
411 | pclose (InFile);
|
---|
412 | #endif
|
---|
413 | else
|
---|
414 | fclose (InFile);
|
---|
415 | InPipe = 0;
|
---|
416 | InFile = stdin;
|
---|
417 | }
|
---|
418 | }
|
---|
419 | while (line == NULL);
|
---|
420 | LinePtr = ProcessCommands (line, qd);
|
---|
421 | if (CommandsErrorStr)
|
---|
422 | fprintf (stderr, "%s\n", CommandsErrorStr);
|
---|
423 | }
|
---|
424 | while (*LinePtr == '\0' && !Quitting);
|
---|
425 | return (LinePtr);
|
---|
426 | }
|
---|
427 |
|
---|
428 |
|
---|
429 | /* This is executed when a SIGPIPE is detected,
|
---|
430 | i.e. if someone quits out of the PAGER, this is executed. */
|
---|
431 | #ifdef HAVE_SIGCONTEXT
|
---|
432 | static RETSIGTYPE
|
---|
433 | SIGPIPE_handler (int sig, int code,
|
---|
434 | struct sigcontext *scp, char *addr)
|
---|
435 | #else
|
---|
436 | static RETSIGTYPE
|
---|
437 | SIGPIPE_handler (int sig)
|
---|
438 | #endif
|
---|
439 | {
|
---|
440 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
441 | #ifdef __WIN32__
|
---|
442 | signal (sig, SIG_IGN);
|
---|
443 | #else
|
---|
444 | signal (SIGPIPE, SIG_IGN);
|
---|
445 | #endif
|
---|
446 | PagerRunning = 0;
|
---|
447 | }
|
---|
448 |
|
---|
449 | /* This is executed when a SIGINT (i.e. CTRL-C) is detected */
|
---|
450 | #ifdef HAVE_SIGCONTEXT
|
---|
451 | static RETSIGTYPE
|
---|
452 | SIGINT_handler (int sig, int code,
|
---|
453 | struct sigcontext *scp, char *addr)
|
---|
454 | #else
|
---|
455 | static RETSIGTYPE
|
---|
456 | SIGINT_handler (int sig)
|
---|
457 | #endif
|
---|
458 | {
|
---|
459 | Ctrl_C = 1;
|
---|
460 | }
|
---|
461 |
|
---|
462 |
|
---|
463 |
|
---|
/* Pattern set by GetPostProc () and tested by PostProc (); NULL means no
   post-retrieval filtering. */
static char *post_proc = NULL;
|
---|
465 |
|
---|
466 |
|
---|
467 |
|
---|
468 | void
|
---|
469 | GetPostProc (char *line)
|
---|
470 | {
|
---|
471 | char *start, *finish;
|
---|
472 | if (post_proc)
|
---|
473 | {
|
---|
474 | Xfree (post_proc);
|
---|
475 | post_proc = NULL;
|
---|
476 | }
|
---|
477 | start = strchr (line, '\"');
|
---|
478 | finish = strrchr (line, '\"');
|
---|
479 | if (start != finish)
|
---|
480 | {
|
---|
481 | /* found a pattern */
|
---|
482 | *finish = '\0';
|
---|
483 | post_proc = Xstrdup (start + 1);
|
---|
484 | strcpy (start, finish + 1);
|
---|
485 | if (BooleanEnv (GetEnv ("verbatim"), 1) == 0)
|
---|
486 | {
|
---|
487 | char *s;
|
---|
488 | s = re_comp (post_proc);
|
---|
489 | if (!s)
|
---|
490 | {
|
---|
491 | Xfree (post_proc);
|
---|
492 | post_proc = NULL;
|
---|
493 | }
|
---|
494 | }
|
---|
495 | }
|
---|
496 | else if (start != NULL)
|
---|
497 | {
|
---|
498 | /* found a single speech mark. Delete It. */
|
---|
499 | strcpy (start, start + 1);
|
---|
500 | }
|
---|
501 | }
|
---|
502 |
|
---|
503 | int
|
---|
504 | PostProc (char *UDoc, int verbatim)
|
---|
505 | {
|
---|
506 | if (!post_proc)
|
---|
507 | return 1;
|
---|
508 |
|
---|
509 | if (verbatim) {
|
---|
510 | return (strstr (UDoc, post_proc) != NULL);
|
---|
511 | }
|
---|
512 | return re_exec ((char *) UDoc);
|
---|
513 | }
|
---|
514 |
|
---|
515 |
|
---|
516 |
|
---|
517 | static DocEntry *
|
---|
518 | in_chain (int para, int ip, DocEntry * dc)
|
---|
519 | {
|
---|
520 | while (dc)
|
---|
521 | {
|
---|
522 | if (dc->DocNum - ip == para)
|
---|
523 | return dc;
|
---|
524 | dc = dc->Next;
|
---|
525 | }
|
---|
526 | return NULL;
|
---|
527 | }
|
---|
528 |
|
---|
529 | /* num should be greater than or equal to 1 */
|
---|
530 | int
|
---|
531 | RawDocOutput (query_data * qd, u_long num, FILE * Output)
|
---|
532 | {
|
---|
533 | static last_pos = 0;
|
---|
534 | static u_char *c_buffer = 0;
|
---|
535 | static int buf_len = -1;
|
---|
536 | static u_char *uc_buffer = 0;
|
---|
537 | u_long pos, len;
|
---|
538 | int ULen;
|
---|
539 |
|
---|
540 | FetchDocStart (qd, num, &pos, &len);
|
---|
541 |
|
---|
542 | if ((int) len > buf_len)
|
---|
543 | {
|
---|
544 | if (c_buffer)
|
---|
545 | {
|
---|
546 | Xfree (c_buffer);
|
---|
547 | Xfree (uc_buffer);
|
---|
548 | }
|
---|
549 | if (!(c_buffer = Xmalloc (len)))
|
---|
550 | return -1;
|
---|
551 | if (!(uc_buffer = Xmalloc ((int) (qd->td->cth.ratio * 1.01 *
|
---|
552 | len) + 100)))
|
---|
553 | return -1;
|
---|
554 | buf_len = len;
|
---|
555 | }
|
---|
556 | if (last_pos != pos)
|
---|
557 | Fseek (qd->td->TextFile, pos, 0);
|
---|
558 | Fread (c_buffer, 1, len, qd->td->TextFile);
|
---|
559 | last_pos = pos + len;
|
---|
560 | DecodeText (qd->cd, c_buffer, len, uc_buffer, &ULen);
|
---|
561 | fwrite (uc_buffer, ULen, sizeof (u_char), Output);
|
---|
562 | return 0;
|
---|
563 | }
|
---|
564 |
|
---|
565 |
|
---|
/*
 * StringOut ()
 * Copy `string' to `Output', expanding three escapes:
 *   %n  ->  intval as an unsigned decimal  (left as "%n" when !intvalid)
 *   %w  ->  floatval in "%f" form          (left as "%w" when !floatvalid)
 *   %%  ->  a single '%'
 * Any other character, including a '%' followed by something else, is
 * copied unchanged.
 */
void
StringOut (FILE * Output, char *string,
           int intvalid, unsigned long intval,
           int floatvalid, double floatval)
{
  char *p = string;

  while (*p != '\0')
    {
      /* character after a '%', or NUL when *p is not a '%' */
      char code = (*p == '%') ? p[1] : '\0';

      if (code == 'n')
        {
          if (intvalid)
            fprintf (Output, "%lu", intval);
          else
            fprintf (Output, "%%n");
          p += 2;
        }
      else if (code == 'w')
        {
          if (floatvalid)
            fprintf (Output, "%f", floatval);
          else
            fprintf (Output, "%%w");
          p += 2;
        }
      else if (code == '%')
        {
          fputc ('%', Output);
          p += 2;
        }
      else
        {
          fputc (*p, Output);
          p++;
        }
    }
}
|
---|
598 |
|
---|
599 |
|
---|
/*
 * HeaderOut ()
 * Write a one-line header taken from the start of a document.
 *
 * At most `heads_length' characters are written.  Runs of whitespace and
 * of the control bytes \01 and \03 collapse to one space, leading ones are
 * dropped, and output stops at the first \02 byte or after ULen bytes.
 */
void
HeaderOut (FILE * Output, u_char * UDoc, unsigned long ULen, int heads_length)
{
  unsigned long i;              /* same type as ULen: no signed/unsigned mix */
  int space = 1, num = 0;

  for (i = 0; i < ULen && num < heads_length; i++)
    {
      /* keep the byte unsigned: isspace () is undefined for negative
         values other than EOF */
      u_char c = UDoc[i];

      if (c == '\02')
        break;

      if (isspace (c) || c == '\01' || c == '\03')
        {
          if (!space)
            {
              fputc (' ', Output);
              num++;
            }
          space = 1;
        }
      else
        {
          space = 0;
          fputc (c, Output);
          num++;
        }
    }
}
|
---|
627 |
|
---|
628 | /* [RPAP - Feb 97: NZDL Additions] */
|
---|
629 | #if defined(PARADOCNUM) || defined(NZDL)
|
---|
/*
 * PrintDocNum ()
 * Write one "docnum.indexnum" line to `output'.  For query types 'R' and
 * 'A' the weight is appended to the line; other types omit it.
 */
void PrintDocNum(FILE *output, char query_type,
                 int docnum, int indexnum, float weight)
{
  int weighted = (query_type == 'R' || query_type == 'A');

  if (!weighted)
    {
      fprintf(output, "%7d.%-7d\n", docnum, indexnum);
      return;
    }
  fprintf(output, "%7d.%-7d %6.4f\n", docnum, indexnum, weight);
}
|
---|
638 | #endif
|
---|
639 |
|
---|
640 | static int
|
---|
641 | ProcessDocs (query_data * qd, int num, int verbatim,
|
---|
642 | char OutputType, FILE * Output)
|
---|
643 | {
|
---|
644 | int max_buf = 0;
|
---|
645 | int DocCount = 0;
|
---|
646 | char *doc_sepstr = NULL;
|
---|
647 | char *para_sepstr = NULL;
|
---|
648 | char *para_start = NULL;
|
---|
649 | int heads_length = atoi (GetDefEnv ("heads_length", "50"));
|
---|
650 | char QueryType = get_query_type ();
|
---|
651 | int need_text = (OutputType == OUTPUT_TEXT || OutputType == OUTPUT_HILITE ||
|
---|
652 | OutputType == OUTPUT_HEADERS || OutputType == OUTPUT_SILENT ||
|
---|
653 | post_proc); /* [RJM June 1997 -- fixing post retrieval scan] */
|
---|
654 |
|
---|
655 | if (OutputType == OUTPUT_TEXT || OutputType == OUTPUT_HILITE)
|
---|
656 | {
|
---|
657 | if (QueryType == QUERY_APPROX || QueryType == QUERY_RANKED)
|
---|
658 | {
|
---|
659 | doc_sepstr = de_escape_string (
|
---|
660 | Xstrdup (GetDefEnv ("ranked_doc_sepstr",
|
---|
661 | "---------------------------------- %n %w\\n")));
|
---|
662 | }
|
---|
663 | else
|
---|
664 | {
|
---|
665 | doc_sepstr = de_escape_string (
|
---|
666 | Xstrdup (GetDefEnv ("doc_sepstr",
|
---|
667 | "---------------------------------- %n\\n")));
|
---|
668 | }
|
---|
669 | para_sepstr = de_escape_string (
|
---|
670 | Xstrdup (GetDefEnv ("para_sepstr",
|
---|
671 | "\\n######## PARAGRAPH %n ########\\n")));
|
---|
672 |
|
---|
673 | para_start = de_escape_string (
|
---|
674 | Xstrdup (GetDefEnv ("para_start",
|
---|
675 | "***** Weight = %w *****\\n")));
|
---|
676 | }
|
---|
677 |
|
---|
678 | if (need_text)
|
---|
679 | {
|
---|
680 | max_buf = atoi (GetDefEnv ("buffer", "1048576"));
|
---|
681 | }
|
---|
682 |
|
---|
683 | do
|
---|
684 | {
|
---|
685 | u_char *UDoc = NULL;
|
---|
686 | unsigned long ULen;
|
---|
687 |
|
---|
688 | if (need_text)
|
---|
689 | {
|
---|
690 | /* load the compressed text */
|
---|
691 | if (LoadCompressedText (qd, max_buf))
|
---|
692 | {
|
---|
693 | Message ("Unable to load compressed text.");
|
---|
694 | FatalError (1, "This is probably due to lack of memory.");
|
---|
695 | }
|
---|
696 |
|
---|
697 | /* uncompress the loaded text */
|
---|
698 | UDoc = GetDocText (qd, &ULen);
|
---|
699 | if (UDoc == NULL)
|
---|
700 | FatalError (1, "UDoc is unexpectedly NULL");
|
---|
701 | }
|
---|
702 |
|
---|
703 | if (!UDoc || PostProc ((char *) UDoc, verbatim))
|
---|
704 | {
|
---|
705 | switch (OutputType)
|
---|
706 | {
|
---|
707 | case OUTPUT_COUNT:
|
---|
708 | case OUTPUT_SILENT:
|
---|
709 | break;
|
---|
710 | case OUTPUT_DOCNUMS: /* This prints out the docnums string */
|
---|
711 | if (PagerRunning)
|
---|
712 | {
|
---|
713 |
|
---|
714 | /* [RPAP - Feb 97: NZDL Additions] */
|
---|
715 | #if defined(PARADOCNUM) || defined(NZDL)
|
---|
716 | int doc_num = GetDocNum(qd);
|
---|
717 |
|
---|
718 | if (qd->paragraph)
|
---|
719 | {
|
---|
720 | if (qd->id->ifh.InvfLevel == 3 &&
|
---|
721 | (QueryType == 'R' || QueryType == 'A'))
|
---|
722 | {
|
---|
723 | /* Print weights for each paragraph in document */
|
---|
724 |
|
---|
725 |
|
---|
726 | int true_doc_num = GetDocNumFromParaNum(qd, doc_num);
|
---|
727 |
|
---|
728 | /* Get number of paragraphs in this document */
|
---|
729 |
|
---|
730 | int num_paragraphs =
|
---|
731 | qd->paragraph[true_doc_num]-qd->paragraph[true_doc_num-1];
|
---|
732 |
|
---|
733 | int init_para = FetchInitialParagraph(qd->td,
|
---|
734 | doc_num);
|
---|
735 | DocEntry *de, *doc_chain = GetDocChain(qd);
|
---|
736 | int i;
|
---|
737 |
|
---|
738 | for (i = 0; i < num_paragraphs; i++)
|
---|
739 | {
|
---|
740 | if ((de = in_chain(i, init_para, doc_chain)))
|
---|
741 | PrintDocNum(Output, QueryType,
|
---|
742 | true_doc_num, init_para+i,
|
---|
743 | de->Weight);
|
---|
744 | }
|
---|
745 | }
|
---|
746 | else
|
---|
747 | PrintDocNum(Output, QueryType,
|
---|
748 | GetDocNumFromParaNum(qd, GetDocNum(qd)),
|
---|
749 | GetDocNum(qd),
|
---|
750 | GetDocWeight(qd));
|
---|
751 | }
|
---|
752 | else
|
---|
753 | {
|
---|
754 | PrintDocNum(Output, QueryType,
|
---|
755 | doc_num, doc_num, GetDocWeight(qd));
|
---|
756 | }
|
---|
757 | #else
|
---|
758 | fprintf (Output, "%7d %6.4f %7lu\n", GetDocNum (qd),
|
---|
759 | GetDocWeight (qd), GetDocCompLength (qd));
|
---|
760 | #endif
|
---|
761 | }
|
---|
762 | break;
|
---|
763 | case OUTPUT_HEADERS: /* This prints out the headers of the documents */
|
---|
764 | if (PagerRunning)
|
---|
765 | fprintf (Output, "%d ", GetDocNum (qd));
|
---|
766 | HeaderOut (Output, UDoc, ULen, heads_length);
|
---|
767 | if (PagerRunning)
|
---|
768 | fputc ('\n', Output);
|
---|
769 | break;
|
---|
770 | #if TREC_MODE
|
---|
771 | case OUTPUT_EXTRAS: /* This prints out the docnums string */
|
---|
772 | if (PagerRunning && trec_ids)
|
---|
773 | {
|
---|
774 | long DN, PN = GetDocNum (qd) - 1;
|
---|
775 | if (trec_paras)
|
---|
776 | DN = trec_paras[PN];
|
---|
777 | else
|
---|
778 | DN = PN;
|
---|
779 | fprintf (Output, "%-14.14s %8ld %10.5f\n",
|
---|
780 | &trec_ids[DN * 14], PN + 1, GetDocWeight (qd));
|
---|
781 | }
|
---|
782 | break;
|
---|
783 | #endif
|
---|
784 | case OUTPUT_TEXT:
|
---|
785 | case OUTPUT_HILITE:
|
---|
786 | {
|
---|
787 | int j, para = -1, curr_para = 0;
|
---|
788 | int init_para = -1;
|
---|
789 | DocEntry *de, *doc_chain = NULL;
|
---|
790 | register char ch = ' ';
|
---|
791 | register char lch = '\n';
|
---|
792 |
|
---|
793 | /* [RPAP - Feb 97: NZDL Additions] */
|
---|
794 | #if defined(PARADOCNUM) || defined(NZDL)
|
---|
795 | if (qd->id->ifh.InvfLevel == 3)
|
---|
796 | {
|
---|
797 | init_para = FetchInitialParagraph(qd->td, GetDocNum(qd));
|
---|
798 |
|
---|
799 | StringOut(Output, para_sepstr,
|
---|
800 | 1, init_para+curr_para,
|
---|
801 | 0, 0);
|
---|
802 |
|
---|
803 | }
|
---|
804 | else
|
---|
805 | StringOut(Output, doc_sepstr,
|
---|
806 | 1, GetDocNum(qd),
|
---|
807 | QueryType == 'A' || QueryType == 'R',
|
---|
808 | GetDocWeight(qd));
|
---|
809 |
|
---|
810 | #else
|
---|
811 | int p_on = 0;
|
---|
812 |
|
---|
813 | if (PagerRunning)
|
---|
814 | {
|
---|
815 | StringOut (Output, doc_sepstr,
|
---|
816 | 1, GetDocNum (qd),
|
---|
817 | QueryType == 'A' || QueryType == 'R',
|
---|
818 | GetDocWeight (qd));
|
---|
819 | }
|
---|
820 | if (qd->id->ifh.InvfLevel == 3)
|
---|
821 | {
|
---|
822 | init_para = FetchInitialParagraph (qd->td, GetDocNum (qd));
|
---|
823 | doc_chain = GetDocChain (qd);
|
---|
824 | para = GetDocNum (qd) - init_para;
|
---|
825 |
|
---|
826 | StringOut (Output, para_sepstr,
|
---|
827 | 1, curr_para + 1,
|
---|
828 | 0, 0);
|
---|
829 |
|
---|
830 | if ((de = in_chain (0, init_para, doc_chain)))
|
---|
831 | StringOut (Output, para_start,
|
---|
832 | 0, 0,
|
---|
833 | 1, de->Weight);
|
---|
834 |
|
---|
835 | if (doc_chain->DocNum - init_para == 0)
|
---|
836 | p_on = 1;
|
---|
837 | }
|
---|
838 | #endif
|
---|
839 | for (j = 0; j < ULen; j++)
|
---|
840 | {
|
---|
841 | ch = UDoc[j];
|
---|
842 | switch (ch)
|
---|
843 | {
|
---|
844 | case '\02':
|
---|
845 | break;
|
---|
846 | case '\01':
|
---|
847 | ch = '\n';
|
---|
848 | case '\03':
|
---|
849 | /* [RPAP - Feb 97: NZDL Additions] */
|
---|
850 | #if defined(PARADOCNUM) || defined(NZDL)
|
---|
851 | /* print paragraph numbers only if this is
|
---|
852 | a level 3 index */
|
---|
853 | if (qd->id->ifh.InvfLevel == 3)
|
---|
854 | {
|
---|
855 | curr_para++;
|
---|
856 | StringOut(Output, para_sepstr,
|
---|
857 | 1, init_para+curr_para,
|
---|
858 | 0, 0);
|
---|
859 | }
|
---|
860 | #else
|
---|
861 | p_on = 0;
|
---|
862 | curr_para++;
|
---|
863 | StringOut (Output, para_sepstr,
|
---|
864 | 1, curr_para + 1,
|
---|
865 | 0, 0);
|
---|
866 | lch = *(strchr (para_sepstr, '\0') - 1);
|
---|
867 | if ((de = in_chain (curr_para, init_para, doc_chain)))
|
---|
868 | StringOut (Output, para_start,
|
---|
869 | 0, 0,
|
---|
870 | 1, de->Weight);
|
---|
871 | if (doc_chain &&
|
---|
872 | doc_chain->DocNum - init_para == curr_para)
|
---|
873 | p_on = 1;
|
---|
874 | #endif
|
---|
875 | break;
|
---|
876 | default:
|
---|
877 | {
|
---|
878 | if (PagerRunning)
|
---|
879 | {
|
---|
880 | fputc (ch, Output);
|
---|
881 | /* [RPAP - Feb 97: NZDL Additions] */
|
---|
882 | #if !defined(PARADOCNUM) && !defined(NZDL)
|
---|
883 | if (p_on && isprint (ch))
|
---|
884 | {
|
---|
885 | fputc ('\b', Output);
|
---|
886 | fputc ('_', Output);
|
---|
887 | }
|
---|
888 | #endif
|
---|
889 | }
|
---|
890 |
|
---|
891 | lch = ch;
|
---|
892 | }
|
---|
893 | }
|
---|
894 | }
|
---|
895 | if (PagerRunning && lch != '\n')
|
---|
896 | fputc ('\n', Output);
|
---|
897 | /* [RPAP - Feb 97: NZDL Additions] */
|
---|
898 | #if !defined(PARADOCNUM) && !defined(NZDL)
|
---|
899 | p_on = 0;
|
---|
900 | #endif
|
---|
901 | }
|
---|
902 | }
|
---|
903 | if (PagerRunning)
|
---|
904 | fflush (Output);
|
---|
905 |
|
---|
906 | DocCount++; /* moved within if statement [RJM June 1997 -- fixing post retrieval scan] */
|
---|
907 | }
|
---|
908 | }
|
---|
909 | while (NextDoc (qd) && PagerRunning && (!Ctrl_C));
|
---|
910 |
|
---|
911 | if (need_text)
|
---|
912 | {
|
---|
913 | FreeTextBuffer (qd);
|
---|
914 | }
|
---|
915 |
|
---|
916 | if (OutputType == OUTPUT_TEXT || OutputType == OUTPUT_HILITE)
|
---|
917 | {
|
---|
918 | Xfree (doc_sepstr);
|
---|
919 | Xfree (para_sepstr);
|
---|
920 | Xfree (para_start);
|
---|
921 | }
|
---|
922 |
|
---|
923 | return (DocCount);
|
---|
924 | }
|
---|
925 |
|
---|
926 |
|
---|
/*
 * output_terminator ()
 * Write the value of the "terminator" setting (empty by default) to `out'
 * after passing it through de_escape_string ().
 */
void
output_terminator (FILE * out)
{
  char *terminator;

  /* work on a private copy; it is released below */
  terminator = Xstrdup (GetDefEnv ("terminator", ""));
  de_escape_string (terminator);

  fputs (terminator, out);
  Xfree (terminator);
}
|
---|
935 |
|
---|
936 |
|
---|
937 |
|
---|
938 |
|
---|
939 | /* MoreDocs () */
|
---|
940 | /* Displays all documents in list DocList. */
|
---|
941 | /* Documents are fetched, then decompressed and displayed according to the */
|
---|
942 | /* format implied in FormString(). */
|
---|
943 |
|
---|
944 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
945 | #ifdef __WIN32__
|
---|
946 | # define HILITE_PAGER "mg_hilite_words.exe"
|
---|
947 | #else
|
---|
948 | # define HILITE_PAGER "mg_hilite_words"
|
---|
949 | #endif
|
---|
950 |
|
---|
951 | #define MAX_HILITE_PAGER_STR 80 /* for command & its options */
|
---|
952 |
|
---|
953 | static void
|
---|
954 | MoreDocs (query_data * qd, char *Query, char OutputType)
|
---|
955 | {
|
---|
956 | static char terms_str[MAXTERMSTRLEN + 1];
|
---|
957 | int DocCount = 0; /* number of actual matches */
|
---|
958 | FILE *Output = NULL;
|
---|
959 | int using_pipe = 0;
|
---|
960 | char *pager = NULL;
|
---|
961 |
|
---|
962 | Ctrl_C = 0;
|
---|
963 |
|
---|
964 | qd->num_of_ans = qd->DL->num;
|
---|
965 |
|
---|
966 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
967 | #ifndef __WIN32__
|
---|
968 | signal (SIGPIPE, SIGPIPE_handler);
|
---|
969 | #endif
|
---|
970 | signal (SIGINT, SIGINT_handler);
|
---|
971 |
|
---|
972 | PagerRunning = 1;
|
---|
973 | if (isatty (fileno (OutFile)) && GetEnv ("pager") &&
|
---|
974 | OutputType != OUTPUT_HILITE &&
|
---|
975 | OutputType != OUTPUT_SILENT && OutputType != OUTPUT_COUNT)
|
---|
976 | {
|
---|
977 | pager = GetEnv ("pager");
|
---|
978 | }
|
---|
979 | else if (isatty (fileno (OutFile)) && OutputType == OUTPUT_HILITE)
|
---|
980 | {
|
---|
981 | /* concat the pager and its word argument strings */
|
---|
982 | ConvertTermsToString (qd->TL, terms_str);
|
---|
983 | pager = Xmalloc (MAX_HILITE_PAGER_STR + strlen (terms_str) + 1);
|
---|
984 | if (!pager)
|
---|
985 | {
|
---|
986 | fprintf (stderr, "Unable to allocate memory for highlighting\n");
|
---|
987 | return;
|
---|
988 | }
|
---|
989 | sprintf (pager, "%s --style=%s --pager=%s --stem_method=%ld --stemmer=%ld %s",
|
---|
990 | HILITE_PAGER,
|
---|
991 | GetEnv ("hilite_style"),
|
---|
992 | GetEnv ("pager"),
|
---|
993 | qd->sd->sdh.stem_method,
|
---|
994 | qd->sd->sdh.stemmer_num,
|
---|
995 | terms_str);
|
---|
996 |
|
---|
997 | }
|
---|
998 | else
|
---|
999 | {
|
---|
1000 | Output = OutFile;
|
---|
1001 | }
|
---|
1002 |
|
---|
1003 | /* [RPAP - Feb 97: NZDL Additions] */
|
---|
1004 | #if defined(OUTPUTSTEMMEDWORDS) || defined(NZDL)
|
---|
1005 | if (!isatty(fileno(OutFile)) && get_query_type() != QUERY_DOCNUMS)
|
---|
1006 | {
|
---|
1007 | ConvertTermsToString(qd->TL, terms_str);
|
---|
1008 | fprintf(Output, "%s\n", terms_str);
|
---|
1009 | }
|
---|
1010 | #endif
|
---|
1011 | if (pager)
|
---|
1012 | {
|
---|
1013 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
1014 | #ifdef __WIN32__
|
---|
1015 | Output = _popen (pager, "w");
|
---|
1016 | #else
|
---|
1017 | Output = popen (pager, "w");
|
---|
1018 | #endif
|
---|
1019 | using_pipe = (Output != NULL);
|
---|
1020 | if (!using_pipe)
|
---|
1021 | {
|
---|
1022 | fprintf (stderr, "Unable to run \"%s\"\n", pager);
|
---|
1023 | return;
|
---|
1024 | }
|
---|
1025 | }
|
---|
1026 |
|
---|
1027 |
|
---|
1028 | if (qd->DL->num > 0)
|
---|
1029 | {
|
---|
1030 | if (OutputType == OUTPUT_COUNT && !post_proc)
|
---|
1031 | DocCount = qd->DL->num;
|
---|
1032 | else {
|
---|
1033 | DocCount = ProcessDocs (qd, qd->DL->num,
|
---|
1034 | BooleanEnv (GetEnv ("verbatim"), 1),
|
---|
1035 | OutputType, Output);
|
---|
1036 | }
|
---|
1037 | }
|
---|
1038 |
|
---|
1039 | if (PagerRunning)
|
---|
1040 | {
|
---|
1041 | output_terminator (Output);
|
---|
1042 | fflush (Output);
|
---|
1043 | }
|
---|
1044 |
|
---|
1045 | if (OutputType == OUTPUT_HILITE && pager)
|
---|
1046 | free (pager); /* as needed to malloc to create the pager string */
|
---|
1047 |
|
---|
1048 | if (using_pipe)
|
---|
1049 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
1050 | #ifdef __WIN32__
|
---|
1051 | _pclose (Output);
|
---|
1052 | #else
|
---|
1053 | pclose (Output);
|
---|
1054 | #endif
|
---|
1055 |
|
---|
1056 | if (qd->DL->num == 0)
|
---|
1057 | fprintf (stderr, "No entries correspond to that query.\n");
|
---|
1058 | else
|
---|
1059 | {
|
---|
1060 | if (OutputType == OUTPUT_COUNT)
|
---|
1061 | fprintf (stderr, "%d documents match.\n", DocCount);
|
---|
1062 | else
|
---|
1063 | fprintf (stderr, "%d documents retrieved.\n", DocCount);
|
---|
1064 | }
|
---|
1065 |
|
---|
1066 | signal (SIGINT, SIG_DFL);
|
---|
1067 | }
|
---|
1068 |
|
---|
1069 |
|
---|
1070 | void
|
---|
1071 | start_up_stats (query_data * qd, InitQueryTimes iqt)
|
---|
1072 | {
|
---|
1073 | Clear_Stats ();
|
---|
1074 | if (BooleanEnv (GetEnv ("timestats"), 0) ||
|
---|
1075 | BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
1076 | StartUpTimeStats (&iqt);
|
---|
1077 |
|
---|
1078 | if (BooleanEnv (GetEnv ("diskstats"), 0) ||
|
---|
1079 | BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
1080 | File_Stats (qd);
|
---|
1081 |
|
---|
1082 | if (BooleanEnv (GetEnv ("memstats"), 0) ||
|
---|
1083 | BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
1084 | MemStats (qd);
|
---|
1085 |
|
---|
1086 | }
|
---|
1087 |
|
---|
1088 |
|
---|
1089 | void
|
---|
1090 | shut_down_stats (query_data * qd, ProgTime * start,
|
---|
1091 | ProgTime * invf, ProgTime * text)
|
---|
1092 | {
|
---|
1093 | Clear_Stats ();
|
---|
1094 | if (BooleanEnv (GetEnv ("timestats"), 0) ||
|
---|
1095 | BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
1096 | QueryTimeStats (start, invf, text);
|
---|
1097 |
|
---|
1098 | if (BooleanEnv (GetEnv ("diskstats"), 0) ||
|
---|
1099 | BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
1100 | {
|
---|
1101 | TransFileStats (qd);
|
---|
1102 | File_Stats (qd);
|
---|
1103 | }
|
---|
1104 |
|
---|
1105 | if (BooleanEnv (GetEnv ("sizestats"), 0))
|
---|
1106 | TotalSizeStats (qd);
|
---|
1107 | }
|
---|
1108 |
|
---|
1109 |
|
---|
1110 |
|
---|
/* wordfreqword2str () */
/* Converts a length-prefixed word (the first byte holds the number of    */
/* characters that follow) into a NUL-terminated C string.  The result    */
/* is held in a static buffer, so it is overwritten by the next call.     */
char *wordfreqword2str (u_char * s)
{
  static char buf[1024];
  int remaining = (int) *s;     /* length byte: at most 255 characters */
  u_char *src = s + 1;
  char *dst = buf;

  while (remaining-- > 0)
    *dst++ = (char) *src++;
  *dst = '\0';

  return buf;
}
1124 |
|
---|
1125 |
|
---|
1126 | /* [RPAP - Feb 97: Term Frequency] */
|
---|
1127 | /*********************************
|
---|
1128 | * PrintQueryTermFreq
|
---|
1129 | *
|
---|
1130 | * Prints the query terms and their respective frequencies within the collection
|
---|
1131 | *********************************/
|
---|
1132 | void
|
---|
1133 | PrintQueryTermFreqs (QueryTermList *qtl)
|
---|
1134 | {
|
---|
1135 | int i;
|
---|
1136 |
|
---|
1137 | /* Print the number of terms */
|
---|
1138 | fprintf (OutFile, "%d\n", qtl->num);
|
---|
1139 |
|
---|
1140 | /* Print the terms and their respective frequency within the collection */
|
---|
1141 | for (i = 0; i < qtl->num; i++)
|
---|
1142 | if (qtl->QTE[i].stem_method == -1)
|
---|
1143 | /* Using default stem method - don't print stem method beside term */
|
---|
1144 | fprintf (OutFile, "%s %d\n", wordfreqword2str (qtl->QTE[i].Term), qtl->QTE[i].Count);
|
---|
1145 | else
|
---|
1146 | /* Term was forced with a stem, print stem method with term */
|
---|
1147 | fprintf (OutFile, "%s#%d %d\n", wordfreqword2str (qtl->QTE[i].Term), qtl->QTE[i].stem_method, qtl->QTE[i].Count);
|
---|
1148 | }
|
---|
1149 |
|
---|
1150 |
|
---|
1151 | void
|
---|
1152 | query (void)
|
---|
1153 | {
|
---|
1154 | ProgTime TotalStartTime, TotalInvfTime, TotalTextTime;
|
---|
1155 | InitQueryTimes iqt;
|
---|
1156 | query_data *qd;
|
---|
1157 |
|
---|
1158 | TotalStartTime.RealTime = TotalStartTime.CPUTime = 0;
|
---|
1159 | TotalInvfTime.RealTime = TotalInvfTime.CPUTime = 0;
|
---|
1160 | TotalTextTime.RealTime = TotalTextTime.CPUTime = 0;
|
---|
1161 |
|
---|
1162 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
1163 | #ifdef __WIN32__
|
---|
1164 | qd = InitQuerySystem (GetDefEnv ("mgdir", ".\\"),
|
---|
1165 | GetDefEnv ("mgname", ""),
|
---|
1166 | GetDefEnv ("textname", NULL), /* [RJM 06/97: text filename] */
|
---|
1167 | &iqt);
|
---|
1168 | #else
|
---|
1169 | qd = InitQuerySystem (GetDefEnv ("mgdir", "./"),
|
---|
1170 | GetDefEnv ("mgname", ""),
|
---|
1171 | GetDefEnv ("textname", NULL), /* [RJM 06/97: text filename] */
|
---|
1172 | &iqt);
|
---|
1173 | #endif
|
---|
1174 |
|
---|
1175 | if (!qd)
|
---|
1176 | FatalError (1, mg_errorstrs[mg_errno], mg_error_data);
|
---|
1177 | start_up_stats (qd, iqt);
|
---|
1178 |
|
---|
1179 |
|
---|
1180 | while (1)
|
---|
1181 | {
|
---|
1182 | ProgTime StartTime, InvfTime, TextTime;
|
---|
1183 | char QueryType;
|
---|
1184 | char OutputType;
|
---|
1185 | char *line;
|
---|
1186 | ResetFileStats (qd);
|
---|
1187 | qd->max_mem_in_use = qd->mem_in_use = 0;
|
---|
1188 |
|
---|
1189 | qd->tot_hops_taken += qd->hops_taken;
|
---|
1190 | qd->tot_num_of_ptrs += qd->num_of_ptrs;
|
---|
1191 | qd->tot_num_of_accum += qd->num_of_accum;
|
---|
1192 | qd->tot_num_of_terms += qd->num_of_terms;
|
---|
1193 | qd->tot_num_of_ans += qd->num_of_ans;
|
---|
1194 | qd->tot_text_idx_lookups += qd->text_idx_lookups;
|
---|
1195 | qd->hops_taken = qd->num_of_ptrs = 0;
|
---|
1196 | qd->num_of_accum = qd->num_of_ans = qd->num_of_terms = 0;
|
---|
1197 | qd->text_idx_lookups = 0;
|
---|
1198 |
|
---|
1199 | Display_Stats (stderr);
|
---|
1200 | Clear_Stats ();
|
---|
1201 | line = get_query (qd);
|
---|
1202 | if (!line || Quitting)
|
---|
1203 | break;
|
---|
1204 |
|
---|
1205 | GetPostProc (line);
|
---|
1206 |
|
---|
1207 | GetTime (&StartTime);
|
---|
1208 |
|
---|
1209 | FreeQueryDocs (qd);
|
---|
1210 |
|
---|
1211 | QueryType = get_query_type ();
|
---|
1212 | OutputType = get_output_type ();
|
---|
1213 | /* No point in hiliting words on a docnum query */
|
---|
1214 | if (OutputType == OUTPUT_HILITE && QueryType == QUERY_DOCNUMS)
|
---|
1215 | OutputType = OUTPUT_TEXT;
|
---|
1216 |
|
---|
1217 | switch (QueryType)
|
---|
1218 | {
|
---|
1219 | case QUERY_BOOLEAN:
|
---|
1220 | {
|
---|
1221 | char *maxdocs;
|
---|
1222 | BooleanQueryInfo bqi;
|
---|
1223 | maxdocs = GetDefEnv ("maxdocs", "all");
|
---|
1224 | bqi.MaxDocsToRetrieve = strcmp (maxdocs, "all") ? atoi (maxdocs) : -1;
|
---|
1225 | /* [RPAP - Jan 97: Stem Index Change] */
|
---|
1226 | if (qd->sd->sdh.indexed)
|
---|
1227 | BooleanQuery (qd, line, &bqi, (BooleanEnv (GetEnv ("casefold"), 0) |
|
---|
1228 | (BooleanEnv (GetEnv ("stem"), 0) << 1)));
|
---|
1229 | else
|
---|
1230 | BooleanQuery (qd, line, &bqi, qd->sd->sdh.stem_method);
|
---|
1231 |
|
---|
1232 | break;
|
---|
1233 | }
|
---|
1234 | case QUERY_APPROX:
|
---|
1235 | case QUERY_RANKED:
|
---|
1236 | {
|
---|
1237 | char *maxdocs;
|
---|
1238 | char *maxterms;
|
---|
1239 | char *maxaccum;
|
---|
1240 | RankedQueryInfo rqi;
|
---|
1241 | maxdocs = GetDefEnv ("maxdocs", "all");
|
---|
1242 | maxterms = GetDefEnv ("max_terms", "all");
|
---|
1243 | maxaccum = GetDefEnv ("max_accumulators", "all");
|
---|
1244 | rqi.Sort = BooleanEnv (GetEnv ("sorted_terms"), 0);
|
---|
1245 | rqi.QueryFreqs = BooleanEnv (GetEnv ("qfreq"), 1);
|
---|
1246 | rqi.Exact = QueryType == QUERY_RANKED;
|
---|
1247 | rqi.MaxDocsToRetrieve = strcmp (maxdocs, "all") ? atoi (maxdocs) : -1;
|
---|
1248 | rqi.MaxTerms = strcmp (maxterms, "all") ? atoi (maxterms) : -1;
|
---|
1249 | rqi.MaxParasToRetrieve = rqi.MaxDocsToRetrieve;
|
---|
1250 | if (qd->id->ifh.InvfLevel == 3 && GetEnv ("maxparas"))
|
---|
1251 | rqi.MaxParasToRetrieve = atoi (GetEnv ("maxparas"));
|
---|
1252 | rqi.AccumMethod = toupper (*GetDefEnv ("accumulator_method", "A"));
|
---|
1253 | rqi.MaxAccums = strcmp (maxaccum, "all") ? atoi (maxaccum) : -1;
|
---|
1254 | rqi.HashTblSize = IntEnv (GetEnv ("hash_tbl_size"), 1000);
|
---|
1255 | rqi.StopAtMaxAccum = BooleanEnv (GetEnv ("stop_at_max_accum"), 0);
|
---|
1256 | rqi.skip_dump = GetEnv ("skip_dump");
|
---|
1257 | RankedQuery (qd, line, &rqi);
|
---|
1258 | break;
|
---|
1259 | }
|
---|
1260 | case QUERY_DOCNUMS:
|
---|
1261 | {
|
---|
1262 | DocnumsQuery (qd, line);
|
---|
1263 | break;
|
---|
1264 | }
|
---|
1265 | }
|
---|
1266 |
|
---|
1267 | GetTime (&InvfTime);
|
---|
1268 |
|
---|
1269 | /* [RPAP - Feb 97: Term Frequency] */
|
---|
1270 | if (qd->QTL && BooleanEnv (GetEnv ("term_freq"), 0))
|
---|
1271 | PrintQueryTermFreqs (qd->QTL);
|
---|
1272 |
|
---|
1273 | if (qd->DL)
|
---|
1274 | MoreDocs (qd, line, OutputType);
|
---|
1275 |
|
---|
1276 | GetTime (&TextTime);
|
---|
1277 |
|
---|
1278 | if (BooleanEnv (GetEnv ("timestats"), 0) ||
|
---|
1279 | BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
1280 | QueryTimeStats (&StartTime, &InvfTime, &TextTime);
|
---|
1281 |
|
---|
1282 | if (BooleanEnv (GetEnv ("diskstats"), 0) ||
|
---|
1283 | BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
1284 | File_Stats (qd);
|
---|
1285 |
|
---|
1286 | if (BooleanEnv (GetEnv ("memstats"), 0) ||
|
---|
1287 | BooleanEnv (GetEnv ("briefstats"), 0))
|
---|
1288 | MemStats (qd);
|
---|
1289 |
|
---|
1290 | if (BooleanEnv (GetEnv ("sizestats"), 0))
|
---|
1291 | SizeStats (qd);
|
---|
1292 |
|
---|
1293 | TotalInvfTime.RealTime += InvfTime.RealTime - StartTime.RealTime;
|
---|
1294 | TotalInvfTime.CPUTime += InvfTime.CPUTime - StartTime.CPUTime;
|
---|
1295 | TotalTextTime.RealTime += TextTime.RealTime - StartTime.RealTime;
|
---|
1296 | TotalTextTime.CPUTime += TextTime.CPUTime - StartTime.CPUTime;
|
---|
1297 | }
|
---|
1298 |
|
---|
1299 | if (isatty (fileno (InFile)) && !Quitting)
|
---|
1300 | fprintf (stderr, "\n");
|
---|
1301 |
|
---|
1302 | shut_down_stats (qd, &TotalStartTime, &TotalInvfTime, &TotalTextTime);
|
---|
1303 |
|
---|
1304 | Display_Stats (stderr);
|
---|
1305 |
|
---|
1306 | }
|
---|
1307 |
|
---|
1308 |
|
---|
1309 | void
|
---|
1310 | search_for_collection (char *name)
|
---|
1311 | {
|
---|
1312 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
1313 | #ifdef __WIN32__
|
---|
1314 | char *dir = GetDefEnv ("mgdir", ".\\");
|
---|
1315 | #else
|
---|
1316 | char *dir = GetDefEnv ("mgdir", "./");
|
---|
1317 | #endif
|
---|
1318 | char buffer[512];
|
---|
1319 | struct stat stat_buf;
|
---|
1320 | if (strrchr (dir, '/') && *(strrchr (dir, '/') + 1) != '\0')
|
---|
1321 | {
|
---|
1322 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
1323 | #ifdef __WIN32__
|
---|
1324 | sprintf (buffer, "%s", dir);
|
---|
1325 | #else
|
---|
1326 | sprintf (buffer, "%s/", dir);
|
---|
1327 | #endif
|
---|
1328 | SetEnv ("mgdir", buffer, NULL);
|
---|
1329 | dir = GetEnv ("mgdir");
|
---|
1330 | }
|
---|
1331 |
|
---|
1332 | sprintf (buffer, "%s.text", name);
|
---|
1333 | if (stat (buffer, &stat_buf) != -1)
|
---|
1334 | {
|
---|
1335 | if ((stat_buf.st_mode & S_IFREG) != 0)
|
---|
1336 | {
|
---|
1337 | /* The name is a directory */
|
---|
1338 | SetEnv ("mgname", name, NULL);
|
---|
1339 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
1340 | #ifdef __WIN32__
|
---|
1341 | SetEnv ("mgdir", ".\\", NULL);
|
---|
1342 | #else
|
---|
1343 | SetEnv ("mgdir", "./", NULL);
|
---|
1344 | #endif
|
---|
1345 | return;
|
---|
1346 | }
|
---|
1347 | }
|
---|
1348 |
|
---|
1349 | sprintf (buffer, "%s%s", dir, name);
|
---|
1350 | if (stat (buffer, &stat_buf) != -1)
|
---|
1351 | {
|
---|
1352 | if ((stat_buf.st_mode & S_IFDIR) != 0)
|
---|
1353 | {
|
---|
1354 | /* The name is a directory */
|
---|
1355 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
1356 | #ifdef __WIN32__
|
---|
1357 | sprintf (buffer, "%s%s", name, name);
|
---|
1358 | #else
|
---|
1359 | sprintf (buffer, "%s/%s", name, name);
|
---|
1360 | #endif
|
---|
1361 | SetEnv ("mgname", buffer, NULL);
|
---|
1362 | return;
|
---|
1363 | }
|
---|
1364 | }
|
---|
1365 |
|
---|
1366 | /* Look in the current directory last */
|
---|
1367 | if (stat (name, &stat_buf) != -1)
|
---|
1368 | {
|
---|
1369 | if ((stat_buf.st_mode & S_IFDIR) != 0)
|
---|
1370 | {
|
---|
1371 | /* The name is a directory */
|
---|
1372 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
1373 | #ifdef __WIN32__
|
---|
1374 | sprintf (buffer, "%s%s", name, name);
|
---|
1375 | SetEnv ("mgdir", ".\\", NULL);
|
---|
1376 | #else
|
---|
1377 | sprintf (buffer, "%s/%s", name, name);
|
---|
1378 | SetEnv ("mgdir", "./", NULL);
|
---|
1379 | #endif
|
---|
1380 | SetEnv ("mgname", buffer, NULL);
|
---|
1381 | return;
|
---|
1382 | }
|
---|
1383 | }
|
---|
1384 |
|
---|
1385 | SetEnv ("mgname", name, NULL);
|
---|
1386 | }
|
---|
1387 |
|
---|
1388 | /* main () */
|
---|
1389 | /* Initialises global variables based on command line switches, and opens */
|
---|
1390 | /* files. Then calls query () to perform the querying. */
|
---|
1391 | int main (int argc, char **argv)
|
---|
1392 | {
|
---|
1393 | ProgTime StartTime;
|
---|
1394 | int decomp = 0;
|
---|
1395 | int ch;
|
---|
1396 |
|
---|
1397 | msg_prefix = argv[0];
|
---|
1398 | GetTime (&StartTime);
|
---|
1399 |
|
---|
1400 | /* Initialise the environment with default values */
|
---|
1401 |
|
---|
1402 | InitEnv ();
|
---|
1403 |
|
---|
1404 | read_mgrc_file ();
|
---|
1405 |
|
---|
1406 | OutFile = stdout;
|
---|
1407 | InFile = stdin;
|
---|
1408 |
|
---|
1409 | opterr = 0;
|
---|
1410 | /* [RJM 06/97: text filename] */
|
---|
1411 | while ((ch = getopt (argc, argv, "Df:d:t:h")) != -1) {
|
---|
1412 | switch (ch) {
|
---|
1413 | case 'f':
|
---|
1414 | SetEnv ("mgname", optarg, NULL);
|
---|
1415 | break;
|
---|
1416 | case 'd':
|
---|
1417 | SetEnv ("mgdir", optarg, NULL);
|
---|
1418 | break;
|
---|
1419 | case 't': /* [RJM 06/97: text filename] */
|
---|
1420 | SetEnv ("textname", optarg, NULL);
|
---|
1421 | break;
|
---|
1422 | case 'D':
|
---|
1423 | decomp = 1;
|
---|
1424 | break;
|
---|
1425 | case 'h':
|
---|
1426 | case '?':
|
---|
1427 | fprintf (stderr, "usage: %s [-D] [-f base name of collection] "
|
---|
1428 | "[-t base name of files for text] " /* [RJM 06/97: text filename] */
|
---|
1429 | "[-d data directory] [collection]\n", argv[0]);
|
---|
1430 | exit (1);
|
---|
1431 | }
|
---|
1432 | }
|
---|
1433 |
|
---|
1434 | PushEnv ();
|
---|
1435 |
|
---|
1436 | if (decomp == 0)
|
---|
1437 | {
|
---|
1438 |
|
---|
1439 | Init_ReadLine ();
|
---|
1440 |
|
---|
1441 | /* write a first prompt, let the user start thinking */
|
---|
1442 | if (!BooleanEnv (GetEnv ("expert"), 0) && isatty (fileno (InFile)))
|
---|
1443 | {
|
---|
1444 | fprintf (stderr, "\n\n\t FULL TEXT RETRIEVAL QUERY PROGRAM\n");
|
---|
1445 | fprintf (stderr, "%24s%s\n\n", "", *"21 Mar 1994" == '%' ? __DATE__ : "21 Mar 1994");
|
---|
1446 | fprintf (stderr, "\n");
|
---|
1447 | fprintf (stderr, " mgquery version " VERSION ", Copyright (C) 1994 Neil Sharman\n");
|
---|
1448 | fprintf (stderr, " mgquery comes with ABSOLUTELY NO WARRANTY; for details type `.warranty'\n");
|
---|
1449 | fprintf (stderr, " This is free software, and you are welcome to redistribute it\n");
|
---|
1450 | fprintf (stderr, " under certain conditions; type `.conditions' for details.\n");
|
---|
1451 | fprintf (stderr, "\n");
|
---|
1452 | }
|
---|
1453 | }
|
---|
1454 | if (optind < argc)
|
---|
1455 | search_for_collection (argv[optind]);
|
---|
1456 |
|
---|
1457 | if (decomp == 0)
|
---|
1458 | {
|
---|
1459 | query ();
|
---|
1460 | }
|
---|
1461 | else
|
---|
1462 | {
|
---|
1463 | int i;
|
---|
1464 | InitQueryTimes iqt;
|
---|
1465 | query_data *qd;
|
---|
1466 |
|
---|
1467 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
1468 | #ifdef __WIN32__
|
---|
1469 | qd = InitQuerySystem (GetDefEnv ("mgdir", ".\\"),
|
---|
1470 | GetDefEnv ("mgname", ""),
|
---|
1471 | GetDefEnv ("textname", NULL), /* [RJM 06/97: text filename] */
|
---|
1472 | &iqt);
|
---|
1473 | #else
|
---|
1474 | qd = InitQuerySystem (GetDefEnv ("mgdir", "./"),
|
---|
1475 | GetDefEnv ("mgname", ""),
|
---|
1476 | GetDefEnv ("textname", NULL), /* [RJM 06/97: text filename] */
|
---|
1477 | &iqt);
|
---|
1478 | #endif
|
---|
1479 | if (!qd)
|
---|
1480 | FatalError (1, mg_errorstrs[mg_errno], mg_error_data);
|
---|
1481 |
|
---|
1482 |
|
---|
1483 | start_up_stats (qd, iqt);
|
---|
1484 |
|
---|
1485 | Display_Stats (stderr);
|
---|
1486 | for (i = 0; i < qd->td->cth.num_of_docs; i++)
|
---|
1487 | {
|
---|
1488 | RawDocOutput (qd, i + 1, stdout);
|
---|
1489 | putc ('\2', stdout);
|
---|
1490 | }
|
---|
1491 | Message ("%s", ElapsedTime (&StartTime, NULL));
|
---|
1492 |
|
---|
1493 | FinishQuerySystem (qd);
|
---|
1494 | }
|
---|
1495 |
|
---|
1496 | UninitEnv ();
|
---|
1497 | return 0;
|
---|
1498 | }
|
---|