source: trunk/gsdl/packages/mg-1.3d/src/text/mg_hilite_words.c@ 34

Last change on this file since 34 was 34, checked in by rjmcnab, 26 years ago

Modified mg so that you can specify the stemmer you want
to use via a command line option. You specify it to
mg_passes during the build process. The number of the
stemmer that you used is stored within the inverted
dictionary header and the stemmed dictionary header so
the correct stemmer is used in later stages of building
and querying.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.2 KB
Line 
1/**************************************************************************
2 *
3 * mg_hilite_words -- display text and highlight particular words which
4 * it contains
5 * Copyright (C) 1994 Tim Shimmin
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * $Id: mg_hilite_words.c 34 1998-11-25 07:55:52Z rjmcnab $
22 *
23 **************************************************************************/
24
/* Version-control identification string ($Id$ keyword) for this file. */
static char *RCSID = "$Id: mg_hilite_words.c 34 1998-11-25 07:55:52Z rjmcnab $";
26
27#include "sysfuncs.h"
28
29#include "getopt.h"
30#include "messages.h"
31#include "local_strings.h"
32#include "stemmer.h"
33#include "words.h"
34
35
36/*
37 * Description
38 * -----------
39 * Hilite_words reads text from stdin and outputs it into a pager such as
40 * less. Its command arguments include a list of stemmed words
41 * which if extracted from the text will be highlighted on the output.
42 *
43 * Implementation
44 * --------------
45 * Extracting of words - using PARSE_STEM_WORD & stemmer
46 *
47 * Highlighting of words - for standard pager:
48 * using back-space character code
49 * e.g. bolded a = "a\ba", underlined a = "_\ba"
50 * - for pager==html:
51 * use some of the standard html character
52 * formatting tags
53 *
54 * Storage of words - using hashtable (set) of size equalling a constant
55 * times the number of words
56 *
57 * Usage
58 * -----
59 * mg_hilite_words --stem_method [0-3]
60 * --style [bold|underline|italic|emphasis|strong]
61 * --pager [less|more|html|???]
62 * --terminator [terminator-string]
63 * list-of-words-to-highlight
64 */
65
66/*
67 * Modifications:
68 *
69 * 21/Apr/95: To handle outputting html tags to stdout
70 *
71 */
72
/* --- constants --- */

/* highlighting styles; each value is used as an index into the
 * hilite_names[] and hilite_tags[] tables */
#define HILITE_MAX 5 /* the number of styles */
#define BOLD 0
#define UNDERLINE 1
#define ITALIC 2
#define EMPHASIS 3
#define STRONG 4

/* maximum length of line buffer */
#define MAX_LINE_BUFFER 200

/* set pager to this to get html hilited text to stdout */
#define HTML_OUT "html"

/* output types (values of the output_type global) */
#define PAGER 0
#define HTML 1
92
/* --- types --- */

/* A word held as a byte string whose first byte is its length; the
 * characters follow and there is no terminating NUL (this is the layout
 * built by copy_c_word and read by set_print). */
typedef u_char *Word;
96
97/* --- globals --- */
98
99/* keep in synch with constants */
100static char *hilite_names[] =
101{"bold", "underline", "italic", "emphasis", "strong"};
102static char *hilite_tags[] =
103{"B", "", "I", "EM", "STRONG"};
104static short hilite_style = BOLD;
105static char *pager = "less";
106static int stemmer_num = 0; /* Lovin's stemmer */
107static int stem_method = 3; /* fold & stem */
108static char **word_list;
109static int num_of_words = 0;
110static int output_type = PAGER;
111static char *terminator = NULL;
112
/* --- prototypes --- */

/* forward declarations of the file-local helpers defined below */
static Word copy_c_word (char *w);
static void process_words (char **words, int num_of_words);
static int get_line (u_char * line, int n, FILE * stream);
static void process_text (FILE * input_file, FILE * output_file);
static void copy_word (u_char * w1, u_char * w2);
static void process_buffer (u_char * s_in, int len, FILE * output_file);
static void output_word (u_char * s_start, u_char * s_finish, FILE * output_file);
static void output_hilite_word (u_char * s_start, u_char * s_finish,
				FILE * output_file);
static void process_args (int argc, char *argv[]);
static void print_line (u_char * line, int n, FILE * stream);
126
127
128/******************************** word set ************************************/
129
130#include "hash.h"
131#include "locallib.h"
132
/* the hash table is sized from SIZE_FACTOR times the number of words
 * (see set_create) */
#define SIZE_FACTOR 2

/* the set is an array of hash_size slots; an empty slot holds NULL */
typedef Word *WordSet;
static WordSet set_of_words = NULL;
static int hash_size = 0;

/* prototypes - set routines */
static void set_create (int num_of_words);
static void set_add (Word word);
static int set_member (Word word);
static void set_print (void);
144
145/* =========================================================================
146 * Function: set_print
147 * Description:
148 * Input:
149 * Output:
150 * ========================================================================= */
151
152static void
153set_print (void)
154{
155 int i = 0;
156
157 for (i = 0; i < hash_size; i++)
158 {
159 Word word = set_of_words[i];
160
161 if (word)
162 {
163 int len = *word++;
164
165 fprintf (stderr, "[%d] = ", i);
166 while (len--)
167 fputc (*word++, stderr);
168 fputc ('\n', stderr);
169 }
170 }
171
172}
173
174
175/* =========================================================================
176 * Function: set_create
177 * Description:
178 * Allocate memory for the set
179 * Input:
180 * Output:
181 * ========================================================================= */
182
183static void
184set_create (int num_of_words)
185{
186 WordSet set;
187
188 hash_size = prime (num_of_words * SIZE_FACTOR);
189 set = (WordSet) malloc (hash_size * sizeof (Word));
190 if (!set)
191 FatalError (1, "Runout of memory for word hashtable");
192 bzero ((char *) set, hash_size * sizeof (Word)); /* [RPAP - Feb 97: WIN32 Port] */
193
194 set_of_words = set;
195
196}
197
198/* =========================================================================
199 * Function: set_add
200 * Description:
201 * Add a string element to the set.
202 * Input:
203 * Output:
204 * ========================================================================= */
205
206static void
207set_add (Word word)
208{
209 int hash_val;
210 int hash_step;
211
212 HASH (hash_val, hash_step, word, hash_size);
213
214 /* loop around in case of collisions and need to step */
215 while (1)
216 {
217
218 Word entry = set_of_words[hash_val];
219
220 /* if doesn't exist then */
221 if (!entry)
222 {
223 set_of_words[hash_val] = word;
224 break;
225 }
226
227 /* if we have a matching word */
228 if (compare (entry, word) == 0)
229 break;
230
231 /* if collides with a different word */
232 hash_val = (hash_val + hash_step) % hash_size;
233
234 }
235}
236
237
238/* =========================================================================
239 * Function: set_member
240 * Description:
241 * Tests whether a string is a member of the set
242 * Input:
243 * Output:
244 * ========================================================================= */
245
246static int
247set_member (Word word)
248{
249 int hash_val;
250 int hash_step;
251
252 HASH (hash_val, hash_step, word, hash_size);
253
254 /* loop around in case of collisions and need to step */
255 while (1)
256 {
257
258 Word entry = set_of_words[hash_val];
259
260 /* if doesn't exist then */
261 if (!entry)
262 return 0;
263
264 /* if we have a matching word */
265 if (compare (entry, word) == 0)
266 return 1;
267
268 /* if collides with a different word */
269 hash_val = (hash_val + hash_step) % hash_size;
270
271 }
272}
273
274/******************************** end of word set ******************************/
275
276
277
278/* =========================================================================
279 * Function: copy_c_word
280 * Description:
281 * Allocate enough memory and copy word over
282 * Input:
283 * w = null terminated string (c_word)
284 * Output:
285 * word with length in 1st byte
286 * ========================================================================= */
287
288static Word
289copy_c_word (char *w)
290{
291 int len = strlen (w);
292 Word w_copy = (Word) malloc (len + 1);
293 Word w_ptr = NULL;
294 int j = 0;
295
296 if (!w_copy)
297 FatalError (1, "Not enough memory to copy a word");
298
299 w_copy[0] = len;
300 w_ptr = w_copy + 1;
301 for (j = 0; j < len; j++)
302 *w_ptr++ = *w++;
303
304 return w_copy;
305}
306
307/* =========================================================================
308 * Function: process_words
309 * Description:
310 * Go through the stemmed words and add to word set
311 * Input:
312 * Output:
313 * ========================================================================= */
314
315static void
316process_words (char **words, int num_of_words)
317{
318 int i = 0;
319
320 set_create (num_of_words);
321
322 for (i = 0; i < num_of_words; i++)
323 {
324 Word word = copy_c_word (words[i]);
325 set_add (word);
326 }
327
328}
329
330/* =========================================================================
331 * Function: get_line
332 * Description:
333 * Equivalent of fgets for u_char*.
334 * But returns length of read-in line.
335 * Expects to see a '\n' before an EOF
336 * Input:
337 * Output:
338 * ========================================================================= */
339
340static int
341get_line (u_char * line, int n, FILE * stream)
342{
343 int i = 0;
344 int ch = '\0';
345
346 while (1)
347 {
348 if (i == n)
349 return i;
350
351 ch = fgetc (stream);
352
353 if (ch == EOF)
354 {
355 if (!feof (stream))
356 FatalError (1, "Error on reading a line from stdin");
357 return EOF;
358 }
359
360 if (ch == '\n')
361 return i;
362
363 *line++ = ch;
364 i++;
365 }
366
367}
368
369/* =========================================================================
370 * Function: print_line
371 * Description:
372 * Input:
373 * Output:
374 * ========================================================================= */
375
376static void
377print_line (u_char * line, int n, FILE * stream)
378{
379
380 while (n--)
381 {
382 fputc (*line++, stream);
383 }
384 fputc ('\n', stream);
385
386}
387
388/* =========================================================================
389 * Function: process_text
390 * Description:
391 * Go through the text from input_file and highlight to output_file
392 * Input:
393 * Output:
394 * ========================================================================= */
395
396static void
397process_text (FILE * input_file, FILE * output_file)
398{
399 static u_char line_buffer[MAX_LINE_BUFFER];
400
401
402 while (1)
403 {
404 int len = get_line (line_buffer, MAX_LINE_BUFFER, input_file);
405
406 if (len == EOF)
407 break;
408 process_buffer (line_buffer, len, output_file);
409
410 }
411}
412
413/* =========================================================================
414 * Function: copy_word
415 * Description:
416 * Copies w2 into w1. Assumes both have storage allocated.
417 * Input:
418 * Output:
419 * ========================================================================= */
420
421static void
422copy_word (u_char * w1, u_char * w2)
423{
424 int i;
425 int len = w2[0];
426
427 for (i = 0; i <= len; i++)
428 *w1++ = *w2++;
429
430}
431
432/* =========================================================================
433 * Function: process_buffer
434 * Description:
435 * Parse & stem words of line buffer
436 * Based on the usage of PARSEing in other mg files.
437 * Input:
438 * Output:
439 * ========================================================================= */
440
441static void
442process_buffer (u_char * s_in, int len, FILE * output_file)
443{
444 u_char *end = s_in + len - 1;
445 u_char *s_start = NULL;
446
447 if (!INAWORD (*s_in))
448 {
449 s_start = s_in;
450 PARSE_NON_STEM_WORD (s_in, end);
451 output_word (s_start, s_in - 1, output_file);
452 }
453
454 while (s_in <= end)
455 {
456 u_char word[MAXSTEMLEN + 1];
457
458 s_start = s_in;
459 PARSE_STEM_WORD (word, s_in, end);
460
461 stemmer (stem_method, stemmer_num, word);
462
463 if (set_member (word)) /* output with highlighting */
464 {
465 output_hilite_word (s_start, s_start + word[0] - 1, output_file);
466 s_start += word[0]; /* step over hilited output */
467 }
468 output_word (s_start, s_in - 1, output_file);
469
470 s_start = s_in;
471 PARSE_NON_STEM_WORD (s_in, end);
472 output_word (s_start, s_in - 1, output_file);
473
474 } /*while */
475
476 fputc ('\n', output_file);
477 fflush (output_file);
478
479} /*process_buffer */
480
481/* =========================================================================
482 * Function: output_word
483 * Description:
484 * Output a word which lies from s_start to s_finish in buffer
485 * Input:
486 * s_start = ptr to 1st char
487 * s_finish = ptr to last char
488 * Output:
489 * ========================================================================= */
490
491static void
492output_word (u_char * s_start, u_char * s_finish, FILE * output_file)
493{
494 while (s_start <= s_finish)
495 {
496 fputc (*s_start++, output_file);
497 }
498}
499
500
501/* =========================================================================
502 * Function: output_hilite_word
503 * Description:
504 * Highlight a word (with length in 1st byte)
505 * Pager highlighting:
506 * Highlighting is either by bolding or underlining using
507 * the method used by UNIX utilities More(1) and Less(1)
508 * HTML highlighting:
509 * use the appropriate start and end tags around the word
510 * ========================================================================= */
511
512static void
513output_hilite_word (u_char * s_start, u_char * s_finish, FILE * output_file)
514{
515
516 if (output_type == HTML)
517 {
518 char *hilite_tag = hilite_tags[hilite_style];
519
520 /* print start tag */
521 fprintf (output_file, "<%s>", hilite_tag);
522
523 output_word (s_start, s_finish, output_file);
524
525 /* print end tag */
526 fprintf (output_file, "</%s>", hilite_tag);
527 }
528
529 else
530 /* PAGER */
531 {
532 /* use backspaces around each letter */
533 while (s_start <= s_finish)
534 {
535 switch (hilite_style)
536 {
537 case BOLD:
538 fputc (*s_start, output_file);
539 fputc ('\b', output_file);
540 fputc (*s_start, output_file);
541 break;
542 case UNDERLINE:
543 fputc ('_', output_file);
544 fputc ('\b', output_file);
545 fputc (*s_start, output_file);
546 break;
547 default:
548 fputc (*s_start, output_file);
549 }
550 s_start++;
551 } /*while */
552 }
553}
554
555/* =========================================================================
556 * Function: process_args
557 * Description:
558 * sets the global variables:
559 * hilite_style, pager, num_of_words, word_list
560 * Input:
561 * Output:
562 * ========================================================================= */
563
564struct option long_opts[] =
565{
566 {"style", required_argument, 0, 's'},
567 {"terminator", required_argument, 0, 't'},
568 {"pager", required_argument, 0, 'p'},
569 {"stem_method", required_argument, 0, 'm'},
570 {"stemmer", required_argument, 0, 'a'},
571 {0, 0, 0, 0}
572};
573
574static void
575process_args (int argc, char *argv[])
576{
577 int ch;
578
579
580 opterr = 0;
581 while ((ch = getopt_long (argc, argv, "s:p:t:m:a:", long_opts, (int *) 0)) != -1)
582 {
583 switch (ch)
584 {
585 case 's':
586 {
587 int i;
588 for (i = 0; i < HILITE_MAX; i++)
589 if (strcmp (optarg, hilite_names[i]) == 0)
590 break;
591
592 if (i < HILITE_MAX)
593 hilite_style = i;
594 }
595 break;
596 case 'a':
597 stemmer_num = stemmernumber (optarg);
598 break;
599 case 't':
600 terminator = optarg;
601 break;
602 case 'm':
603 stem_method = atoi (optarg);
604 break;
605 case 'p':
606 pager = optarg;
607 break;
608 default:
609 FatalError (1, "Usage: \n"
610 "mg_hilite_words --stem_method [0-3]\n"
611 " --stemmer [english|lovin|french|simplefrench]\n"
612 " --style [bold|underline|italic|emphasis|strong]\n"
613 " --pager [less|more|html|???]\n");
614 }
615 }
616
617 num_of_words = argc - optind;
618
619 word_list = &argv[optind];
620
621 /* fix up output type */
622 if (strcmp (pager, HTML_OUT) == 0)
623 output_type = HTML;
624 else
625 output_type = PAGER;
626
627}
628
629/* =========================================================================
630 * Function: main
631 * Description:
632 * Input:
633 * Output:
634 * ========================================================================= */
635
636int
637main (int argc, char *argv[])
638{
639 FILE *output = NULL;
640
641 process_args (argc, argv);
642
643 /* set output file */
644 if (output_type == PAGER)
645/* [RPAP - Feb 97: WIN32 Port] */
646#ifdef __WIN32__
647 output = _popen (pager, "w");
648#else
649 output = popen (pager, "w");
650#endif
651 else
652 output = stdout;
653
654 if (!output)
655 FatalError (1, "Unable to run \"%s\"\n", pager);
656
657 if (num_of_words < 1)
658 {
659 int ch;
660
661 /* just echo the input */
662 /* better not to call this program at all ;-) */
663 while ((ch = fgetc (stdin)) != EOF)
664 {
665 fputc (ch, output);
666 }
667 }
668 else
669 {
670 /* set up hash table for words */
671 process_words (word_list, num_of_words);
672
673
674 /* Go thru lines of text from stdin and
675 * output words with hilite info if
676 * words parse into existence in the hash table
677 */
678 process_text (stdin, output);
679 }
680
681 if (terminator)
682 fprintf (output, "%s\n", terminator);
683
684 if (output != stdout)
685/* [RPAP - Feb 97: WIN32 Port] */
686#ifdef __WIN32__
687 _pclose (output);
688#else
689 pclose (output);
690#endif
691
692 return 0;
693}
Note: See TracBrowser for help on using the repository browser.