source: trunk/gsdl/packages/mg-1.3d/src/text/mg_hilite_words.c@ 29

Last change on this file since 29 was 29, checked in by rjmcnab, 25 years ago

Incorporated the french stemmer better.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.0 KB
Line 
1/**************************************************************************
2 *
3 * mg_hilite_words -- display text and highlight particular words which
4 * it contains
5 * Copyright (C) 1994 Tim Shimmin
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * $Id: mg_hilite_words.c 29 1998-11-24 06:37:53Z rjmcnab $
22 *
23 **************************************************************************/
24
/* Version-control identification string for this file. */
static char *RCSID =
  "$Id: mg_hilite_words.c 29 1998-11-24 06:37:53Z rjmcnab $";
26
27#include "sysfuncs.h"
28
29#include "getopt.h"
30#include "messages.h"
31#include "local_strings.h"
32#include "stemmer.h"
33#include "words.h"
34
35
36/*
37 * Description
38 * -----------
39 * Hilite_words reads text from stdin and outputs it into a pager such as
40 * less. Its command arguments include a list of stemmed words
41 * which if extracted from the text will be highlighted on the output.
42 *
43 * Implementation
44 * --------------
45 * Extracting of words - using PARSE_STEM_WORD & stemmer
46 *
47 * Highlighting of words - for standard pager:
48 * using back-space character code
49 * e.g. bolded a = "a\ba", underlined a = "_\ba"
50 * - for pager==html:
51 * use some of the standard html character
52 * formatting tags
53 *
54 * Storage of words - using hashtable (set) of size equalling a constant
55 * times the number of words
56 *
57 * Usage
58 * -----
59 * mg_hilite_words --stem_method [0-3]
60 * --style [bold|underline|italic|emphasis|strong]
61 * --pager [less|more|html|???]
62 * --terminator [terminator-string]
63 * list-of-words-to-highlight
64 */
65
66/*
67 * Modifications:
68 *
69 * 21/Apr/95: To handle outputting html tags to stdout
70 *
71 */
72
/* --- constants --- */

/*
 * Highlighting styles.  The values are used as indexes into the
 * hilite_names[] and hilite_tags[] tables, so the numbering here and
 * the order of those tables must stay in step.
 */
enum
{
  BOLD = 0,
  UNDERLINE = 1,
  ITALIC = 2,
  EMPHASIS = 3,
  STRONG = 4,
  HILITE_MAX = 5                /* the number of styles */
};

/* maximum length of line buffer */
#define MAX_LINE_BUFFER 200

/* set pager to this to get html hilited text to stdout */
#define HTML_OUT "html"

/* output types: pipe into a pager, or write html to stdout */
enum
{
  PAGER = 0,
  HTML = 1
};
92
93/* --- types --- */
94
95typedef u_char *Word;
96
97/* --- globals --- */
98
99/* keep in synch with constants */
100static char *hilite_names[] =
101{"bold", "underline", "italic", "emphasis", "strong"};
102static char *hilite_tags[] =
103{"B", "", "I", "EM", "STRONG"};
104static short hilite_style = BOLD;
105static char *pager = "less";
106static int stem_method = 3; /* fold & stem */
107static char **word_list;
108static int num_of_words = 0;
109static int output_type = PAGER;
110static char *terminator = NULL;
111
112/* --- prototypes --- */
113
114static Word copy_c_word (char *w);
115static void process_words (char **words, int num_of_words);
116static int get_line (u_char * line, int n, FILE * stream);
117static void process_text (FILE * input_file, FILE * output_file);
118static void copy_word (u_char * w1, u_char * w2);
119static void process_buffer (u_char * s_in, int len, FILE * output_file);
120static void output_word (u_char * s_start, u_char * s_finish, FILE * output_file);
121static void output_hilite_word (u_char * s_start, u_char * s_finish,
122 FILE * output_file);
123static void process_args (int argc, char *argv[]);
124static void print_line (u_char * line, int n, FILE * stream);
125
126
127/******************************** word set ************************************/
128
129#include "hash.h"
130#include "locallib.h"
131
132#define SIZE_FACTOR 2
133
134typedef Word *WordSet;
135static WordSet set_of_words = NULL;
136static int hash_size = 0;
137
138/* prototypes - set routines */
139static void set_create (int num_of_words);
140static void set_add (Word word);
141static int set_member (Word word);
142static void set_print (void);
143
144/* =========================================================================
145 * Function: set_print
146 * Description:
147 * Input:
148 * Output:
149 * ========================================================================= */
150
151static void
152set_print (void)
153{
154 int i = 0;
155
156 for (i = 0; i < hash_size; i++)
157 {
158 Word word = set_of_words[i];
159
160 if (word)
161 {
162 int len = *word++;
163
164 fprintf (stderr, "[%d] = ", i);
165 while (len--)
166 fputc (*word++, stderr);
167 fputc ('\n', stderr);
168 }
169 }
170
171}
172
173
174/* =========================================================================
175 * Function: set_create
176 * Description:
177 * Allocate memory for the set
178 * Input:
179 * Output:
180 * ========================================================================= */
181
182static void
183set_create (int num_of_words)
184{
185 WordSet set;
186
187 hash_size = prime (num_of_words * SIZE_FACTOR);
188 set = (WordSet) malloc (hash_size * sizeof (Word));
189 if (!set)
190 FatalError (1, "Runout of memory for word hashtable");
191 bzero ((char *) set, hash_size * sizeof (Word)); /* [RPAP - Feb 97: WIN32 Port] */
192
193 set_of_words = set;
194
195}
196
197/* =========================================================================
198 * Function: set_add
199 * Description:
200 * Add a string element to the set.
201 * Input:
202 * Output:
203 * ========================================================================= */
204
205static void
206set_add (Word word)
207{
208 int hash_val;
209 int hash_step;
210
211 HASH (hash_val, hash_step, word, hash_size);
212
213 /* loop around in case of collisions and need to step */
214 while (1)
215 {
216
217 Word entry = set_of_words[hash_val];
218
219 /* if doesn't exist then */
220 if (!entry)
221 {
222 set_of_words[hash_val] = word;
223 break;
224 }
225
226 /* if we have a matching word */
227 if (compare (entry, word) == 0)
228 break;
229
230 /* if collides with a different word */
231 hash_val = (hash_val + hash_step) % hash_size;
232
233 }
234}
235
236
237/* =========================================================================
238 * Function: set_member
239 * Description:
240 * Tests whether a string is a member of the set
241 * Input:
242 * Output:
243 * ========================================================================= */
244
245static int
246set_member (Word word)
247{
248 int hash_val;
249 int hash_step;
250
251 HASH (hash_val, hash_step, word, hash_size);
252
253 /* loop around in case of collisions and need to step */
254 while (1)
255 {
256
257 Word entry = set_of_words[hash_val];
258
259 /* if doesn't exist then */
260 if (!entry)
261 return 0;
262
263 /* if we have a matching word */
264 if (compare (entry, word) == 0)
265 return 1;
266
267 /* if collides with a different word */
268 hash_val = (hash_val + hash_step) % hash_size;
269
270 }
271}
272
273/******************************** end of word set ******************************/
274
275
276
277/* =========================================================================
278 * Function: copy_c_word
279 * Description:
280 * Allocate enough memory and copy word over
281 * Input:
282 * w = null terminated string (c_word)
283 * Output:
284 * word with length in 1st byte
285 * ========================================================================= */
286
287static Word
288copy_c_word (char *w)
289{
290 int len = strlen (w);
291 Word w_copy = (Word) malloc (len + 1);
292 Word w_ptr = NULL;
293 int j = 0;
294
295 if (!w_copy)
296 FatalError (1, "Not enough memory to copy a word");
297
298 w_copy[0] = len;
299 w_ptr = w_copy + 1;
300 for (j = 0; j < len; j++)
301 *w_ptr++ = *w++;
302
303 return w_copy;
304}
305
306/* =========================================================================
307 * Function: process_words
308 * Description:
309 * Go through the stemmed words and add to word set
310 * Input:
311 * Output:
312 * ========================================================================= */
313
314static void
315process_words (char **words, int num_of_words)
316{
317 int i = 0;
318
319 set_create (num_of_words);
320
321 for (i = 0; i < num_of_words; i++)
322 {
323 Word word = copy_c_word (words[i]);
324 set_add (word);
325 }
326
327}
328
329/* =========================================================================
330 * Function: get_line
331 * Description:
332 * Equivalent of fgets for u_char*.
333 * But returns length of read-in line.
334 * Expects to see a '\n' before an EOF
335 * Input:
336 * Output:
337 * ========================================================================= */
338
339static int
340get_line (u_char * line, int n, FILE * stream)
341{
342 int i = 0;
343 int ch = '\0';
344
345 while (1)
346 {
347 if (i == n)
348 return i;
349
350 ch = fgetc (stream);
351
352 if (ch == EOF)
353 {
354 if (!feof (stream))
355 FatalError (1, "Error on reading a line from stdin");
356 return EOF;
357 }
358
359 if (ch == '\n')
360 return i;
361
362 *line++ = ch;
363 i++;
364 }
365
366}
367
368/* =========================================================================
369 * Function: print_line
370 * Description:
371 * Input:
372 * Output:
373 * ========================================================================= */
374
375static void
376print_line (u_char * line, int n, FILE * stream)
377{
378
379 while (n--)
380 {
381 fputc (*line++, stream);
382 }
383 fputc ('\n', stream);
384
385}
386
387/* =========================================================================
388 * Function: process_text
389 * Description:
390 * Go through the text from input_file and highlight to output_file
391 * Input:
392 * Output:
393 * ========================================================================= */
394
395static void
396process_text (FILE * input_file, FILE * output_file)
397{
398 static u_char line_buffer[MAX_LINE_BUFFER];
399
400
401 while (1)
402 {
403 int len = get_line (line_buffer, MAX_LINE_BUFFER, input_file);
404
405 if (len == EOF)
406 break;
407 process_buffer (line_buffer, len, output_file);
408
409 }
410}
411
412/* =========================================================================
413 * Function: copy_word
414 * Description:
415 * Copies w2 into w1. Assumes both have storage allocated.
416 * Input:
417 * Output:
418 * ========================================================================= */
419
420static void
421copy_word (u_char * w1, u_char * w2)
422{
423 int i;
424 int len = w2[0];
425
426 for (i = 0; i <= len; i++)
427 *w1++ = *w2++;
428
429}
430
431/* =========================================================================
432 * Function: process_buffer
433 * Description:
434 * Parse & stem words of line buffer
435 * Based on the usage of PARSEing in other mg files.
436 * Input:
437 * Output:
438 * ========================================================================= */
439
440static void
441process_buffer (u_char * s_in, int len, FILE * output_file)
442{
443 u_char *end = s_in + len - 1;
444 u_char *s_start = NULL;
445
446 if (!INAWORD (*s_in))
447 {
448 s_start = s_in;
449 PARSE_NON_STEM_WORD (s_in, end);
450 output_word (s_start, s_in - 1, output_file);
451 }
452
453 while (s_in <= end)
454 {
455 u_char word[MAXSTEMLEN + 1];
456
457 s_start = s_in;
458 PARSE_STEM_WORD (word, s_in, end);
459
460 stemmer (stem_method, word);
461
462 if (set_member (word)) /* output with highlighting */
463 {
464 output_hilite_word (s_start, s_start + word[0] - 1, output_file);
465 s_start += word[0]; /* step over hilited output */
466 }
467 output_word (s_start, s_in - 1, output_file);
468
469 s_start = s_in;
470 PARSE_NON_STEM_WORD (s_in, end);
471 output_word (s_start, s_in - 1, output_file);
472
473 } /*while */
474
475 fputc ('\n', output_file);
476 fflush (output_file);
477
478} /*process_buffer */
479
480/* =========================================================================
481 * Function: output_word
482 * Description:
483 * Output a word which lies from s_start to s_finish in buffer
484 * Input:
485 * s_start = ptr to 1st char
486 * s_finish = ptr to last char
487 * Output:
488 * ========================================================================= */
489
490static void
491output_word (u_char * s_start, u_char * s_finish, FILE * output_file)
492{
493 while (s_start <= s_finish)
494 {
495 fputc (*s_start++, output_file);
496 }
497}
498
499
500/* =========================================================================
501 * Function: output_hilite_word
502 * Description:
503 * Highlight a word (with length in 1st byte)
504 * Pager highlighting:
505 * Highlighting is either by bolding or underlining using
506 * the method used by UNIX utilities More(1) and Less(1)
507 * HTML highlighting:
508 * use the appropriate start and end tags around the word
509 * ========================================================================= */
510
511static void
512output_hilite_word (u_char * s_start, u_char * s_finish, FILE * output_file)
513{
514
515 if (output_type == HTML)
516 {
517 char *hilite_tag = hilite_tags[hilite_style];
518
519 /* print start tag */
520 fprintf (output_file, "<%s>", hilite_tag);
521
522 output_word (s_start, s_finish, output_file);
523
524 /* print end tag */
525 fprintf (output_file, "</%s>", hilite_tag);
526 }
527
528 else
529 /* PAGER */
530 {
531 /* use backspaces around each letter */
532 while (s_start <= s_finish)
533 {
534 switch (hilite_style)
535 {
536 case BOLD:
537 fputc (*s_start, output_file);
538 fputc ('\b', output_file);
539 fputc (*s_start, output_file);
540 break;
541 case UNDERLINE:
542 fputc ('_', output_file);
543 fputc ('\b', output_file);
544 fputc (*s_start, output_file);
545 break;
546 default:
547 fputc (*s_start, output_file);
548 }
549 s_start++;
550 } /*while */
551 }
552}
553
554/* =========================================================================
555 * Function: process_args
556 * Description:
557 * sets the global variables:
558 * hilite_style, pager, num_of_words, word_list
559 * Input:
560 * Output:
561 * ========================================================================= */
562
563struct option long_opts[] =
564{
565 {"style", required_argument, 0, 's'},
566 {"terminator", required_argument, 0, 't'},
567 {"pager", required_argument, 0, 'p'},
568 {"stem_method", required_argument, 0, 'm'},
569 {0, 0, 0, 0}
570};
571
572static void
573process_args (int argc, char *argv[])
574{
575 int ch;
576
577
578 opterr = 0;
579 while ((ch = getopt_long (argc, argv, "s:p:t:m:", long_opts, (int *) 0)) != -1)
580 {
581 switch (ch)
582 {
583 case 's':
584 {
585 int i;
586 for (i = 0; i < HILITE_MAX; i++)
587 if (strcmp (optarg, hilite_names[i]) == 0)
588 break;
589
590 if (i < HILITE_MAX)
591 hilite_style = i;
592 }
593 break;
594 case 't':
595 terminator = optarg;
596 break;
597 case 'm':
598 stem_method = atoi (optarg);
599 break;
600 case 'p':
601 pager = optarg;
602 break;
603 default:
604 FatalError (1, "Usage: \n"
605 "mg_hilite_words --stem_method [0-3]\n"
606 " --style [bold|underline|italic|emphasis|strong]\n"
607 " --pager [less|more|html|???]\n");
608 }
609 }
610
611 num_of_words = argc - optind;
612
613 word_list = &argv[optind];
614
615 /* fix up output type */
616 if (strcmp (pager, HTML_OUT) == 0)
617 output_type = HTML;
618 else
619 output_type = PAGER;
620
621}
622
623/* =========================================================================
624 * Function: main
625 * Description:
626 * Input:
627 * Output:
628 * ========================================================================= */
629
630int
631main (int argc, char *argv[])
632{
633 FILE *output = NULL;
634
635 process_args (argc, argv);
636
637 /* set output file */
638 if (output_type == PAGER)
639/* [RPAP - Feb 97: WIN32 Port] */
640#ifdef __WIN32__
641 output = _popen (pager, "w");
642#else
643 output = popen (pager, "w");
644#endif
645 else
646 output = stdout;
647
648 if (!output)
649 FatalError (1, "Unable to run \"%s\"\n", pager);
650
651 if (num_of_words < 1)
652 {
653 int ch;
654
655 /* just echo the input */
656 /* better not to call this program at all ;-) */
657 while ((ch = fgetc (stdin)) != EOF)
658 {
659 fputc (ch, output);
660 }
661 }
662 else
663 {
664 /* set up hash table for words */
665 process_words (word_list, num_of_words);
666
667
668 /* Go thru lines of text from stdin and
669 * output words with hilite info if
670 * words parse into existence in the hash table
671 */
672 process_text (stdin, output);
673 }
674
675 if (terminator)
676 fprintf (output, "%s\n", terminator);
677
678 if (output != stdout)
679/* [RPAP - Feb 97: WIN32 Port] */
680#ifdef __WIN32__
681 _pclose (output);
682#else
683 pclose (output);
684#endif
685
686 return 0;
687}
Note: See TracBrowser for help on using the repository browser.