source: gsdl/trunk/trunk/mg/src/text/mg_hilite_words.c@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago

Undoing change commited in r16582

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.2 KB
Line 
1/**************************************************************************
2 *
3 * mg_hilite_words -- display text and highlight particular words which
4 * it contains
5 * Copyright (C) 1994 Tim Shimmin
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * $Id: mg_hilite_words.c 16583 2008-07-29 10:20:36Z davidb $
22 *
23 **************************************************************************/
24
/* Revision identification string; the text between the dollar signs is
   the expansion of the version-control "Id" keyword. */
static char *RCSID =
  "$Id: mg_hilite_words.c 16583 2008-07-29 10:20:36Z davidb $";
26
#include "sysfuncs.h"

#include <limits.h>
#include <stdlib.h>

#include "getopt.h"
#include "messages.h"
#include "local_strings.h"
#include "stemmer.h"
#include "words.h"
34
35
36/*
37 * Description
38 * -----------
39 * Hilite_words reads text from stdin and writes it to a pager such as
40 * less. Its command-line arguments include a list of stemmed words;
41 * any of these words found in the text is highlighted in the output.
42 *
43 * Implementation
44 * --------------
45 * Extracting of words - using PARSE_STEM_WORD & stemmer
46 *
47 * Highlighting of words - for standard pager:
48 * using back-space character code
49 * e.g. bolded a = "a\ba", underlined a = "_\ba"
50 * - for pager==html:
51 * use some of the standard html character
52 * formatting tags
53 *
54 * Storage of words - using hashtable (set) of size equalling a constant
55 * times the number of words
56 *
57 * Usage
58 * -----
59 * mg_hilite_words --stem_method [0-3]
60 * --style [bold|underline|italic|emphasis|strong]
61 * --pager [less|more|html|???]
62 * --terminator [terminator-string]
63 * list-of-words-to-highlight
64 */
65
66/*
67 * Modifications:
68 *
69 * 21/Apr/95: To handle outputting html tags to stdout
70 *
71 */
72
73/* --- constants --- */
74
/* Highlighting styles.  Each value is also the index of the matching
   entry in the hilite_names[] and hilite_tags[] tables. */
#define BOLD        0
#define UNDERLINE   1
#define ITALIC      2
#define EMPHASIS    3
#define STRONG      4
#define HILITE_MAX  5		/* the number of styles */

/* Length of the line buffer, in bytes. */
#define MAX_LINE_BUFFER 200

/* Give this as the pager name to get HTML-highlighted text on stdout. */
#define HTML_OUT "html"

/* Output types. */
#define PAGER 0
#define HTML  1
92
93/* --- types --- */
94
95typedef u_char *Word;
96
97/* --- globals --- */
98
99/* keep in synch with constants */
100static char *hilite_names[] =
101{"bold", "underline", "italic", "emphasis", "strong"};
102static char *hilite_tags[] =
103{"B", "", "I", "EM", "STRONG"};
104static short hilite_style = BOLD;
105static char *pager = "less";
106static int stemmer_num = 0; /* Lovin's stemmer */
107static int stem_method = 3; /* fold & stem */
108static char **word_list;
109static int num_of_words = 0;
110static int output_type = PAGER;
111static char *terminator = NULL;
112
113/* --- prototypes --- */
114
115static Word copy_c_word (char *w);
116static void process_words (char **words, int num_of_words);
117static int get_line (u_char * line, int n, FILE * stream);
118static void process_text (FILE * input_file, FILE * output_file);
119static void copy_word (u_char * w1, u_char * w2);
120static void process_buffer (u_char * s_in, int len, FILE * output_file);
121static void output_word (u_char * s_start, u_char * s_finish, FILE * output_file);
122static void output_hilite_word (u_char * s_start, u_char * s_finish,
123 FILE * output_file);
124static void process_args (int argc, char *argv[]);
125static void print_line (u_char * line, int n, FILE * stream);
126
127
128/******************************** word set ************************************/
129
130#include "hash.h"
131#include "locallib.h"
132
133#define SIZE_FACTOR 2
134
135typedef Word *WordSet;
136static WordSet set_of_words = NULL;
137static int hash_size = 0;
138
139/* prototypes - set routines */
140static void set_create (int num_of_words);
141static void set_add (Word word);
142static int set_member (Word word);
143static void set_print (void);
144
145/* =========================================================================
146 * Function: set_print
147 * Description:
148 * Input:
149 * Output:
150 * ========================================================================= */
151
152static void
153set_print (void)
154{
155 int i = 0;
156
157 for (i = 0; i < hash_size; i++)
158 {
159 Word word = set_of_words[i];
160
161 if (word)
162 {
163 int len = *word++;
164
165 fprintf (stderr, "[%d] = ", i);
166 while (len--)
167 fputc (*word++, stderr);
168 fputc ('\n', stderr);
169 }
170 }
171
172}
173
174
175/* =========================================================================
176 * Function: set_create
177 * Description:
178 * Allocate memory for the set
179 * Input:
180 * Output:
181 * ========================================================================= */
182
183static void
184set_create (int num_of_words)
185{
186 WordSet set;
187
188 hash_size = prime (num_of_words * SIZE_FACTOR);
189 set = (WordSet) malloc (hash_size * sizeof (Word));
190 if (!set)
191 FatalError (1, "Runout of memory for word hashtable");
192 bzero ((char *) set, hash_size * sizeof (Word)); /* [RPAP - Feb 97: WIN32 Port] */
193
194 set_of_words = set;
195
196}
197
198/* =========================================================================
199 * Function: set_add
200 * Description:
201 * Add a string element to the set.
202 * Input:
203 * Output:
204 * ========================================================================= */
205
206static void
207set_add (Word word)
208{
209 int hash_val;
210 int hash_step;
211
212 HASH (hash_val, hash_step, word, hash_size);
213
214 /* loop around in case of collisions and need to step */
215 while (1)
216 {
217
218 Word entry = set_of_words[hash_val];
219
220 /* if doesn't exist then */
221 if (!entry)
222 {
223 set_of_words[hash_val] = word;
224 break;
225 }
226
227 /* if we have a matching word */
228 if (compare (entry, word) == 0)
229 break;
230
231 /* if collides with a different word */
232 hash_val = (hash_val + hash_step) % hash_size;
233
234 }
235}
236
237
238/* =========================================================================
239 * Function: set_member
240 * Description:
241 * Tests whether a string is a member of the set
242 * Input:
243 * Output:
244 * ========================================================================= */
245
246static int
247set_member (Word word)
248{
249 int hash_val;
250 int hash_step;
251
252 HASH (hash_val, hash_step, word, hash_size);
253
254 /* loop around in case of collisions and need to step */
255 while (1)
256 {
257
258 Word entry = set_of_words[hash_val];
259
260 /* if doesn't exist then */
261 if (!entry)
262 return 0;
263
264 /* if we have a matching word */
265 if (compare (entry, word) == 0)
266 return 1;
267
268 /* if collides with a different word */
269 hash_val = (hash_val + hash_step) % hash_size;
270
271 }
272}
273
274/******************************** end of word set ******************************/
275
276
277
278/* =========================================================================
279 * Function: copy_c_word
280 * Description:
281 * Allocate enough memory and copy word over
282 * Input:
283 * w = null terminated string (c_word)
284 * Output:
285 * word with length in 1st byte
286 * ========================================================================= */
287
288static Word
289copy_c_word (char *w)
290{
291 int len = strlen (w);
292 Word w_copy = (Word) malloc (len + 1);
293 Word w_ptr = NULL;
294 int j = 0;
295
296 if (!w_copy)
297 FatalError (1, "Not enough memory to copy a word");
298
299 w_copy[0] = len;
300 w_ptr = w_copy + 1;
301 for (j = 0; j < len; j++)
302 *w_ptr++ = *w++;
303
304 return w_copy;
305}
306
/* =========================================================================
 * Function: process_words
 * Description:
 *      Create the word set and add a counted copy of each of the given
 *      (stemmed) words to it.
 * Input:
 *      words        = array of null terminated strings
 *      num_of_words = number of entries in words
 * Output:
 *      none (fills the word set)
 * ========================================================================= */

static void
process_words (char **words, int num_of_words)
{
  int remaining = num_of_words;

  set_create (num_of_words);

  while (remaining-- > 0)
    set_add (copy_c_word (*words++));
}
329
330/* =========================================================================
331 * Function: get_line
332 * Description:
333 * Equivalent of fgets for u_char*.
334 * But returns length of read-in line.
335 * Expects to see a '\n' before an EOF
336 * Input:
337 * Output:
338 * ========================================================================= */
339
340static int
341get_line (u_char * line, int n, FILE * stream)
342{
343 int i = 0;
344 int ch = '\0';
345
346 while (1)
347 {
348 if (i == n)
349 return i;
350
351 ch = fgetc (stream);
352
353 if (ch == EOF)
354 {
355 if (!feof (stream))
356 FatalError (1, "Error on reading a line from stdin");
357 return EOF;
358 }
359
360 if (ch == '\n')
361 return i;
362
363 *line++ = ch;
364 i++;
365 }
366
367}
368
369/* =========================================================================
370 * Function: print_line
371 * Description:
372 * Input:
373 * Output:
374 * ========================================================================= */
375
376static void
377print_line (u_char * line, int n, FILE * stream)
378{
379
380 while (n--)
381 {
382 fputc (*line++, stream);
383 }
384 fputc ('\n', stream);
385
386}
387
388/* =========================================================================
389 * Function: process_text
390 * Description:
391 * Go through the text from input_file and highlight to output_file
392 * Input:
393 * Output:
394 * ========================================================================= */
395
396static void
397process_text (FILE * input_file, FILE * output_file)
398{
399 static u_char line_buffer[MAX_LINE_BUFFER];
400
401
402 while (1)
403 {
404 int len = get_line (line_buffer, MAX_LINE_BUFFER, input_file);
405
406 if (len == EOF)
407 break;
408 process_buffer (line_buffer, len, output_file);
409
410 }
411}
412
413/* =========================================================================
414 * Function: copy_word
415 * Description:
416 * Copies w2 into w1. Assumes both have storage allocated.
417 * Input:
418 * Output:
419 * ========================================================================= */
420
421static void
422copy_word (u_char * w1, u_char * w2)
423{
424 int i;
425 int len = w2[0];
426
427 for (i = 0; i <= len; i++)
428 *w1++ = *w2++;
429
430}
431
432/* =========================================================================
433 * Function: process_buffer
434 * Description:
435 * Parse & stem words of line buffer
436 * Based on the usage of PARSEing in other mg files.
437 * Input:
438 * Output:
439 * ========================================================================= */
440
441static void
442process_buffer (u_char * s_in, int len, FILE * output_file)
443{
444 u_char *end = s_in + len - 1;
445 u_char *s_start = NULL;
446
447 if (!inaword (s_in, end))
448 {
449 s_start = s_in;
450 PARSE_NON_STEM_WORD (s_in, end);
451 output_word (s_start, s_in - 1, output_file);
452 }
453
454 while (s_in <= end)
455 {
456 u_char word[MAXSTEMLEN + 1];
457
458 s_start = s_in;
459 PARSE_STEM_WORD (word, s_in, end);
460
461 stemmer (stem_method, stemmer_num, word);
462
463 if (set_member (word)) /* output with highlighting */
464 {
465 output_hilite_word (s_start, s_start + word[0] - 1, output_file);
466 s_start += word[0]; /* step over hilited output */
467 }
468 output_word (s_start, s_in - 1, output_file);
469
470 s_start = s_in;
471 PARSE_NON_STEM_WORD (s_in, end);
472 output_word (s_start, s_in - 1, output_file);
473
474 } /*while */
475
476 fputc ('\n', output_file);
477 fflush (output_file);
478
479} /*process_buffer */
480
481/* =========================================================================
482 * Function: output_word
483 * Description:
484 * Output a word which lies from s_start to s_finish in buffer
485 * Input:
486 * s_start = ptr to 1st char
487 * s_finish = ptr to last char
488 * Output:
489 * ========================================================================= */
490
491static void
492output_word (u_char * s_start, u_char * s_finish, FILE * output_file)
493{
494 while (s_start <= s_finish)
495 {
496 fputc (*s_start++, output_file);
497 }
498}
499
500
501/* =========================================================================
502 * Function: output_hilite_word
503 * Description:
504 * Highlight a word (with length in 1st byte)
505 * Pager highlighting:
506 * Highlighting is either by bolding or underlining using
507 * the method used by UNIX utilities More(1) and Less(1)
508 * HTML highlighting:
509 * use the appropriate start and end tags around the word
510 * ========================================================================= */
511
512static void
513output_hilite_word (u_char * s_start, u_char * s_finish, FILE * output_file)
514{
515
516 if (output_type == HTML)
517 {
518 char *hilite_tag = hilite_tags[hilite_style];
519
520 /* print start tag */
521 fprintf (output_file, "<%s>", hilite_tag);
522
523 output_word (s_start, s_finish, output_file);
524
525 /* print end tag */
526 fprintf (output_file, "</%s>", hilite_tag);
527 }
528
529 else
530 /* PAGER */
531 {
532 /* use backspaces around each letter */
533 while (s_start <= s_finish)
534 {
535 switch (hilite_style)
536 {
537 case BOLD:
538 fputc (*s_start, output_file);
539 fputc ('\b', output_file);
540 fputc (*s_start, output_file);
541 break;
542 case UNDERLINE:
543 fputc ('_', output_file);
544 fputc ('\b', output_file);
545 fputc (*s_start, output_file);
546 break;
547 default:
548 fputc (*s_start, output_file);
549 }
550 s_start++;
551 } /*while */
552 }
553}
554
555/* =========================================================================
556 * Function: process_args
557 * Description:
558 * sets the global variables:
559 * hilite_style, pager, num_of_words, word_list
560 * Input:
561 * Output:
562 * ========================================================================= */
563
564struct option long_opts[] =
565{
566 {"style", required_argument, 0, 's'},
567 {"terminator", required_argument, 0, 't'},
568 {"pager", required_argument, 0, 'p'},
569 {"stem_method", required_argument, 0, 'm'},
570 {"stemmer", required_argument, 0, 'a'},
571 {0, 0, 0, 0}
572};
573
574static void
575process_args (int argc, char *argv[])
576{
577 int ch;
578
579
580 opterr = 0;
581 while ((ch = getopt_long (argc, argv, "s:p:t:m:a:", long_opts, (int *) 0)) != -1)
582 {
583 switch (ch)
584 {
585 case 's':
586 {
587 int i;
588 for (i = 0; i < HILITE_MAX; i++)
589 if (strcmp (optarg, hilite_names[i]) == 0)
590 break;
591
592 if (i < HILITE_MAX)
593 hilite_style = i;
594 }
595 break;
596 case 'a':
597 stemmer_num = stemmernumber (optarg);
598 break;
599 case 't':
600 terminator = optarg;
601 break;
602 case 'm':
603 stem_method = atoi (optarg);
604 break;
605 case 'p':
606 pager = optarg;
607 break;
608 default:
609 FatalError (1, "Usage: \n"
610 "mg_hilite_words --stem_method [0-3]\n"
611 " --stemmer [english|lovin|french|simplefrench]\n"
612 " --style [bold|underline|italic|emphasis|strong]\n"
613 " --pager [less|more|html|???]\n");
614 }
615 }
616
617 num_of_words = argc - optind;
618
619 word_list = &argv[optind];
620
621 /* fix up output type */
622 if (strcmp (pager, HTML_OUT) == 0)
623 output_type = HTML;
624 else
625 output_type = PAGER;
626
627}
628
629/* =========================================================================
630 * Function: main
631 * Description:
632 * Input:
633 * Output:
634 * ========================================================================= */
635
636int
637main (int argc, char *argv[])
638{
639 FILE *output = NULL;
640
641 process_args (argc, argv);
642
643 /* set output file */
644 if (output_type == PAGER)
645/* [RPAP - Feb 97: WIN32 Port] */
646#ifdef __WIN32__
647 output = _popen (pager, "w");
648#else
649 output = popen (pager, "w");
650#endif
651 else
652 output = stdout;
653
654 if (!output)
655 FatalError (1, "Unable to run \"%s\"\n", pager);
656
657 if (num_of_words < 1)
658 {
659 int ch;
660
661 /* just echo the input */
662 /* better not to call this program at all ;-) */
663 while ((ch = fgetc (stdin)) != EOF)
664 {
665 fputc (ch, output);
666 }
667 }
668 else
669 {
670 /* set up hash table for words */
671 process_words (word_list, num_of_words);
672
673
674 /* Go thru lines of text from stdin and
675 * output words with hilite info if
676 * words parse into existence in the hash table
677 */
678 process_text (stdin, output);
679 }
680
681 if (terminator)
682 fprintf (output, "%s\n", terminator);
683
684 if (output != stdout)
685/* [RPAP - Feb 97: WIN32 Port] */
686#ifdef __WIN32__
687 _pclose (output);
688#else
689 pclose (output);
690#endif
691
692 return 0;
693}
Note: See TracBrowser for help on using the repository browser.