/**************************************************************************
 *
 * mg_hilite_words -- display text and highlight particular words which
 *                    it contains
 * Copyright (C) 1994 Tim Shimmin
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: mg_hilite_words.c 16583 2008-07-29 10:20:36Z davidb $
 *
 **************************************************************************/

/* version-control identification string for this file */
static char *RCSID = "$Id: mg_hilite_words.c 16583 2008-07-29 10:20:36Z davidb $";

#include "sysfuncs.h"

#include "getopt.h"
#include "messages.h"
#include "local_strings.h"
#include "stemmer.h"
#include "words.h"

/*
 * Description
 * -----------
 * Hilite_words reads text from stdin and outputs it into a pager such as
 * less. Its command arguments include a list of stemmed words
 * which, if extracted from the text, will be highlighted on the output.
 *
 * Implementation
 * --------------
 * Extracting of words - using PARSE_STEM_WORD & stemmer
 *
 * Highlighting of words - for standard pager:
 *                           using back-space character code
 *                           e.g.
bolded a = "a\ba", underlined a = "_\ba" * - for pager==html: * use some of the standard html character * formatting tags * * Storage of words - using hashtable (set) of size equalling a constant * times the number of words * * Usage * ----- * mg_hilite_words --stem_method [0-3] * --style [bold|underline|italic|emphasis|strong] * --pager [less|more|html|???] * --terminator [terminator-string] * list-of-words-to-highlight */ /* * Modifications: * * 21/Apr/95: To handle outputting html tags to stdout * */ /* --- constants --- */ /* highlighting styles */ #define HILITE_MAX 5 /* the number of styles */ #define BOLD 0 #define UNDERLINE 1 #define ITALIC 2 #define EMPHASIS 3 #define STRONG 4 /* maximum length of line buffer */ #define MAX_LINE_BUFFER 200 /* set pager to this to get html hilited text to stdout */ #define HTML_OUT "html" /* output types */ #define PAGER 0 #define HTML 1 /* --- types --- */ typedef u_char *Word; /* --- globals --- */ /* keep in synch with constants */ static char *hilite_names[] = {"bold", "underline", "italic", "emphasis", "strong"}; static char *hilite_tags[] = {"B", "", "I", "EM", "STRONG"}; static short hilite_style = BOLD; static char *pager = "less"; static int stemmer_num = 0; /* Lovin's stemmer */ static int stem_method = 3; /* fold & stem */ static char **word_list; static int num_of_words = 0; static int output_type = PAGER; static char *terminator = NULL; /* --- prototypes --- */ static Word copy_c_word (char *w); static void process_words (char **words, int num_of_words); static int get_line (u_char * line, int n, FILE * stream); static void process_text (FILE * input_file, FILE * output_file); static void copy_word (u_char * w1, u_char * w2); static void process_buffer (u_char * s_in, int len, FILE * output_file); static void output_word (u_char * s_start, u_char * s_finish, FILE * output_file); static void output_hilite_word (u_char * s_start, u_char * s_finish, FILE * output_file); static void process_args (int argc, char 
*argv[]); static void print_line (u_char * line, int n, FILE * stream); /******************************** word set ************************************/ #include "hash.h" #include "locallib.h" #define SIZE_FACTOR 2 typedef Word *WordSet; static WordSet set_of_words = NULL; static int hash_size = 0; /* prototypes - set routines */ static void set_create (int num_of_words); static void set_add (Word word); static int set_member (Word word); static void set_print (void); /* ========================================================================= * Function: set_print * Description: * Input: * Output: * ========================================================================= */ static void set_print (void) { int i = 0; for (i = 0; i < hash_size; i++) { Word word = set_of_words[i]; if (word) { int len = *word++; fprintf (stderr, "[%d] = ", i); while (len--) fputc (*word++, stderr); fputc ('\n', stderr); } } } /* ========================================================================= * Function: set_create * Description: * Allocate memory for the set * Input: * Output: * ========================================================================= */ static void set_create (int num_of_words) { WordSet set; hash_size = prime (num_of_words * SIZE_FACTOR); set = (WordSet) malloc (hash_size * sizeof (Word)); if (!set) FatalError (1, "Runout of memory for word hashtable"); bzero ((char *) set, hash_size * sizeof (Word)); /* [RPAP - Feb 97: WIN32 Port] */ set_of_words = set; } /* ========================================================================= * Function: set_add * Description: * Add a string element to the set. 
* Input: * Output: * ========================================================================= */ static void set_add (Word word) { int hash_val; int hash_step; HASH (hash_val, hash_step, word, hash_size); /* loop around in case of collisions and need to step */ while (1) { Word entry = set_of_words[hash_val]; /* if doesn't exist then */ if (!entry) { set_of_words[hash_val] = word; break; } /* if we have a matching word */ if (compare (entry, word) == 0) break; /* if collides with a different word */ hash_val = (hash_val + hash_step) % hash_size; } } /* ========================================================================= * Function: set_member * Description: * Tests whether a string is a member of the set * Input: * Output: * ========================================================================= */ static int set_member (Word word) { int hash_val; int hash_step; HASH (hash_val, hash_step, word, hash_size); /* loop around in case of collisions and need to step */ while (1) { Word entry = set_of_words[hash_val]; /* if doesn't exist then */ if (!entry) return 0; /* if we have a matching word */ if (compare (entry, word) == 0) return 1; /* if collides with a different word */ hash_val = (hash_val + hash_step) % hash_size; } } /******************************** end of word set ******************************/ /* ========================================================================= * Function: copy_c_word * Description: * Allocate enough memory and copy word over * Input: * w = null terminated string (c_word) * Output: * word with length in 1st byte * ========================================================================= */ static Word copy_c_word (char *w) { int len = strlen (w); Word w_copy = (Word) malloc (len + 1); Word w_ptr = NULL; int j = 0; if (!w_copy) FatalError (1, "Not enough memory to copy a word"); w_copy[0] = len; w_ptr = w_copy + 1; for (j = 0; j < len; j++) *w_ptr++ = *w++; return w_copy; } /* 
========================================================================= * Function: process_words * Description: * Go through the stemmed words and add to word set * Input: * Output: * ========================================================================= */ static void process_words (char **words, int num_of_words) { int i = 0; set_create (num_of_words); for (i = 0; i < num_of_words; i++) { Word word = copy_c_word (words[i]); set_add (word); } } /* ========================================================================= * Function: get_line * Description: * Equivalent of fgets for u_char*. * But returns length of read-in line. * Expects to see a '\n' before an EOF * Input: * Output: * ========================================================================= */ static int get_line (u_char * line, int n, FILE * stream) { int i = 0; int ch = '\0'; while (1) { if (i == n) return i; ch = fgetc (stream); if (ch == EOF) { if (!feof (stream)) FatalError (1, "Error on reading a line from stdin"); return EOF; } if (ch == '\n') return i; *line++ = ch; i++; } } /* ========================================================================= * Function: print_line * Description: * Input: * Output: * ========================================================================= */ static void print_line (u_char * line, int n, FILE * stream) { while (n--) { fputc (*line++, stream); } fputc ('\n', stream); } /* ========================================================================= * Function: process_text * Description: * Go through the text from input_file and highlight to output_file * Input: * Output: * ========================================================================= */ static void process_text (FILE * input_file, FILE * output_file) { static u_char line_buffer[MAX_LINE_BUFFER]; while (1) { int len = get_line (line_buffer, MAX_LINE_BUFFER, input_file); if (len == EOF) break; process_buffer (line_buffer, len, output_file); } } /* 
========================================================================= * Function: copy_word * Description: * Copies w2 into w1. Assumes both have storage allocated. * Input: * Output: * ========================================================================= */ static void copy_word (u_char * w1, u_char * w2) { int i; int len = w2[0]; for (i = 0; i <= len; i++) *w1++ = *w2++; } /* ========================================================================= * Function: process_buffer * Description: * Parse & stem words of line buffer * Based on the usage of PARSEing in other mg files. * Input: * Output: * ========================================================================= */ static void process_buffer (u_char * s_in, int len, FILE * output_file) { u_char *end = s_in + len - 1; u_char *s_start = NULL; if (!inaword (s_in, end)) { s_start = s_in; PARSE_NON_STEM_WORD (s_in, end); output_word (s_start, s_in - 1, output_file); } while (s_in <= end) { u_char word[MAXSTEMLEN + 1]; s_start = s_in; PARSE_STEM_WORD (word, s_in, end); stemmer (stem_method, stemmer_num, word); if (set_member (word)) /* output with highlighting */ { output_hilite_word (s_start, s_start + word[0] - 1, output_file); s_start += word[0]; /* step over hilited output */ } output_word (s_start, s_in - 1, output_file); s_start = s_in; PARSE_NON_STEM_WORD (s_in, end); output_word (s_start, s_in - 1, output_file); } /*while */ fputc ('\n', output_file); fflush (output_file); } /*process_buffer */ /* ========================================================================= * Function: output_word * Description: * Output a word which lies from s_start to s_finish in buffer * Input: * s_start = ptr to 1st char * s_finish = ptr to last char * Output: * ========================================================================= */ static void output_word (u_char * s_start, u_char * s_finish, FILE * output_file) { while (s_start <= s_finish) { fputc (*s_start++, output_file); } } /* 
========================================================================= * Function: output_hilite_word * Description: * Highlight a word (with length in 1st byte) * Pager highlighting: * Highlighting is either by bolding or underlining using * the method used by UNIX utilities More(1) and Less(1) * HTML highlighting: * use the appropriate start and end tags around the word * ========================================================================= */ static void output_hilite_word (u_char * s_start, u_char * s_finish, FILE * output_file) { if (output_type == HTML) { char *hilite_tag = hilite_tags[hilite_style]; /* print start tag */ fprintf (output_file, "<%s>", hilite_tag); output_word (s_start, s_finish, output_file); /* print end tag */ fprintf (output_file, "", hilite_tag); } else /* PAGER */ { /* use backspaces around each letter */ while (s_start <= s_finish) { switch (hilite_style) { case BOLD: fputc (*s_start, output_file); fputc ('\b', output_file); fputc (*s_start, output_file); break; case UNDERLINE: fputc ('_', output_file); fputc ('\b', output_file); fputc (*s_start, output_file); break; default: fputc (*s_start, output_file); } s_start++; } /*while */ } } /* ========================================================================= * Function: process_args * Description: * sets the global variables: * hilite_style, pager, num_of_words, word_list * Input: * Output: * ========================================================================= */ struct option long_opts[] = { {"style", required_argument, 0, 's'}, {"terminator", required_argument, 0, 't'}, {"pager", required_argument, 0, 'p'}, {"stem_method", required_argument, 0, 'm'}, {"stemmer", required_argument, 0, 'a'}, {0, 0, 0, 0} }; static void process_args (int argc, char *argv[]) { int ch; opterr = 0; while ((ch = getopt_long (argc, argv, "s:p:t:m:a:", long_opts, (int *) 0)) != -1) { switch (ch) { case 's': { int i; for (i = 0; i < HILITE_MAX; i++) if (strcmp (optarg, hilite_names[i]) == 0) 
break; if (i < HILITE_MAX) hilite_style = i; } break; case 'a': stemmer_num = stemmernumber (optarg); break; case 't': terminator = optarg; break; case 'm': stem_method = atoi (optarg); break; case 'p': pager = optarg; break; default: FatalError (1, "Usage: \n" "mg_hilite_words --stem_method [0-3]\n" " --stemmer [english|lovin|french|simplefrench]\n" " --style [bold|underline|italic|emphasis|strong]\n" " --pager [less|more|html|???]\n"); } } num_of_words = argc - optind; word_list = &argv[optind]; /* fix up output type */ if (strcmp (pager, HTML_OUT) == 0) output_type = HTML; else output_type = PAGER; } /* ========================================================================= * Function: main * Description: * Input: * Output: * ========================================================================= */ int main (int argc, char *argv[]) { FILE *output = NULL; process_args (argc, argv); /* set output file */ if (output_type == PAGER) /* [RPAP - Feb 97: WIN32 Port] */ #ifdef __WIN32__ output = _popen (pager, "w"); #else output = popen (pager, "w"); #endif else output = stdout; if (!output) FatalError (1, "Unable to run \"%s\"\n", pager); if (num_of_words < 1) { int ch; /* just echo the input */ /* better not to call this program at all ;-) */ while ((ch = fgetc (stdin)) != EOF) { fputc (ch, output); } } else { /* set up hash table for words */ process_words (word_list, num_of_words); /* Go thru lines of text from stdin and * output words with hilite info if * words parse into existence in the hash table */ process_text (stdin, output); } if (terminator) fprintf (output, "%s\n", terminator); if (output != stdout) /* [RPAP - Feb 97: WIN32 Port] */ #ifdef __WIN32__ _pclose (output); #else pclose (output); #endif return 0; }