Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: gsdl/trunk/trunk/mg/src/text/mg_hilite_words.c@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago
Undoing change commited in r16582
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 16.2 KB

Line
1	/**************************************************************************
2	*
3	* mg_hilite_words -- display text and highlight particular words which
4	* it contains
5	* Copyright (C) 1994 Tim Shimmin
6	*
7	* This program is free software; you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation; either version 2 of the License, or
10	* (at your option) any later version.
11	*
12	* This program is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with this program; if not, write to the Free Software
19	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20	*
21	* $Id: mg_hilite_words.c 16583 2008-07-29 10:20:36Z davidb $
22	*
23	**************************************************************************/
24
/* Revision-control identification string kept in the compiled object. */
static char *RCSID =
    "$Id: mg_hilite_words.c 16583 2008-07-29 10:20:36Z davidb $";
26
27	#include "sysfuncs.h"
28
29	#include "getopt.h"
30	#include "messages.h"
31	#include "local_strings.h"
32	#include "stemmer.h"
33	#include "words.h"
34
35
36	/*
37	* Description
38	* -----------
39	* Hilite_words reads text from stdin and outputs it into a pager such as
40	* less. Its command-line arguments include a list of stemmed words;
41	* any of these words found in the text is highlighted in the output.
42	*
43	* Implementation
44	* --------------
45	* Extracting of words - using PARSE_STEM_WORD & stemmer
46	*
47	* Highlighting of words - for standard pager:
48	* using back-space character code
49	* e.g. bolded a = "a\ba", underlined a = "_\ba"
50	* - for pager==html:
51	* use some of the standard html character
52	* formatting tags
53	*
54	* Storage of words - using hashtable (set) of size equalling a constant
55	* times the number of words
56	*
57	* Usage
58	* -----
59	* mg_hilite_words --stem_method [0-3]
60	* --style [bold|underline|italic|emphasis|strong]
61	* --pager [less|more|html|???]
62	* --terminator [terminator-string]
63	* list-of-words-to-highlight
64	*/
65
66	/*
67	* Modifications:
68	*
69	* 21/Apr/95: To handle outputting html tags to stdout
70	*
71	*/
72
73	/* --- constants --- */
74
75	/* highlighting styles */
/*
 * Highlighting styles.  The numeric values are used as indices into the
 * hilite_names[] and hilite_tags[] tables, so keep them contiguous from 0.
 */
#define BOLD            0
#define UNDERLINE       1
#define ITALIC          2
#define EMPHASIS        3
#define STRONG          4
#define HILITE_MAX      5       /* number of styles listed above */

/* capacity, in bytes, of the buffer that one input line is read into */
#define MAX_LINE_BUFFER 200

/* give this as the pager name to get HTML-highlighted text on stdout */
#define HTML_OUT        "html"

/* output types */
#define PAGER           0
#define HTML            1
92
93	/* --- types --- */
94
95	typedef u_char *Word;
96
97	/* --- globals --- */
98
99	/* keep in synch with constants */
100	static char *hilite_names[] =
101	{"bold", "underline", "italic", "emphasis", "strong"};
102	static char *hilite_tags[] =
103	{"B", "", "I", "EM", "STRONG"};
104	static short hilite_style = BOLD;
105	static char *pager = "less";
106	static int stemmer_num = 0; /* Lovin's stemmer */
107	static int stem_method = 3; /* fold & stem */
108	static char **word_list;
109	static int num_of_words = 0;
110	static int output_type = PAGER;
111	static char *terminator = NULL;
112
113	/* --- prototypes --- */
114
115	static Word copy_c_word (char *w);
116	static void process_words (char **words, int num_of_words);
117	static int get_line (u_char * line, int n, FILE * stream);
118	static void process_text (FILE * input_file, FILE * output_file);
119	static void copy_word (u_char * w1, u_char * w2);
120	static void process_buffer (u_char * s_in, int len, FILE * output_file);
121	static void output_word (u_char * s_start, u_char * s_finish, FILE * output_file);
122	static void output_hilite_word (u_char * s_start, u_char * s_finish,
123	FILE * output_file);
124	static void process_args (int argc, char *argv[]);
125	static void print_line (u_char * line, int n, FILE * stream);
126
127
128	/****************************** word set **********************************/
129
130	#include "hash.h"
131	#include "locallib.h"
132
133	#define SIZE_FACTOR 2
134
135	typedef Word *WordSet;
136	static WordSet set_of_words = NULL;
137	static int hash_size = 0;
138
139	/* prototypes - set routines */
140	static void set_create (int num_of_words);
141	static void set_add (Word word);
142	static int set_member (Word word);
143	static void set_print (void);
144
145	/* =========================================================================
146	* Function: set_print
147	* Description:
148	* Input:
149	* Output:
150	* ========================================================================= */
151
152	static void
153	set_print (void)
154	{
155	int i = 0;
156
157	for (i = 0; i < hash_size; i++)
158	{
159	Word word = set_of_words[i];
160
161	if (word)
162	{
163	int len = *word++;
164
165	fprintf (stderr, "[%d] = ", i);
166	while (len--)
167	fputc (*word++, stderr);
168	fputc ('\n', stderr);
169	}
170	}
171
172	}
173
174
175	/* =========================================================================
176	* Function: set_create
177	* Description:
178	* Allocate memory for the set
179	* Input:
180	* Output:
181	* ========================================================================= */
182
183	static void
184	set_create (int num_of_words)
185	{
186	WordSet set;
187
188	hash_size = prime (num_of_words * SIZE_FACTOR);
189	set = (WordSet) malloc (hash_size * sizeof (Word));
190	if (!set)
191	FatalError (1, "Runout of memory for word hashtable");
192	bzero ((char ) set, hash_size sizeof (Word)); /* [RPAP - Feb 97: WIN32 Port] */
193
194	set_of_words = set;
195
196	}
197
198	/* =========================================================================
199	* Function: set_add
200	* Description:
201	* Add a string element to the set.
202	* Input:
203	* Output:
204	* ========================================================================= */
205
206	static void
207	set_add (Word word)
208	{
209	int hash_val;
210	int hash_step;
211
212	HASH (hash_val, hash_step, word, hash_size);
213
214	/* loop around in case of collisions and need to step */
215	while (1)
216	{
217
218	Word entry = set_of_words[hash_val];
219
220	/* if doesn't exist then */
221	if (!entry)
222	{
223	set_of_words[hash_val] = word;
224	break;
225	}
226
227	/* if we have a matching word */
228	if (compare (entry, word) == 0)
229	break;
230
231	/* if collides with a different word */
232	hash_val = (hash_val + hash_step) % hash_size;
233
234	}
235	}
236
237
238	/* =========================================================================
239	* Function: set_member
240	* Description:
241	* Tests whether a string is a member of the set
242	* Input:
243	* Output:
244	* ========================================================================= */
245
246	static int
247	set_member (Word word)
248	{
249	int hash_val;
250	int hash_step;
251
252	HASH (hash_val, hash_step, word, hash_size);
253
254	/* loop around in case of collisions and need to step */
255	while (1)
256	{
257
258	Word entry = set_of_words[hash_val];
259
260	/* if doesn't exist then */
261	if (!entry)
262	return 0;
263
264	/* if we have a matching word */
265	if (compare (entry, word) == 0)
266	return 1;
267
268	/* if collides with a different word */
269	hash_val = (hash_val + hash_step) % hash_size;
270
271	}
272	}
273
274	/****************************** end of word set ****************************/
275
276
277
278	/* =========================================================================
279	* Function: copy_c_word
280	* Description:
281	* Allocate enough memory and copy word over
282	* Input:
283	* w = null terminated string (c_word)
284	* Output:
285	* word with length in 1st byte
286	* ========================================================================= */
287
288	static Word
289	copy_c_word (char *w)
290	{
291	int len = strlen (w);
292	Word w_copy = (Word) malloc (len + 1);
293	Word w_ptr = NULL;
294	int j = 0;
295
296	if (!w_copy)
297	FatalError (1, "Not enough memory to copy a word");
298
299	w_copy[0] = len;
300	w_ptr = w_copy + 1;
301	for (j = 0; j < len; j++)
302	w_ptr++ = w++;
303
304	return w_copy;
305	}
306
307	/* =========================================================================
308	* Function: process_words
309	* Description:
310	* Go through the stemmed words and add to word set
311	* Input:
312	* Output:
313	* ========================================================================= */
314
/*
 * process_words -- build the set of words to highlight.
 *
 * Creates a table sized for num_of_words entries, then adds a
 * counted-string copy of each of the first num_of_words strings in words.
 */
static void
process_words (char **words, int num_of_words)
{
    int remaining = num_of_words;

    set_create (num_of_words);

    while (remaining-- > 0)
        set_add (copy_c_word (*words++));
}
329
330	/* =========================================================================
331	* Function: get_line
332	* Description:
333	* Equivalent of fgets for u_char*.
334	* But returns length of read-in line.
335	* Expects to see a '\n' before an EOF
336	* Input:
337	* Output:
338	* ========================================================================= */
339
340	static int
341	get_line (u_char * line, int n, FILE * stream)
342	{
343	int i = 0;
344	int ch = '\0';
345
346	while (1)
347	{
348	if (i == n)
349	return i;
350
351	ch = fgetc (stream);
352
353	if (ch == EOF)
354	{
355	if (!feof (stream))
356	FatalError (1, "Error on reading a line from stdin");
357	return EOF;
358	}
359
360	if (ch == '\n')
361	return i;
362
363	*line++ = ch;
364	i++;
365	}
366
367	}
368
369	/* =========================================================================
370	* Function: print_line
371	* Description:
372	* Input:
373	* Output:
374	* ========================================================================= */
375
376	static void
377	print_line (u_char * line, int n, FILE * stream)
378	{
379
380	while (n--)
381	{
382	fputc (*line++, stream);
383	}
384	fputc ('\n', stream);
385
386	}
387
388	/* =========================================================================
389	* Function: process_text
390	* Description:
391	* Go through the text from input_file and highlight to output_file
392	* Input:
393	* Output:
394	* ========================================================================= */
395
396	static void
397	process_text (FILE * input_file, FILE * output_file)
398	{
399	static u_char line_buffer[MAX_LINE_BUFFER];
400
401
402	while (1)
403	{
404	int len = get_line (line_buffer, MAX_LINE_BUFFER, input_file);
405
406	if (len == EOF)
407	break;
408	process_buffer (line_buffer, len, output_file);
409
410	}
411	}
412
413	/* =========================================================================
414	* Function: copy_word
415	* Description:
416	* Copies w2 into w1. Assumes both have storage allocated.
417	* Input:
418	* Output:
419	* ========================================================================= */
420
421	static void
422	copy_word (u_char * w1, u_char * w2)
423	{
424	int i;
425	int len = w2[0];
426
427	for (i = 0; i <= len; i++)
428	w1++ = w2++;
429
430	}
431
432	/* =========================================================================
433	* Function: process_buffer
434	* Description:
435	* Parse & stem words of line buffer
436	* Based on the usage of PARSEing in other mg files.
437	* Input:
438	* Output:
439	* ========================================================================= */
440
441	static void
442	process_buffer (u_char * s_in, int len, FILE * output_file)
443	{
444	u_char *end = s_in + len - 1;
445	u_char *s_start = NULL;
446
447	if (!inaword (s_in, end))
448	{
449	s_start = s_in;
450	PARSE_NON_STEM_WORD (s_in, end);
451	output_word (s_start, s_in - 1, output_file);
452	}
453
454	while (s_in <= end)
455	{
456	u_char word[MAXSTEMLEN + 1];
457
458	s_start = s_in;
459	PARSE_STEM_WORD (word, s_in, end);
460
461	stemmer (stem_method, stemmer_num, word);
462
463	if (set_member (word)) /* output with highlighting */
464	{
465	output_hilite_word (s_start, s_start + word[0] - 1, output_file);
466	s_start += word[0]; /* step over hilited output */
467	}
468	output_word (s_start, s_in - 1, output_file);
469
470	s_start = s_in;
471	PARSE_NON_STEM_WORD (s_in, end);
472	output_word (s_start, s_in - 1, output_file);
473
474	} /while /
475
476	fputc ('\n', output_file);
477	fflush (output_file);
478
479	} /process_buffer /
480
481	/* =========================================================================
482	* Function: output_word
483	* Description:
484	* Output a word which lies from s_start to s_finish in buffer
485	* Input:
486	* s_start = ptr to 1st char
487	* s_finish = ptr to last char
488	* Output:
489	* ========================================================================= */
490
491	static void
492	output_word (u_char * s_start, u_char * s_finish, FILE * output_file)
493	{
494	while (s_start <= s_finish)
495	{
496	fputc (*s_start++, output_file);
497	}
498	}
499
500
501	/* =========================================================================
502	* Function: output_hilite_word
503	* Description:
504	* Highlight a word (with length in 1st byte)
505	* Pager highlighting:
506	* Highlighting is either by bolding or underlining using
507	* the method used by UNIX utilities More(1) and Less(1)
508	* HTML highlighting:
509	* use the appropriate start and end tags around the word
510	* ========================================================================= */
511
512	static void
513	output_hilite_word (u_char * s_start, u_char * s_finish, FILE * output_file)
514	{
515
516	if (output_type == HTML)
517	{
518	char *hilite_tag = hilite_tags[hilite_style];
519
520	/* print start tag */
521	fprintf (output_file, "<%s>", hilite_tag);
522
523	output_word (s_start, s_finish, output_file);
524
525	/* print end tag */
526	fprintf (output_file, "</%s>", hilite_tag);
527	}
528
529	else
530	/* PAGER */
531	{
532	/* use backspaces around each letter */
533	while (s_start <= s_finish)
534	{
535	switch (hilite_style)
536	{
537	case BOLD:
538	fputc (*s_start, output_file);
539	fputc ('\b', output_file);
540	fputc (*s_start, output_file);
541	break;
542	case UNDERLINE:
543	fputc ('_', output_file);
544	fputc ('\b', output_file);
545	fputc (*s_start, output_file);
546	break;
547	default:
548	fputc (*s_start, output_file);
549	}
550	s_start++;
551	} /while /
552	}
553	}
554
555	/* =========================================================================
556	* Function: process_args
557	* Description:
558	* sets the global variables:
559	* hilite_style, pager, num_of_words, word_list
560	* Input:
561	* Output:
562	* ========================================================================= */
563
/*
 * Long options handed to getopt_long.  Each one takes an argument and is
 * reported as the short-option letter in the last column.
 */
struct option long_opts[] = {
    { "style",       required_argument, NULL, 's' },
    { "terminator",  required_argument, NULL, 't' },
    { "pager",       required_argument, NULL, 'p' },
    { "stem_method", required_argument, NULL, 'm' },
    { "stemmer",     required_argument, NULL, 'a' },
    { NULL,          0,                 NULL, 0   }     /* end of table */
};
573
574	static void
575	process_args (int argc, char *argv[])
576	{
577	int ch;
578
579
580	opterr = 0;
581	while ((ch = getopt_long (argc, argv, "s:p:t:m:a:", long_opts, (int *) 0)) != -1)
582	{
583	switch (ch)
584	{
585	case 's':
586	{
587	int i;
588	for (i = 0; i < HILITE_MAX; i++)
589	if (strcmp (optarg, hilite_names[i]) == 0)
590	break;
591
592	if (i < HILITE_MAX)
593	hilite_style = i;
594	}
595	break;
596	case 'a':
597	stemmer_num = stemmernumber (optarg);
598	break;
599	case 't':
600	terminator = optarg;
601	break;
602	case 'm':
603	stem_method = atoi (optarg);
604	break;
605	case 'p':
606	pager = optarg;
607	break;
608	default:
609	FatalError (1, "Usage: \n"
610	"mg_hilite_words --stem_method [0-3]\n"
611	" --stemmer [english\|lovin\|french\|simplefrench]\n"
612	" --style [bold\|underline\|italic\|emphasis\|strong]\n"
613	" --pager [less\|more\|html\|???]\n");
614	}
615	}
616
617	num_of_words = argc - optind;
618
619	word_list = &argv[optind];
620
621	/* fix up output type */
622	if (strcmp (pager, HTML_OUT) == 0)
623	output_type = HTML;
624	else
625	output_type = PAGER;
626
627	}
628
629	/* =========================================================================
630	* Function: main
631	* Description:
632	* Input:
633	* Output:
634	* ========================================================================= */
635
636	int
637	main (int argc, char *argv[])
638	{
639	FILE *output = NULL;
640
641	process_args (argc, argv);
642
643	/* set output file */
644	if (output_type == PAGER)
645	/* [RPAP - Feb 97: WIN32 Port] */
646	#ifdef __WIN32__
647	output = _popen (pager, "w");
648	#else
649	output = popen (pager, "w");
650	#endif
651	else
652	output = stdout;
653
654	if (!output)
655	FatalError (1, "Unable to run \"%s\"\n", pager);
656
657	if (num_of_words < 1)
658	{
659	int ch;
660
661	/* just echo the input */
662	/* better not to call this program at all ;-) */
663	while ((ch = fgetc (stdin)) != EOF)
664	{
665	fputc (ch, output);
666	}
667	}
668	else
669	{
670	/* set up hash table for words */
671	process_words (word_list, num_of_words);
672
673
674	/* Go thru lines of text from stdin and
675	* output words with hilite info if
676	* words parse into existence in the hash table
677	*/
678	process_text (stdin, output);
679	}
680
681	if (terminator)
682	fprintf (output, "%s\n", terminator);
683
684	if (output != stdout)
685	/* [RPAP - Feb 97: WIN32 Port] */
686	#ifdef __WIN32__
687	_pclose (output);
688	#else
689	pclose (output);
690	#endif
691
692	return 0;
693	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: