Context Navigation

source: trunk/gsdl/packages/mg-1.3d/src/text/mg_hilite_words.c@ 29

Last change on this file since 29 was 29, checked in by rjmcnab, 25 years ago
Incorporated the french stemmer better.
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 16.0 KB

Line
1	/**************************************************************************
2	*
3	* mg_hilite_words -- display text and highlight particular words which
4	* it contains
5	* Copyright (C) 1994 Tim Shimmin
6	*
7	* This program is free software; you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation; either version 2 of the License, or
10	* (at your option) any later version.
11	*
12	* This program is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with this program; if not, write to the Free Software
19	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20	*
21	* $Id: mg_hilite_words.c 29 1998-11-24 06:37:53Z rjmcnab $
22	*
23	**************************************************************************/
24
/* Revision identification string for this file (an expanded $Id$ keyword).
 * It points at a string literal, so the pointee is declared const. */
static const char *RCSID = "$Id: mg_hilite_words.c 29 1998-11-24 06:37:53Z rjmcnab $";
26
27	#include "sysfuncs.h"
28
29	#include "getopt.h"
30	#include "messages.h"
31	#include "local_strings.h"
32	#include "stemmer.h"
33	#include "words.h"
34
35
36	/*
37	* Description
38	* -----------
39	* Hilite_words reads text from stdin and outputs it into a pager such as
40	* less. Its command arguments include a list of stemmed words
41	* which, if found in the text, will be highlighted on the output.
42	*
43	* Implementation
44	* --------------
45	* Extracting of words - using PARSE_STEM_WORD & stemmer
46	*
47	* Highlighting of words - for standard pager:
48	* using back-space character code
49	* e.g. bolded a = "a\ba", underlined a = "_\ba"
50	* - for pager==html:
51	* use some of the standard html character
52	* formatting tags
53	*
54	* Storage of words - using hashtable (set) of size equalling a constant
55	* times the number of words
56	*
57	* Usage
58	* -----
59	* mg_hilite_words --stem_method [0-3]
60	* --style [bold|underline|italic|emphasis|strong]
61	* --pager [less|more|html|???]
62	* --terminator [terminator-string]
63	* list-of-words-to-highlight
64	*/
65
66	/*
67	* Modifications:
68	*
69	* 21/Apr/95: To handle outputting html tags to stdout
70	*
71	*/
72
/* --- constants --- */

/* Highlighting styles.  The values are used as indices into the
 * hilite_names[] and hilite_tags[] tables, so the order here and the
 * order of those tables must stay in step. */
enum
{
  BOLD = 0,
  UNDERLINE = 1,
  ITALIC = 2,
  EMPHASIS = 3,
  STRONG = 4,
  HILITE_MAX = 5                /* the number of styles */
};

/* maximum length of line buffer */
enum
{
  MAX_LINE_BUFFER = 200
};

/* set pager to this to get html hilited text to stdout */
#define HTML_OUT "html"

/* Output types: PAGER pipes the text into the pager command, HTML writes
 * tagged text to stdout. */
enum
{
  PAGER = 0,
  HTML = 1
};
92
93	/* --- types --- */
94
95	typedef u_char *Word;
96
97	/* --- globals --- */
98
99	/* keep in synch with constants */
100	static char *hilite_names[] =
101	{"bold", "underline", "italic", "emphasis", "strong"};
102	static char *hilite_tags[] =
103	{"B", "", "I", "EM", "STRONG"};
104	static short hilite_style = BOLD;
105	static char *pager = "less";
106	static int stem_method = 3; /* fold & stem */
107	static char **word_list;
108	static int num_of_words = 0;
109	static int output_type = PAGER;
110	static char *terminator = NULL;
111
112	/* --- prototypes --- */
113
114	static Word copy_c_word (char *w);
115	static void process_words (char **words, int num_of_words);
116	static int get_line (u_char * line, int n, FILE * stream);
117	static void process_text (FILE * input_file, FILE * output_file);
118	static void copy_word (u_char * w1, u_char * w2);
119	static void process_buffer (u_char * s_in, int len, FILE * output_file);
120	static void output_word (u_char * s_start, u_char * s_finish, FILE * output_file);
121	static void output_hilite_word (u_char * s_start, u_char * s_finish,
122	FILE * output_file);
123	static void process_args (int argc, char *argv[]);
124	static void print_line (u_char * line, int n, FILE * stream);
125
126
127	/****************************** word set **********************************/
128
129	#include "hash.h"
130	#include "locallib.h"
131
132	#define SIZE_FACTOR 2
133
134	typedef Word *WordSet;
135	static WordSet set_of_words = NULL;
136	static int hash_size = 0;
137
138	/* prototypes - set routines */
139	static void set_create (int num_of_words);
140	static void set_add (Word word);
141	static int set_member (Word word);
142	static void set_print (void);
143
144	/* =========================================================================
145	* Function: set_print
146	* Description:
147	* Input:
148	* Output:
149	* ========================================================================= */
150
151	static void
152	set_print (void)
153	{
154	int i = 0;
155
156	for (i = 0; i < hash_size; i++)
157	{
158	Word word = set_of_words[i];
159
160	if (word)
161	{
162	int len = *word++;
163
164	fprintf (stderr, "[%d] = ", i);
165	while (len--)
166	fputc (*word++, stderr);
167	fputc ('\n', stderr);
168	}
169	}
170
171	}
172
173
174	/* =========================================================================
175	* Function: set_create
176	* Description:
177	* Allocate memory for the set
178	* Input:
179	* Output:
180	* ========================================================================= */
181
182	static void
183	set_create (int num_of_words)
184	{
185	WordSet set;
186
187	hash_size = prime (num_of_words * SIZE_FACTOR);
188	set = (WordSet) malloc (hash_size * sizeof (Word));
189	if (!set)
190	FatalError (1, "Runout of memory for word hashtable");
191	bzero ((char ) set, hash_size sizeof (Word)); /* [RPAP - Feb 97: WIN32 Port] */
192
193	set_of_words = set;
194
195	}
196
197	/* =========================================================================
198	* Function: set_add
199	* Description:
200	* Add a string element to the set.
201	* Input:
202	* Output:
203	* ========================================================================= */
204
205	static void
206	set_add (Word word)
207	{
208	int hash_val;
209	int hash_step;
210
211	HASH (hash_val, hash_step, word, hash_size);
212
213	/* loop around in case of collisions and need to step */
214	while (1)
215	{
216
217	Word entry = set_of_words[hash_val];
218
219	/* if doesn't exist then */
220	if (!entry)
221	{
222	set_of_words[hash_val] = word;
223	break;
224	}
225
226	/* if we have a matching word */
227	if (compare (entry, word) == 0)
228	break;
229
230	/* if collides with a different word */
231	hash_val = (hash_val + hash_step) % hash_size;
232
233	}
234	}
235
236
237	/* =========================================================================
238	* Function: set_member
239	* Description:
240	* Tests whether a string is a member of the set
241	* Input:
242	* Output:
243	* ========================================================================= */
244
245	static int
246	set_member (Word word)
247	{
248	int hash_val;
249	int hash_step;
250
251	HASH (hash_val, hash_step, word, hash_size);
252
253	/* loop around in case of collisions and need to step */
254	while (1)
255	{
256
257	Word entry = set_of_words[hash_val];
258
259	/* if doesn't exist then */
260	if (!entry)
261	return 0;
262
263	/* if we have a matching word */
264	if (compare (entry, word) == 0)
265	return 1;
266
267	/* if collides with a different word */
268	hash_val = (hash_val + hash_step) % hash_size;
269
270	}
271	}
272
273	/****************************** end of word set ****************************/
274
275
276
277	/* =========================================================================
278	* Function: copy_c_word
279	* Description:
280	* Allocate enough memory and copy word over
281	* Input:
282	* w = null terminated string (c_word)
283	* Output:
284	* word with length in 1st byte
285	* ========================================================================= */
286
287	static Word
288	copy_c_word (char *w)
289	{
290	int len = strlen (w);
291	Word w_copy = (Word) malloc (len + 1);
292	Word w_ptr = NULL;
293	int j = 0;
294
295	if (!w_copy)
296	FatalError (1, "Not enough memory to copy a word");
297
298	w_copy[0] = len;
299	w_ptr = w_copy + 1;
300	for (j = 0; j < len; j++)
301	w_ptr++ = w++;
302
303	return w_copy;
304	}
305
/* =========================================================================
 * Function: process_words
 * Description:
 *      Create the word set and add every word of the list to it.  Each
 *      word is first converted to length-prefixed form by copy_c_word.
 * Input:
 *      words        = array of null terminated (already stemmed) words
 *      num_of_words = number of entries in words
 * Output:
 *      none (fills the word set)
 * ========================================================================= */

static void
process_words (char **words, int num_of_words)
{
  int remaining = num_of_words;

  set_create (num_of_words);

  while (remaining-- > 0)
    set_add (copy_c_word (*words++));
}
328
329	/* =========================================================================
330	* Function: get_line
331	* Description:
332	* Equivalent of fgets for u_char*.
333	* But returns length of read-in line.
334	* Expects to see a '\n' before an EOF
335	* Input:
336	* Output:
337	* ========================================================================= */
338
339	static int
340	get_line (u_char * line, int n, FILE * stream)
341	{
342	int i = 0;
343	int ch = '\0';
344
345	while (1)
346	{
347	if (i == n)
348	return i;
349
350	ch = fgetc (stream);
351
352	if (ch == EOF)
353	{
354	if (!feof (stream))
355	FatalError (1, "Error on reading a line from stdin");
356	return EOF;
357	}
358
359	if (ch == '\n')
360	return i;
361
362	*line++ = ch;
363	i++;
364	}
365
366	}
367
368	/* =========================================================================
369	* Function: print_line
370	* Description:
371	* Input:
372	* Output:
373	* ========================================================================= */
374
375	static void
376	print_line (u_char * line, int n, FILE * stream)
377	{
378
379	while (n--)
380	{
381	fputc (*line++, stream);
382	}
383	fputc ('\n', stream);
384
385	}
386
387	/* =========================================================================
388	* Function: process_text
389	* Description:
390	* Go through the text from input_file and highlight to output_file
391	* Input:
392	* Output:
393	* ========================================================================= */
394
395	static void
396	process_text (FILE * input_file, FILE * output_file)
397	{
398	static u_char line_buffer[MAX_LINE_BUFFER];
399
400
401	while (1)
402	{
403	int len = get_line (line_buffer, MAX_LINE_BUFFER, input_file);
404
405	if (len == EOF)
406	break;
407	process_buffer (line_buffer, len, output_file);
408
409	}
410	}
411
412	/* =========================================================================
413	* Function: copy_word
414	* Description:
415	* Copies w2 into w1. Assumes both have storage allocated.
416	* Input:
417	* Output:
418	* ========================================================================= */
419
420	static void
421	copy_word (u_char * w1, u_char * w2)
422	{
423	int i;
424	int len = w2[0];
425
426	for (i = 0; i <= len; i++)
427	w1++ = w2++;
428
429	}
430
431	/* =========================================================================
432	* Function: process_buffer
433	* Description:
434	* Parse & stem words of line buffer
435	* Based on the usage of PARSEing in other mg files.
436	* Input:
437	* Output:
438	* ========================================================================= */
439
440	static void
441	process_buffer (u_char * s_in, int len, FILE * output_file)
442	{
443	u_char *end = s_in + len - 1;
444	u_char *s_start = NULL;
445
446	if (!INAWORD (*s_in))
447	{
448	s_start = s_in;
449	PARSE_NON_STEM_WORD (s_in, end);
450	output_word (s_start, s_in - 1, output_file);
451	}
452
453	while (s_in <= end)
454	{
455	u_char word[MAXSTEMLEN + 1];
456
457	s_start = s_in;
458	PARSE_STEM_WORD (word, s_in, end);
459
460	stemmer (stem_method, word);
461
462	if (set_member (word)) /* output with highlighting */
463	{
464	output_hilite_word (s_start, s_start + word[0] - 1, output_file);
465	s_start += word[0]; /* step over hilited output */
466	}
467	output_word (s_start, s_in - 1, output_file);
468
469	s_start = s_in;
470	PARSE_NON_STEM_WORD (s_in, end);
471	output_word (s_start, s_in - 1, output_file);
472
473	} /while /
474
475	fputc ('\n', output_file);
476	fflush (output_file);
477
478	} /process_buffer /
479
480	/* =========================================================================
481	* Function: output_word
482	* Description:
483	* Output a word which lies from s_start to s_finish in buffer
484	* Input:
485	* s_start = ptr to 1st char
486	* s_finish = ptr to last char
487	* Output:
488	* ========================================================================= */
489
490	static void
491	output_word (u_char * s_start, u_char * s_finish, FILE * output_file)
492	{
493	while (s_start <= s_finish)
494	{
495	fputc (*s_start++, output_file);
496	}
497	}
498
499
500	/* =========================================================================
501	* Function: output_hilite_word
502	* Description:
503	* Highlight a word (with length in 1st byte)
504	* Pager highlighting:
505	* Highlighting is either by bolding or underlining using
506	* the method used by UNIX utilities More(1) and Less(1)
507	* HTML highlighting:
508	* use the appropriate start and end tags around the word
509	* ========================================================================= */
510
511	static void
512	output_hilite_word (u_char * s_start, u_char * s_finish, FILE * output_file)
513	{
514
515	if (output_type == HTML)
516	{
517	char *hilite_tag = hilite_tags[hilite_style];
518
519	/* print start tag */
520	fprintf (output_file, "<%s>", hilite_tag);
521
522	output_word (s_start, s_finish, output_file);
523
524	/* print end tag */
525	fprintf (output_file, "</%s>", hilite_tag);
526	}
527
528	else
529	/* PAGER */
530	{
531	/* use backspaces around each letter */
532	while (s_start <= s_finish)
533	{
534	switch (hilite_style)
535	{
536	case BOLD:
537	fputc (*s_start, output_file);
538	fputc ('\b', output_file);
539	fputc (*s_start, output_file);
540	break;
541	case UNDERLINE:
542	fputc ('_', output_file);
543	fputc ('\b', output_file);
544	fputc (*s_start, output_file);
545	break;
546	default:
547	fputc (*s_start, output_file);
548	}
549	s_start++;
550	} /while /
551	}
552	}
553
554	/* =========================================================================
555	* Function: process_args
556	* Description:
557	* sets the global variables:
558	* hilite_style, pager, num_of_words, word_list
559	* Input:
560	* Output:
561	* ========================================================================= */
562
563	struct option long_opts[] =
564	{
565	{"style", required_argument, 0, 's'},
566	{"terminator", required_argument, 0, 't'},
567	{"pager", required_argument, 0, 'p'},
568	{"stem_method", required_argument, 0, 'm'},
569	{0, 0, 0, 0}
570	};
571
572	static void
573	process_args (int argc, char *argv[])
574	{
575	int ch;
576
577
578	opterr = 0;
579	while ((ch = getopt_long (argc, argv, "s:p:t:m:", long_opts, (int *) 0)) != -1)
580	{
581	switch (ch)
582	{
583	case 's':
584	{
585	int i;
586	for (i = 0; i < HILITE_MAX; i++)
587	if (strcmp (optarg, hilite_names[i]) == 0)
588	break;
589
590	if (i < HILITE_MAX)
591	hilite_style = i;
592	}
593	break;
594	case 't':
595	terminator = optarg;
596	break;
597	case 'm':
598	stem_method = atoi (optarg);
599	break;
600	case 'p':
601	pager = optarg;
602	break;
603	default:
604	FatalError (1, "Usage: \n"
605	"mg_hilite_words --stem_method [0-3]\n"
606	" --style [bold\|underline\|italic\|emphasis\|strong]\n"
607	" --pager [less\|more\|html\|???]\n");
608	}
609	}
610
611	num_of_words = argc - optind;
612
613	word_list = &argv[optind];
614
615	/* fix up output type */
616	if (strcmp (pager, HTML_OUT) == 0)
617	output_type = HTML;
618	else
619	output_type = PAGER;
620
621	}
622
623	/* =========================================================================
624	* Function: main
625	* Description:
626	* Input:
627	* Output:
628	* ========================================================================= */
629
630	int
631	main (int argc, char *argv[])
632	{
633	FILE *output = NULL;
634
635	process_args (argc, argv);
636
637	/* set output file */
638	if (output_type == PAGER)
639	/* [RPAP - Feb 97: WIN32 Port] */
640	#ifdef __WIN32__
641	output = _popen (pager, "w");
642	#else
643	output = popen (pager, "w");
644	#endif
645	else
646	output = stdout;
647
648	if (!output)
649	FatalError (1, "Unable to run \"%s\"\n", pager);
650
651	if (num_of_words < 1)
652	{
653	int ch;
654
655	/* just echo the input */
656	/* better not to call this program at all ;-) */
657	while ((ch = fgetc (stdin)) != EOF)
658	{
659	fputc (ch, output);
660	}
661	}
662	else
663	{
664	/* set up hash table for words */
665	process_words (word_list, num_of_words);
666
667
668	/* Go thru lines of text from stdin and
669	* output words with hilite info if
670	* words parse into existence in the hash table
671	*/
672	process_text (stdin, output);
673	}
674
675	if (terminator)
676	fprintf (output, "%s\n", terminator);
677
678	if (output != stdout)
679	/* [RPAP - Feb 97: WIN32 Port] */
680	#ifdef __WIN32__
681	_pclose (output);
682	#else
683	pclose (output);
684	#endif
685
686	return 0;
687	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: