Context Navigation

source: trunk/gsdl/packages/mg-1.3d/src/text/mg_hilite_words.c@ 34

Last change on this file since 34 was 34, checked in by rjmcnab, 26 years ago
Modified mg so that you can specify the stemmer you want to use via a command line option. You specify it to mg_passes during the build process. The number of the stemmer that you used is stored within the inverted dictionary header and the stemmed dictionary header so the correct stemmer is used in later stages of building and querying.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 16.2 KB

Line
1	/**************************************************************************
2	*
3	* mg_hilite_words -- display text and highlight particular words which
4	* it contains
5	* Copyright (C) 1994 Tim Shimmin
6	*
7	* This program is free software; you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation; either version 2 of the License, or
10	* (at your option) any later version.
11	*
12	* This program is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with this program; if not, write to the Free Software
19	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20	*
21	* $Id: mg_hilite_words.c 34 1998-11-25 07:55:52Z rjmcnab $
22	*
23	**************************************************************************/
24
25	static char *RCSID = "$Id: mg_hilite_words.c 34 1998-11-25 07:55:52Z rjmcnab $";
26
27	#include "sysfuncs.h"
28
29	#include "getopt.h"
30	#include "messages.h"
31	#include "local_strings.h"
32	#include "stemmer.h"
33	#include "words.h"
34
35
36	/*
37	* Description
38	* -----------
39	* Hilite_words reads text from stdin and outputs it into a pager such as
40	* less. Its command-line arguments include a list of stemmed words
41	* which if extracted from the text will be highlighted on the output.
42	*
43	* Implementation
44	* --------------
45	* Extracting of words - using PARSE_STEM_WORD & stemmer
46	*
47	* Highlighting of words - for standard pager:
48	* using back-space character code
49	* e.g. bolded a = "a\ba", underlined a = "_\ba"
50	* - for pager==html:
51	* use some of the standard html character
52	* formatting tags
53	*
54	* Storage of words - using hashtable (set) of size equalling a constant
55	* times the number of words
56	*
57	* Usage
58	* -----
59	* mg_hilite_words --stem_method [0-3]
60	* --style [bold|underline|italic|emphasis|strong]
61	* --pager [less|more|html|???]
62	* --terminator [terminator-string]
63	* list-of-words-to-highlight
64	*/
65
66	/*
67	* Modifications:
68	*
69	* 21/Apr/95: To handle outputting html tags to stdout
70	*
71	*/
72
73	/* --- constants --- */
74
75	/* highlighting styles */
76	#define HILITE_MAX 5 /* the number of styles */
77	#define BOLD 0
78	#define UNDERLINE 1
79	#define ITALIC 2
80	#define EMPHASIS 3
81	#define STRONG 4
82
83	/* maximum length of line buffer */
84	#define MAX_LINE_BUFFER 200
85
86	/* set pager to this to get html hilited text to stdout */
87	#define HTML_OUT "html"
88
89	/* output types */
90	#define PAGER 0
91	#define HTML 1
92
93	/* --- types --- */
94
95	typedef u_char *Word;
96
97	/* --- globals --- */
98
/* Highlighting tables.  Both arrays are indexed by the highlighting style
 * constants and must list their entries in the same order as those
 * constants (keep in synch with constants). */

/* style names accepted by the --style option */
static char *hilite_names[] =
{"bold", "underline", "italic", "emphasis", "strong"};

/* HTML element wrapped around a highlighted word, one per style.
 * The underline entry is the <U> element: an empty entry makes the HTML
 * output print the malformed tags "<>" and "</>". */
static char *hilite_tags[] =
{"B", "U", "I", "EM", "STRONG"};
104	static short hilite_style = BOLD;
105	static char *pager = "less";
106	static int stemmer_num = 0; /* Lovin's stemmer */
107	static int stem_method = 3; /* fold & stem */
108	static char **word_list;
109	static int num_of_words = 0;
110	static int output_type = PAGER;
111	static char *terminator = NULL;
112
113	/* --- prototypes --- */
114
115	static Word copy_c_word (char *w);
116	static void process_words (char **words, int num_of_words);
117	static int get_line (u_char * line, int n, FILE * stream);
118	static void process_text (FILE * input_file, FILE * output_file);
119	static void copy_word (u_char * w1, u_char * w2);
120	static void process_buffer (u_char * s_in, int len, FILE * output_file);
121	static void output_word (u_char * s_start, u_char * s_finish, FILE * output_file);
122	static void output_hilite_word (u_char * s_start, u_char * s_finish,
123	FILE * output_file);
124	static void process_args (int argc, char *argv[]);
125	static void print_line (u_char * line, int n, FILE * stream);
126
127
128	/****************************** word set **********************************/
129
130	#include "hash.h"
131	#include "locallib.h"
132
133	#define SIZE_FACTOR 2
134
135	typedef Word *WordSet;
136	static WordSet set_of_words = NULL;
137	static int hash_size = 0;
138
139	/* prototypes - set routines */
140	static void set_create (int num_of_words);
141	static void set_add (Word word);
142	static int set_member (Word word);
143	static void set_print (void);
144
145	/* =========================================================================
146	* Function: set_print
147	* Description:
148	* Input:
149	* Output:
150	* ========================================================================= */
151
152	static void
153	set_print (void)
154	{
155	int i = 0;
156
157	for (i = 0; i < hash_size; i++)
158	{
159	Word word = set_of_words[i];
160
161	if (word)
162	{
163	int len = *word++;
164
165	fprintf (stderr, "[%d] = ", i);
166	while (len--)
167	fputc (*word++, stderr);
168	fputc ('\n', stderr);
169	}
170	}
171
172	}
173
174
175	/* =========================================================================
176	* Function: set_create
177	* Description:
178	* Allocate memory for the set
179	* Input:
180	* Output:
181	* ========================================================================= */
182
183	static void
184	set_create (int num_of_words)
185	{
186	WordSet set;
187
188	hash_size = prime (num_of_words * SIZE_FACTOR);
189	set = (WordSet) malloc (hash_size * sizeof (Word));
190	if (!set)
191	FatalError (1, "Runout of memory for word hashtable");
192	bzero ((char ) set, hash_size sizeof (Word)); /* [RPAP - Feb 97: WIN32 Port] */
193
194	set_of_words = set;
195
196	}
197
198	/* =========================================================================
199	* Function: set_add
200	* Description:
201	* Add a string element to the set.
202	* Input:
203	* Output:
204	* ========================================================================= */
205
206	static void
207	set_add (Word word)
208	{
209	int hash_val;
210	int hash_step;
211
212	HASH (hash_val, hash_step, word, hash_size);
213
214	/* loop around in case of collisions and need to step */
215	while (1)
216	{
217
218	Word entry = set_of_words[hash_val];
219
220	/* if doesn't exist then */
221	if (!entry)
222	{
223	set_of_words[hash_val] = word;
224	break;
225	}
226
227	/* if we have a matching word */
228	if (compare (entry, word) == 0)
229	break;
230
231	/* if collides with a different word */
232	hash_val = (hash_val + hash_step) % hash_size;
233
234	}
235	}
236
237
238	/* =========================================================================
239	* Function: set_member
240	* Description:
241	* Tests whether a string is a member of the set
242	* Input:
243	* Output:
244	* ========================================================================= */
245
246	static int
247	set_member (Word word)
248	{
249	int hash_val;
250	int hash_step;
251
252	HASH (hash_val, hash_step, word, hash_size);
253
254	/* loop around in case of collisions and need to step */
255	while (1)
256	{
257
258	Word entry = set_of_words[hash_val];
259
260	/* if doesn't exist then */
261	if (!entry)
262	return 0;
263
264	/* if we have a matching word */
265	if (compare (entry, word) == 0)
266	return 1;
267
268	/* if collides with a different word */
269	hash_val = (hash_val + hash_step) % hash_size;
270
271	}
272	}
273
274	/****************************** end of word set ****************************/
275
276
277
278	/* =========================================================================
279	* Function: copy_c_word
280	* Description:
281	* Allocate enough memory and copy word over
282	* Input:
283	* w = null terminated string (c_word)
284	* Output:
285	* word with length in 1st byte
286	* ========================================================================= */
287
288	static Word
289	copy_c_word (char *w)
290	{
291	int len = strlen (w);
292	Word w_copy = (Word) malloc (len + 1);
293	Word w_ptr = NULL;
294	int j = 0;
295
296	if (!w_copy)
297	FatalError (1, "Not enough memory to copy a word");
298
299	w_copy[0] = len;
300	w_ptr = w_copy + 1;
301	for (j = 0; j < len; j++)
302	w_ptr++ = w++;
303
304	return w_copy;
305	}
306
/* =========================================================================
 * Function: process_words
 * Description:
 *      Build the word set: create a table sized for num_of_words entries,
 *      then convert each stemmed word to length-prefixed form and add it.
 * Input:
 *      words = array of null terminated words
 *      num_of_words = number of entries in words
 * Output:
 *      none (fills the word set)
 * ========================================================================= */

static void
process_words (char **words, int num_of_words)
{
  int remaining = num_of_words;

  set_create (num_of_words);

  for (; remaining > 0; remaining--)
    set_add (copy_c_word (*words++));
}
329
330	/* =========================================================================
331	* Function: get_line
332	* Description:
333	* Equivalent of fgets for u_char*.
334	* But returns length of read-in line.
335	* Expects to see a '\n' before an EOF
336	* Input:
337	* Output:
338	* ========================================================================= */
339
340	static int
341	get_line (u_char * line, int n, FILE * stream)
342	{
343	int i = 0;
344	int ch = '\0';
345
346	while (1)
347	{
348	if (i == n)
349	return i;
350
351	ch = fgetc (stream);
352
353	if (ch == EOF)
354	{
355	if (!feof (stream))
356	FatalError (1, "Error on reading a line from stdin");
357	return EOF;
358	}
359
360	if (ch == '\n')
361	return i;
362
363	*line++ = ch;
364	i++;
365	}
366
367	}
368
369	/* =========================================================================
370	* Function: print_line
371	* Description:
372	* Input:
373	* Output:
374	* ========================================================================= */
375
376	static void
377	print_line (u_char * line, int n, FILE * stream)
378	{
379
380	while (n--)
381	{
382	fputc (*line++, stream);
383	}
384	fputc ('\n', stream);
385
386	}
387
388	/* =========================================================================
389	* Function: process_text
390	* Description:
391	* Go through the text from input_file and highlight to output_file
392	* Input:
393	* Output:
394	* ========================================================================= */
395
396	static void
397	process_text (FILE * input_file, FILE * output_file)
398	{
399	static u_char line_buffer[MAX_LINE_BUFFER];
400
401
402	while (1)
403	{
404	int len = get_line (line_buffer, MAX_LINE_BUFFER, input_file);
405
406	if (len == EOF)
407	break;
408	process_buffer (line_buffer, len, output_file);
409
410	}
411	}
412
413	/* =========================================================================
414	* Function: copy_word
415	* Description:
416	* Copies w2 into w1. Assumes both have storage allocated.
417	* Input:
418	* Output:
419	* ========================================================================= */
420
421	static void
422	copy_word (u_char * w1, u_char * w2)
423	{
424	int i;
425	int len = w2[0];
426
427	for (i = 0; i <= len; i++)
428	w1++ = w2++;
429
430	}
431
432	/* =========================================================================
433	* Function: process_buffer
434	* Description:
435	* Parse & stem words of line buffer
436	* Based on the usage of PARSEing in other mg files.
437	* Input:
438	* Output:
439	* ========================================================================= */
440
441	static void
442	process_buffer (u_char * s_in, int len, FILE * output_file)
443	{
444	u_char *end = s_in + len - 1;
445	u_char *s_start = NULL;
446
447	if (!INAWORD (*s_in))
448	{
449	s_start = s_in;
450	PARSE_NON_STEM_WORD (s_in, end);
451	output_word (s_start, s_in - 1, output_file);
452	}
453
454	while (s_in <= end)
455	{
456	u_char word[MAXSTEMLEN + 1];
457
458	s_start = s_in;
459	PARSE_STEM_WORD (word, s_in, end);
460
461	stemmer (stem_method, stemmer_num, word);
462
463	if (set_member (word)) /* output with highlighting */
464	{
465	output_hilite_word (s_start, s_start + word[0] - 1, output_file);
466	s_start += word[0]; /* step over hilited output */
467	}
468	output_word (s_start, s_in - 1, output_file);
469
470	s_start = s_in;
471	PARSE_NON_STEM_WORD (s_in, end);
472	output_word (s_start, s_in - 1, output_file);
473
474	} /while /
475
476	fputc ('\n', output_file);
477	fflush (output_file);
478
479	} /process_buffer /
480
481	/* =========================================================================
482	* Function: output_word
483	* Description:
484	* Output a word which lies from s_start to s_finish in buffer
485	* Input:
486	* s_start = ptr to 1st char
487	* s_finish = ptr to last char
488	* Output:
489	* ========================================================================= */
490
491	static void
492	output_word (u_char * s_start, u_char * s_finish, FILE * output_file)
493	{
494	while (s_start <= s_finish)
495	{
496	fputc (*s_start++, output_file);
497	}
498	}
499
500
501	/* =========================================================================
502	* Function: output_hilite_word
503	* Description:
504	* Highlight a word (with length in 1st byte)
505	* Pager highlighting:
506	* Highlighting is either by bolding or underlining using
507	* the method used by UNIX utilities More(1) and Less(1)
508	* HTML highlighting:
509	* use the appropriate start and end tags around the word
510	* ========================================================================= */
511
512	static void
513	output_hilite_word (u_char * s_start, u_char * s_finish, FILE * output_file)
514	{
515
516	if (output_type == HTML)
517	{
518	char *hilite_tag = hilite_tags[hilite_style];
519
520	/* print start tag */
521	fprintf (output_file, "<%s>", hilite_tag);
522
523	output_word (s_start, s_finish, output_file);
524
525	/* print end tag */
526	fprintf (output_file, "</%s>", hilite_tag);
527	}
528
529	else
530	/* PAGER */
531	{
532	/* use backspaces around each letter */
533	while (s_start <= s_finish)
534	{
535	switch (hilite_style)
536	{
537	case BOLD:
538	fputc (*s_start, output_file);
539	fputc ('\b', output_file);
540	fputc (*s_start, output_file);
541	break;
542	case UNDERLINE:
543	fputc ('_', output_file);
544	fputc ('\b', output_file);
545	fputc (*s_start, output_file);
546	break;
547	default:
548	fputc (*s_start, output_file);
549	}
550	s_start++;
551	} /while /
552	}
553	}
554
555	/* =========================================================================
556	* Function: process_args
557	* Description:
558	* sets the global variables:
559	* hilite_style, pager, num_of_words, word_list
560	* Input:
561	* Output:
562	* ========================================================================= */
563
564	struct option long_opts[] =
565	{
566	{"style", required_argument, 0, 's'},
567	{"terminator", required_argument, 0, 't'},
568	{"pager", required_argument, 0, 'p'},
569	{"stem_method", required_argument, 0, 'm'},
570	{"stemmer", required_argument, 0, 'a'},
571	{0, 0, 0, 0}
572	};
573
574	static void
575	process_args (int argc, char *argv[])
576	{
577	int ch;
578
579
580	opterr = 0;
581	while ((ch = getopt_long (argc, argv, "s:p:t:m:a:", long_opts, (int *) 0)) != -1)
582	{
583	switch (ch)
584	{
585	case 's':
586	{
587	int i;
588	for (i = 0; i < HILITE_MAX; i++)
589	if (strcmp (optarg, hilite_names[i]) == 0)
590	break;
591
592	if (i < HILITE_MAX)
593	hilite_style = i;
594	}
595	break;
596	case 'a':
597	stemmer_num = stemmernumber (optarg);
598	break;
599	case 't':
600	terminator = optarg;
601	break;
602	case 'm':
603	stem_method = atoi (optarg);
604	break;
605	case 'p':
606	pager = optarg;
607	break;
608	default:
609	FatalError (1, "Usage: \n"
610	"mg_hilite_words --stem_method [0-3]\n"
611	" --stemmer [english\|lovin\|french\|simplefrench]\n"
612	" --style [bold\|underline\|italic\|emphasis\|strong]\n"
613	" --pager [less\|more\|html\|???]\n");
614	}
615	}
616
617	num_of_words = argc - optind;
618
619	word_list = &argv[optind];
620
621	/* fix up output type */
622	if (strcmp (pager, HTML_OUT) == 0)
623	output_type = HTML;
624	else
625	output_type = PAGER;
626
627	}
628
629	/* =========================================================================
630	* Function: main
631	* Description:
632	* Input:
633	* Output:
634	* ========================================================================= */
635
636	int
637	main (int argc, char *argv[])
638	{
639	FILE *output = NULL;
640
641	process_args (argc, argv);
642
643	/* set output file */
644	if (output_type == PAGER)
645	/* [RPAP - Feb 97: WIN32 Port] */
646	#ifdef __WIN32__
647	output = _popen (pager, "w");
648	#else
649	output = popen (pager, "w");
650	#endif
651	else
652	output = stdout;
653
654	if (!output)
655	FatalError (1, "Unable to run \"%s\"\n", pager);
656
657	if (num_of_words < 1)
658	{
659	int ch;
660
661	/* just echo the input */
662	/* better not to call this program at all ;-) */
663	while ((ch = fgetc (stdin)) != EOF)
664	{
665	fputc (ch, output);
666	}
667	}
668	else
669	{
670	/* set up hash table for words */
671	process_words (word_list, num_of_words);
672
673
674	/* Go thru lines of text from stdin and
675	* output words with hilite info if
676	* words parse into existence in the hash table
677	*/
678	process_text (stdin, output);
679	}
680
681	if (terminator)
682	fprintf (output, "%s\n", terminator);
683
684	if (output != stdout)
685	/* [RPAP - Feb 97: WIN32 Port] */
686	#ifdef __WIN32__
687	_pclose (output);
688	#else
689	pclose (output);
690	#endif
691
692	return 0;
693	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: