Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/src/mgpp/text/words.h@ 879

Last change on this file since 879 was 856, checked in by sjboddie, 24 years ago
Rodgers new C++ mg
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 6.5 KB

Line
1	/**************************************************************************
2	*
3	* words.h -- Macros for parsing out words from the source text
4	* Copyright (C) 1994 Neil Sharman
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	* $Id: words.h 856 2000-01-14 02:26:25Z sjboddie $
21	*
22	**************************************************************************/
23
24	#include "sysfuncs.h"
25
26	#include "unitool.h"
27	#include "UCArray.h"
28
29
30	/*
31	* This has been cleaned up by Tim Shimmin.
32	*/
33
34	/*
35	* ---NOTE---
36	*
37	* "WORD" refers to a word in the compressed text.
38	* "STEM" or "STEM_WORD" refers to a word used for indexing.
39	*
40	*/
41
/* Maximum length in bytes of any word or non-word. Note that
   variations to MAXWORDLEN may have dramatic effects on the rest
   of the program, as the length and the prefix match are packed
   together into a four bit nibble, i.e., leave MAXWORDLEN alone...
   The guard below only enforces the nibble range (0..15); it does not
   verify any other assumption the rest of the program may make. */
#define MAXWORDLEN 15
#if MAXWORDLEN > 15
#error "MAXWORDLEN must fit in a four bit nibble (0..15)"
#endif

/* Maximum length in bytes of any stem. Note that
   variations to MAXSTEMLEN may have dramatic effects on the rest
   of the program, i.e., leave MAXSTEMLEN alone... */
#define MAXSTEMLEN 255

/* Maximum number of numeric characters permitted in a word.
   This avoids long sequences of numbers creating just one
   word occurrence for each number. At most 10,000 all numeric
   words will be permitted. */
#define MAXNUMERIC 4

/* [RPAP - Jan 97: Stem Index Change] */
/* Maximum number of bytes to read for a parameter value for a
   term in a query. */
#define MAXPARAMLEN 20
/* NOTE(review): presumably the characters that introduce a term's
   weight and stem-method parameters in a query -- confirm against
   the query parser. */
#define WEIGHTPARAM '/'
#define STEMPARAM '#'

/* [RJM 07/97: Ranked Required Terms] */
/* NOTE(review): presumably marks a query term that must match --
   confirm against the query parser. */
#define MUSTMATCHPARAM '+'
69
70
/* The definition of what characters are permitted in a word.
   This macro is pessimistic, you cannot tell from a particular
   byte above 0x80 whether it is a character or not. This function
   is needed by various functions relating to huffman coding
   where frequency counts need to be primed, it should not be
   used in parsing the UTF-8 encoded input.

   Yields non-zero when c is alphanumeric (per isalnum) or lies in
   0x80..0xff, and 0 otherwise.  c is evaluated more than once, so it
   must not have side effects; because it is handed to isalnum it must
   be a value representable as unsigned char (or EOF). */
#define PESINAWORD(c) (isalnum(c) || ((c) >= 0x80 && (c) <= 0xff))
78
79	#ifdef __cplusplus
80	extern "C"
81	#endif
82	int inaword (const u_char here, const u_char end);
83	/* Takes the place of the old INAWORD macro. It determines
84	whether a given place in a UTF-8 encoded Unicode string
85	is part of a word. */
86
87
88	const unsigned char ParseIndexWord (const unsigned char textHere,
89	const unsigned char *textEnd,
90	UCArray &word);
91	const unsigned char ParseIndexMGWord (const unsigned char textHere,
92	const unsigned char *textEnd,
93	unsigned char *mgWord);
94	const unsigned char ParseNonindexWord (const unsigned char textHere,
95	const unsigned char *textEnd);
96
97
98
/* =========================================================================
 * Macro: PARSE_WORD
 * Description:
 *      Extract a word out for compressing text.
 *      Characters are taken one UTF-8 character at a time while they are
 *      letters, or digits as long as no more than MAXNUMERIC digits have
 *      been seen, and while the word stays within MAXWORDLEN bytes.  A
 *      character is either copied completely or not at all.
 * Input:
 *      s_in = string start in buffer; must be a modifiable pointer lvalue
 *      end  = string end in buffer
 * Output:
 *      Word = extracted word with length in 1st byte; the buffer must
 *             have room for MAXWORDLEN+1 bytes
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      The arguments are evaluated more than once, so they must not have
 *      side effects.  The return value of parse_utf8_char is used as the
 *      byte length of the character at s_in; a value <= 0 ends the word.
 * ========================================================================= */
#define PARSE_WORD(Word, s_in, end)                                         \
  do {                                                                      \
    u_char *pw_dst = (Word) + 1;     /* byte 0 is kept for the length */    \
    int pw_charlen;                  /* bytes in the current character */   \
    int pw_len = 0;                  /* bytes copied so far */              \
    int pw_digits = 0;               /* digits tested so far */             \
    unsigned short pw_ch = 0;        /* current code point */               \
                                                                            \
    pw_charlen = parse_utf8_char((s_in), (end), &pw_ch);                    \
                                                                            \
    while (pw_len + pw_charlen <= MAXWORDLEN && pw_charlen > 0 &&           \
           (is_unicode_letter(pw_ch) ||                                     \
            (is_unicode_digit(pw_ch) && ++pw_digits <= MAXNUMERIC))) {      \
      while (pw_charlen-- > 0) {                                            \
        *pw_dst++ = *(s_in)++;                                              \
        pw_len++;                                                           \
      }                                                                     \
      pw_charlen = parse_utf8_char((s_in), (end), &pw_ch);                  \
    }                                                                       \
    *(Word) = pw_len;                                                       \
  } while (0)
130
131
132
/* =========================================================================
 * Macro: PARSE_NON_WORD
 * Description:
 *      Extract a non-word out for storing compressed text.
 *      Characters are taken one UTF-8 character at a time while they are
 *      neither letters nor digits (per is_unicode_letdig) and while the
 *      non-word stays within MAXWORDLEN bytes.  A character is either
 *      copied completely or not at all.
 * Input:
 *      s_in = string start in buffer; must be a modifiable pointer lvalue
 *      end  = string end in buffer
 * Output:
 *      Word = extracted non-word with length in 1st byte; the buffer must
 *             have room for MAXWORDLEN+1 bytes
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      The arguments are evaluated more than once, so they must not have
 *      side effects.  The return value of parse_utf8_char is used as the
 *      byte length of the character at s_in; a value <= 0 ends the
 *      non-word.
 * ========================================================================= */
#define PARSE_NON_WORD(Word, s_in, end)                                     \
  do {                                                                      \
    u_char *pnw_dst = (Word) + 1;    /* byte 0 is kept for the length */    \
    int pnw_charlen;                 /* bytes in the current character */   \
    int pnw_len = 0;                 /* bytes copied so far */              \
    unsigned short pnw_ch = 0;       /* current code point */               \
                                                                            \
    pnw_charlen = parse_utf8_char((s_in), (end), &pnw_ch);                  \
                                                                            \
    while (pnw_len + pnw_charlen <= MAXWORDLEN && pnw_charlen > 0 &&        \
           !is_unicode_letdig(pnw_ch)) {                                    \
      while (pnw_charlen-- > 0) {                                           \
        *pnw_dst++ = *(s_in)++;                                             \
        pnw_len++;                                                          \
      }                                                                     \
      pnw_charlen = parse_utf8_char((s_in), (end), &pnw_ch);                \
    }                                                                       \
    *(Word) = pnw_len;                                                      \
  } while (0)

Note: See TracBrowser for help on using the repository browser.

Download in other formats: