Context Navigation

source: trunk/gsdl/src/mgpp/text/words.h@ 2468

Last change on this file since 2468 was 2468, checked in by sjboddie, 23 years ago

Fiddled about with mgpp to get it compiling on Windows under VC++ 6.0. I
still can't get it to compile under VC++ 4.2 because of some weird
behaviour in STLport.

Also tidied up a little and removed some of the old log information
that was scattered about in some of the files.

Property svn:executable set to *
Property svn:keywords set to Author Date Id Revision

File size: 6.4 KB

Line
1	/**************************************************************************
2	*
3	* words.h -- Macros for parsing out words from the source text
4	* Copyright (C) 1994 Neil Sharman
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	**************************************************************************/
21
22	#include "sysfuncs.h"
23
24	#include "unitool.h"
25	#include "UCArray.h"
26
27
28	/*
29	* This has been cleaned up by Tim Shimmin.
30	*/
31
32	/*
33	* ---NOTE---
34	*
35	* "WORD" refers to a word in the compressed text.
36	* "STEM" or "STEM_WORD" refers to a word for indexing on
37	*
38	*/
39
40	#define MAXWORDLEN 15
41	/* Maximum length in bytes of any word or non-word. Note that
42	variations to MAXWORDLEN may have dramatic effects on the rest
43	of the program, as the length and the prefix match are packed
44	together into a four bit nibble, and there is no check that
45	this is possible, i.e., leave MAXWORDLEN alone... */
46
47	#define MAXSTEMLEN 255
48	/* Maximum length in bytes of any stem. Note that
49	variations to MAXSTEMLEN may have dramatic effects on the rest
50	of the program, i.e., leave MAXSTEMLEN alone... */
51
52	#define MAXNUMERIC 4
53	/* Maximum number of numeric characters permitted in a word.
54	This stops a long sequence of digits from becoming a single
55	word, so that not every number creates its own word entry.
56	At most 10,000 all-numeric words will be permitted. */
57
58	/* [RPAP - Jan 97: Stem Index Change] */
59	#define MAXPARAMLEN 20
60	/* Maximum number of bytes to read for a parameter value for a
61	term in a query. */
62	#define WEIGHTPARAM '/'	/* NOTE(review): presumably the character that introduces a weight parameter on a query term -- confirm against the query parser */
63	#define STEMPARAM '#'	/* NOTE(review): presumably the character that introduces a stemming parameter on a query term -- confirm against the query parser */
64
65	/* [RJM 07/97: Ranked Required Terms] */
66	#define MUSTMATCHPARAM '+'	/* NOTE(review): presumably marks a query term as required ("must match") -- confirm against the query parser */
67
68
#define PESINAWORD(c) (isalnum(c) || ((c) >= 0x80 && (c) <= 0xff))
/* The definition of what characters are permitted in a word.
   This macro is pessimistic: you cannot tell from a particular
   byte at or above 0x80 whether it is part of a word character
   or not, so every such byte is accepted. It is needed by the
   functions relating to huffman coding where frequency counts
   need to be primed; it should not be used in parsing the UTF-8
   encoded input.

   The argument is evaluated up to three times, so it must not
   have side effects. Pass a byte value in the range 0..0xff
   (e.g. an unsigned char), as isalnum() requires. */
76
77	#ifdef __cplusplus
78	extern "C"
79	#endif
80	int inaword (const u_char here, const u_char end);
81	/* Takes the place of the old INAWORD macro. It determines
82	whether a given place in a UTF-8 encoded Unicode string
83	is part of a word. */
84
85
86	const unsigned char ParseIndexWord (const unsigned char textHere,
87	const unsigned char *textEnd,
88	UCArray &word);
89	const unsigned char ParseIndexMGWord (const unsigned char textHere,
90	const unsigned char *textEnd,
91	unsigned char *mgWord);
92	const unsigned char ParseNonindexWord (const unsigned char textHere,
93	const unsigned char *textEnd);
94
95
96
/* =========================================================================
 * Macro: PARSE_WORD
 * Description:
 *      Extract a word out for compressing text.
 *      Characters are consumed while they are Unicode letters or digits,
 *      the word stays within MAXWORDLEN bytes, and no more than
 *      MAXNUMERIC digits have been seen. A multi-byte character is
 *      copied whole or not at all.
 * Input:
 *      s_in = string start in buffer (must be a modifiable pointer lvalue)
 *      end = string end in buffer
 * Output:
 *      Word = extracted word with length in 1st byte
 *             (buffer must hold at least MAXWORDLEN+1 bytes)
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      The arguments are evaluated more than once and must not have
 *      side effects. The expansion declares locals named charlength,
 *      wptr, length, numeric and c; do not pass expressions that use
 *      those names.
 * ========================================================================= */
#define PARSE_WORD(Word, s_in, end) \
  do { \
    int charlength = 0; \
    u_char *wptr = (Word)+1; \
    int length = 0; \
    int numeric = 0; \
    unsigned short c; \
    \
    charlength = parse_utf8_char((s_in),(end),&c); \
    \
    /* charlength > 0 is tested before c is examined, so c is only */ \
    /* read after a character has been parsed successfully */ \
    while (length+charlength <= MAXWORDLEN && charlength > 0 && \
           (is_unicode_letter(c) || (is_unicode_digit(c) && \
            ++numeric <= MAXNUMERIC))) { \
      /* copy every byte of the character just accepted */ \
      while (charlength-- > 0) { \
        *wptr++ = *(s_in)++; length++; \
      } \
      charlength = parse_utf8_char((s_in),(end),&c); \
    } \
    *(Word) = length; \
  }while(0)
128
129
130
/* =========================================================================
 * Macro: PARSE_NON_WORD
 * Description:
 *      Extract a non-word out for storing compressed text.
 *      Characters are consumed while they are NOT Unicode letters or
 *      digits and the non-word stays within MAXWORDLEN bytes. A
 *      multi-byte character is copied whole or not at all.
 * Input:
 *      s_in = string start in buffer (must be a modifiable pointer lvalue)
 *      end = string end in buffer
 * Output:
 *      Word = extracted non-word with length in 1st byte
 *             (buffer must hold at least MAXWORDLEN+1 bytes)
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      The arguments are evaluated more than once and must not have
 *      side effects. The expansion declares locals named charlength,
 *      wptr, length and c; do not pass expressions that use those names.
 * ========================================================================= */
#define PARSE_NON_WORD(Word, s_in, end) \
  do { \
    int charlength = 0; \
    u_char *wptr = (Word)+1; \
    int length = 0; \
    unsigned short c; \
    \
    charlength = parse_utf8_char((s_in),(end),&c); \
    \
    /* charlength > 0 is tested before c is examined, so c is only */ \
    /* read after a character has been parsed successfully */ \
    while (length+charlength <= MAXWORDLEN && charlength > 0 && \
           !is_unicode_letdig(c)) { \
      /* copy every byte of the character just accepted */ \
      while (charlength-- > 0) { \
        *wptr++ = *(s_in)++; length++; \
      } \
      charlength = parse_utf8_char((s_in),(end),&c); \
    } \
    *(Word) = length; \
  }while(0)

Note: See TracBrowser for help on using the repository browser.

Download in other formats: