Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: indexers/trunk/mgpp/text/words.h@ 18717

Last change on this file since 18717 was 16583, checked in by davidb, 16 years ago
Undoing change commited in r16582
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 6.8 KB

Line
1	/**************************************************************************
2	*
3	* words.h -- Macros for parsing out words from the source text
4	* Copyright (C) 1994 Neil Sharman
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	**************************************************************************/
21	#ifndef H_WORDS
22	#define H_WORDS
23
24
25	#include "sysfuncs.h"
26
27	#include "unitool.h"
28	#include "UCArray.h"
29
30
31	/*
32	* This has been cleaned up by Tim Shimmin.
33	*/
34
35	/*
36	* ---NOTE---
37	*
38	* "WORD" refers to a word in the compressed text.
39	* "STEM" or "STEM_WORD" refers to a word for indexing on
40	*
41	*/
42
43	#define MAXWORDLEN 15
44	/* Maximum length in bytes of any word or non-word. Note that
45	variations to MAXWORDLEN may have dramatic effects on the rest
46	of the program, as the length and the prefix match are packed
47	together into a four bit nibble, and there is no check that
48	this is possible, i.e., leave MAXWORDLEN alone... */
49
50	#define MAXSTEMLEN 255
51	/* Maximum length in bytes of any stem. Note that
52	variations to MAXSTEMLEN may have dramatic effects on the rest
53	of the program, i.e., leave MAXSTEMLEN alone... */
54
55	//#define MAXNUMERIC 4
56	/* Maximum number of numeric characters permitted in a word.
57	This avoids long sequences of numbers creating just one
58	word occurrence for each number. At most 10,000 all numeric
59	words will be permitted. */
60
61	extern int MAXNUMERIC;
62
63	/* [RPAP - Jan 97: Stem Index Change] */
64	#define MAXPARAMLEN 20
65	/* Maximum number of bytes to read for a parameter value for a
66	term in a query. */
67	#define WEIGHTPARAM '/'
68	#define STEMPARAM '#'
69
70	/* [RJM 07/97: Ranked Required Terms] */
71	#define MUSTMATCHPARAM '+'
72
73
#define PESINAWORD(c) (isalnum(c) || ((c) >= 0x80 && (c) <= 0xff))
/* The definition of what characters are permitted in a word.
   This macro is pessimistic: you cannot tell from a particular
   byte at or above 0x80 whether it is part of a word character
   or not, so every such byte is accepted. It is needed by various
   functions relating to huffman coding where frequency counts need
   to be primed; it should not be used in parsing the UTF-8 encoded
   input.
   Note: the argument is evaluated up to three times, so it must not
   have side effects. */
81
#ifdef __cplusplus
extern "C" {
#endif
int inaword_mgpp (const u_char *here, const u_char *end);
/* Takes the place of the old INAWORD macro. It determines
   whether a given place in a UTF-8 encoded Unicode string
   is part of a word. */

int isaspace_mgpp (const u_char *here, const u_char *end);
/* Determines whether a given place in a UTF-8 encoded Unicode
   string is a Unicode space. */

u_char *skipspace_mgpp(u_char *here, u_char *end);
/* Returns the UTF-8 encoded Unicode string with the beginning
   Unicode spaces skipped. */

#ifdef __cplusplus
}
#endif
99
100	const unsigned char ParseIndexWord (const unsigned char textHere,
101	const unsigned char *textEnd,
102	UCArray &word);
103	const unsigned char ParseIndexMGWord (const unsigned char textHere,
104	const unsigned char *textEnd,
105	unsigned char *mgWord);
106	const unsigned char ParseNonindexWord (const unsigned char textHere,
107	const unsigned char *textEnd);
108
109
110
/* =========================================================================
 * Macro: PARSE_WORD
 * Description:
 *      Extract a word out for compressing text
 * Input:
 *      s_in = string start in buffer
 *      end = string end in buffer
 * Output:
 *      Word = extracted word with length in 1st byte
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      - Characters are copied while they are Unicode letters, or Unicode
 *        digits as long as no more than MAXNUMERIC digits have been seen,
 *        and while the total byte length stays <= MAXWORDLEN. Word must
 *        therefore have room for MAXWORDLEN + 1 bytes.
 *      - s_in is evaluated several times and is advanced by the macro, so
 *        it must be a modifiable pointer lvalue without side effects.
 *      - A character is only examined when parse_utf8_char reports a
 *        positive length for it.
 * ========================================================================= */
#define PARSE_WORD(Word, s_in, end)                                           \
  do {                                                                        \
    int charlength = 0;                                                       \
    u_char *wptr = (Word)+1;                                                  \
    int length = 0;                                                           \
    int numeric = 0;                                                          \
    unsigned short c;                                                         \
                                                                              \
    charlength = parse_utf8_char((s_in),(end),&c);                            \
                                                                              \
    while (length+charlength <= MAXWORDLEN && charlength > 0 &&               \
           (is_unicode_letter(c) || (is_unicode_digit(c) &&                   \
                                     ++numeric <= MAXNUMERIC))) {             \
      while (charlength-- > 0) {                                              \
        *wptr++ = *(s_in)++; ++length;                                        \
      }                                                                       \
      charlength = parse_utf8_char((s_in),(end),&c);                          \
    }                                                                         \
    *(Word) = length;                                                         \
  }while(0)
142
143
144
/* =========================================================================
 * Macro: PARSE_NON_WORD
 * Description:
 *      Extract a non-word out for storing compressed text
 * Input:
 *      s_in = string start in buffer
 *      end = string end in buffer
 * Output:
 *      Word = extracted non-word with length in 1st byte
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      - Characters are copied while they are not Unicode letters or
 *        digits and while the total byte length stays <= MAXWORDLEN.
 *        Word must therefore have room for MAXWORDLEN + 1 bytes.
 *      - s_in is evaluated several times and is advanced by the macro, so
 *        it must be a modifiable pointer lvalue without side effects.
 * ========================================================================= */
#define PARSE_NON_WORD(Word, s_in, end)                                       \
  do {                                                                        \
    int charlength = 0;                                                       \
    u_char *wptr = (Word)+1;                                                  \
    int length = 0;                                                           \
    unsigned short c;                                                         \
                                                                              \
    charlength = parse_utf8_char((s_in),(end),&c);                            \
                                                                              \
    while (length+charlength <= MAXWORDLEN && charlength > 0 &&               \
           !is_unicode_letdig(c)) {                                           \
      while (charlength-- > 0) {                                              \
        *wptr++ = *(s_in)++; ++length;                                        \
      }                                                                       \
      charlength = parse_utf8_char((s_in),(end),&c);                          \
    }                                                                         \
    *(Word) = length;                                                         \
  }while(0)
170
171	#endif

Note: See TracBrowser for help on using the repository browser.

Download in other formats: