source: gsdl/trunk/trunk/mgpp/text/words.h@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago

Undoing change commited in r16582

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.8 KB
Line 
1/**************************************************************************
2 *
3 * words.h -- Macros for parsing out words from the source text
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21#ifndef H_WORDS
22#define H_WORDS
23
24
25#include "sysfuncs.h"
26
27#include "unitool.h"
28#include "UCArray.h"
29
30
31/*
32 * This has been cleaned up by Tim Shimmin.
33 */
34
35/*
36 * ---NOTE---
37 *
38 * "WORD" refers to a word in the compressed text.
39 * "STEM" or "STEM_WORD" refers to a word for indexing on
40 *
41 */
42
#define MAXWORDLEN 15
  /* Maximum length in bytes of any word or non-word.  Note that
     variations to MAXWORDLEN may have dramatic effects on the rest
     of the program, as the length and the prefix match are packed
     together into a four bit nibble, and there is no check that
     this is possible, i.e., leave MAXWORDLEN alone... */
49
#define MAXSTEMLEN 255
  /* Maximum length in bytes of any stem.  Note that variations to
     MAXSTEMLEN may have dramatic effects on the rest of the
     program, i.e., leave MAXSTEMLEN alone... */
54
extern int MAXNUMERIC;
  /* Maximum number of numeric characters permitted in a word.
     This avoids long sequences of numbers creating just one
     word occurrence for each number.

     This used to be the compile-time constant 4 (so that at most
     10,000 all numeric words were permitted); it is now a variable
     that is only declared here and must be defined in exactly one
     source file. */
62
/* [RPAP - Jan 97: Stem Index Change] */
#define MAXPARAMLEN 20
  /* Maximum number of bytes to read for a parameter value for a
     term in a query. */
#define WEIGHTPARAM '/'
  /* NOTE(review): presumably the character that introduces a weight
     parameter on a query term - confirm against the query parser. */
#define STEMPARAM '#'
  /* NOTE(review): presumably the character that introduces a stemming
     parameter on a query term - confirm against the query parser. */

/* [RJM 07/97: Ranked Required Terms] */
#define MUSTMATCHPARAM '+'
  /* NOTE(review): presumably the character that marks a query term as
     required ("must match") - confirm against the query parser. */
72
73
#define PESINAWORD(c) \
  (((c) >= 0x80 && (c) <= 0xff) || ((c) >= 0 && (c) < 0x80 && isalnum(c)))
  /* The definition of what characters are permitted in a word.
     This macro is pessimistic: you cannot tell from a particular
     byte at or above 0x80 whether it is a character or not, so every
     such byte is accepted.  It is needed by various functions relating
     to huffman coding where frequency counts need to be primed; it
     should not be used in parsing the UTF-8 encoded input.

     c is expected to be a byte value in the range 0..0xff.  Any other
     value yields 0 and is never handed to isalnum(), whose behaviour
     is undefined for arguments that are neither EOF nor representable
     as an unsigned char.  The argument is evaluated more than once, so
     it must not have side effects. */
81
82#ifdef __cplusplus
83extern "C" {
84#endif
85int inaword_mgpp (const u_char *here, const u_char *end);
86 /* Takes the place of the old INAWORD macro. It determines
87 whether a given place in a UTF-8 encoded Unicode string
88 is part of a word. */
89
90int isaspace_mgpp (const u_char *here, const u_char *end);
91 /* It determines whether a given place in a UTF-8 encoded Unicode string is a unicode space. */
92
93u_char *skipspace_mgpp(u_char *here, u_char *end);
94 /* Return a the UTF-8 encoded Unicode string with begining unicode spaces skippend. */
95
96#ifdef __cplusplus
97}
98#endif
99
100const unsigned char *ParseIndexWord (const unsigned char *textHere,
101 const unsigned char *textEnd,
102 UCArray &word);
103const unsigned char *ParseIndexMGWord (const unsigned char *textHere,
104 const unsigned char *textEnd,
105 unsigned char *mgWord);
106const unsigned char *ParseNonindexWord (const unsigned char *textHere,
107 const unsigned char *textEnd);
108
109
110
/* =========================================================================
 * Macro: PARSE_WORD
 * Description:
 *      Extract a word out for compressing text.  UTF-8 characters are
 *      copied from s_in while they are Unicode letters or digits.
 *      Copying stops at the first character that is neither, at a digit
 *      once MAXNUMERIC digits have already been taken, when the next
 *      character would take the word past MAXWORDLEN bytes, or when
 *      parse_utf8_char() returns a length that is not positive.
 * Input:
 *      s_in = string start in buffer; must be a modifiable pointer lvalue
 *      end  = string end in buffer
 * Output:
 *      Word = extracted word with length in 1st byte; the buffer needs
 *             room for MAXWORDLEN+1 bytes
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      Word, s_in and end are evaluated more than once.
 *      The macro's own variables carry a pw_ prefix so that they cannot
 *      capture an argument that happens to be called c, length, wptr, ...
 *      The `register' storage class is not used: it is ill-formed in
 *      C++17 and this header is included from C++.
 * ========================================================================= */
#define PARSE_WORD(Word, s_in, end) \
  do { \
    u_char *pw_out = (Word) + 1;  /* byte 0 is kept for the length */ \
    int pw_len = 0;               /* bytes copied so far */ \
    int pw_digits = 0;            /* digits seen so far */ \
    unsigned short pw_ch = 0;     /* character decoded by parse_utf8_char */ \
    int pw_charlen = parse_utf8_char ((s_in), (end), &pw_ch); \
    \
    /* pw_ch is only examined after pw_charlen has tested positive. */ \
    while (pw_len + pw_charlen <= MAXWORDLEN && pw_charlen > 0 && \
           (is_unicode_letter (pw_ch) || \
            (is_unicode_digit (pw_ch) && ++pw_digits <= MAXNUMERIC))) { \
      /* Copy every byte of the character that was just accepted. */ \
      while (pw_charlen-- > 0) { \
        *pw_out++ = *(s_in)++; \
        ++pw_len; \
      } \
      pw_charlen = parse_utf8_char ((s_in), (end), &pw_ch); \
    } \
    *(Word) = pw_len; \
  } while (0)
142
143
144
/* =========================================================================
 * Macro: PARSE_NON_WORD
 * Description:
 *      Extract a non-word out for storing compressed text.  UTF-8
 *      characters are copied from s_in while they are not Unicode
 *      letters or digits.  Copying stops at the first letter or digit,
 *      when the next character would take the non-word past MAXWORDLEN
 *      bytes, or when parse_utf8_char() returns a length that is not
 *      positive.
 * Input:
 *      s_in = string start in buffer; must be a modifiable pointer lvalue
 *      end  = string end in buffer
 * Output:
 *      Word = extracted non-word with length in 1st byte; the buffer
 *             needs room for MAXWORDLEN+1 bytes
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      Word, s_in and end are evaluated more than once.
 *      The macro's own variables carry a pnw_ prefix so that they cannot
 *      capture an argument that happens to be called c, length, wptr, ...
 *      The `register' storage class is not used: it is ill-formed in
 *      C++17 and this header is included from C++.
 * ========================================================================= */
#define PARSE_NON_WORD(Word, s_in, end) \
  do { \
    u_char *pnw_out = (Word) + 1;  /* byte 0 is kept for the length */ \
    int pnw_len = 0;               /* bytes copied so far */ \
    unsigned short pnw_ch = 0;     /* character decoded by parse_utf8_char */ \
    int pnw_charlen = parse_utf8_char ((s_in), (end), &pnw_ch); \
    \
    /* pnw_ch is only examined after pnw_charlen has tested positive. */ \
    while (pnw_len + pnw_charlen <= MAXWORDLEN && pnw_charlen > 0 && \
           !is_unicode_letdig (pnw_ch)) { \
      /* Copy every byte of the character that was just accepted. */ \
      while (pnw_charlen-- > 0) { \
        *pnw_out++ = *(s_in)++; \
        ++pnw_len; \
      } \
      pnw_charlen = parse_utf8_char ((s_in), (end), &pnw_ch); \
    } \
    *(Word) = pnw_len; \
  } while (0)
170
171#endif
Note: See TracBrowser for help on using the repository browser.