source: trunk/mgpp/text/words.h@ 10912

Last change on this file since 10912 was 9613, checked in by kjdon, 19 years ago

added in x++ -> ++x changes submitted by Emanuel Dejanu

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.8 KB
Line 
1/**************************************************************************
2 *
3 * words.h -- Macros for parsing out words from the source text
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "sysfuncs.h"
23
24#include "unitool.h"
25#include "UCArray.h"
26
27
28/*
29 * This has been cleaned up by Tim Shimmin.
30 */
31
32/*
33 * ---NOTE---
34 *
35 * "WORD" refers to a word in the compressed text.
36 * "STEM" or "STEM_WORD" refers to a word for indexing on
37 *
38 */
39
40#define MAXWORDLEN 15
41 /* Maximum length in bytes of any word or non-word. Note that
42 variations to MAXWORDLEN may have dramatic effects on the rest
43 of the program, as the length and the prefix match are packed
44 together into a four bit nibble, and there is no check that
45 this is possible, i.e., leave MAXWORDLEN alone... */
46
47#define MAXSTEMLEN 255
48 /* Maximum length in bytes of any stem. Note that
49 variations to MAXSTEMLEN may have dramatic effects on the rest
50 of the program, i.e., leave MAXSTEMLEN alone... */
51
52#define MAXNUMERIC 4
53 /* Maximum number of numeric characters permitted in a word.
54 This avoids long sequences of numbers creating just one
55 word occurrence for each number. At most 10,000 all numeric
56 words will be permitted. */
57
58/* [RPAP - Jan 97: Stem Index Change] */
59#define MAXPARAMLEN 20
60 /* Maximum number of bytes to read for a parameter value for a
61 term in a query. */
62#define WEIGHTPARAM '/'
63#define STEMPARAM '#'
64
65/* [RJM 07/97: Ranked Required Terms] */
66#define MUSTMATCHPARAM '+'
67
68
#define PESINAWORD(c) (isalnum((unsigned char)(c)) || ((c) >= 0x80 && (c) <= 0xff))
	/* The definition of what characters are permitted in a word.
	   This macro is pessimistic: you cannot tell from a particular
	   byte at or above 0x80 whether it is a character or not, so
	   every value in 0x80..0xff is accepted.  It is needed by
	   various functions relating to huffman coding where frequency
	   counts need to be primed; it should not be used in parsing
	   the UTF-8 encoded input.
	   The argument is converted to unsigned char before it reaches
	   isalnum() because the <ctype.h> functions are only defined
	   for values representable as unsigned char (or EOF).  The
	   range test still sees the argument unconverted.
	   Note that (c) is evaluated more than once, so do not pass an
	   expression with side effects. */
76
77#ifdef __cplusplus
78extern "C" {
79#endif
80int inaword (const u_char *here, const u_char *end);
81 /* Takes the place of the old INAWORD macro. It determines
82 whether a given place in a UTF-8 encoded Unicode string
83 is part of a word. */
84
85int isaspace (const u_char *here, const u_char *end);
86 /* It determines whether a given place in a UTF-8 encoded Unicode string is a unicode space. */
87
88u_char *skipspace(u_char *here, u_char *end);
89 /* Returns the UTF-8 encoded Unicode string with beginning unicode spaces skipped. */
90
91#ifdef __cplusplus
92}
93#endif
94
95const unsigned char *ParseIndexWord (const unsigned char *textHere,
96 const unsigned char *textEnd,
97 UCArray &word);
98const unsigned char *ParseIndexMGWord (const unsigned char *textHere,
99 const unsigned char *textEnd,
100 unsigned char *mgWord);
101const unsigned char *ParseNonindexWord (const unsigned char *textHere,
102 const unsigned char *textEnd);
103
104
105
/* =========================================================================
 * Macro: PARSE_WORD
 * Description:
 *      Extract a word out for compressing text.  Whole characters, as
 *      delimited by parse_utf8_char(), are copied from s_in while
 *        - the character is a unicode letter, or a unicode digit that does
 *          not push the count of digits in this word past MAXNUMERIC, and
 *        - all the bytes of the character still fit within MAXWORDLEN.
 * Input:
 *      s_in = string start in buffer.  Must be a modifiable pointer lvalue:
 *             it is evaluated several times and advanced in place.
 *      end  = string end in buffer (passed straight to parse_utf8_char)
 * Output:
 *      Word = extracted word with length in 1st byte, word bytes following;
 *             the buffer therefore needs room for MAXWORDLEN+1 bytes.
 *             Word itself is evaluated exactly once.
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      All locals carry a "pw_" prefix and a trailing underscore so that
 *      argument expressions such as `c' or `length' are not captured by
 *      the expansion.  No `register' storage class is used because it is
 *      not accepted by C++17 compilers.
 * ========================================================================= */
#define PARSE_WORD(Word, s_in, end)                                         \
  do {                                                                      \
    u_char *pw_word_ = (Word);                                              \
    u_char *pw_out_ = pw_word_ + 1;                                         \
    int pw_len_ = 0;                                                        \
    int pw_digits_ = 0;                                                     \
    unsigned short pw_ch_ = 0;                                              \
    int pw_chlen_ = parse_utf8_char((s_in), (end), &pw_ch_);                \
                                                                            \
    while (pw_len_ + pw_chlen_ <= MAXWORDLEN && pw_chlen_ > 0 &&            \
           (is_unicode_letter(pw_ch_) ||                                    \
            (is_unicode_digit(pw_ch_) && ++pw_digits_ <= MAXNUMERIC))) {    \
      for (; pw_chlen_ > 0; --pw_chlen_) {                                  \
        *pw_out_++ = *(s_in)++;                                             \
        ++pw_len_;                                                          \
      }                                                                     \
      pw_chlen_ = parse_utf8_char((s_in), (end), &pw_ch_);                  \
    }                                                                       \
    *pw_word_ = (u_char) pw_len_; /* never exceeds MAXWORDLEN */            \
  } while (0)
137
138
139
/* =========================================================================
 * Macro: PARSE_NON_WORD
 * Description:
 *      Extract a non-word out for storing compressed text.  Whole
 *      characters, as delimited by parse_utf8_char(), are copied from s_in
 *      while the character is not a unicode letter or digit and all of its
 *      bytes still fit within MAXWORDLEN.
 * Input:
 *      s_in = string start in buffer.  Must be a modifiable pointer lvalue:
 *             it is evaluated several times and advanced in place.
 *      end  = string end in buffer (passed straight to parse_utf8_char)
 * Output:
 *      Word = extracted non-word with length in 1st byte, bytes following;
 *             the buffer therefore needs room for MAXWORDLEN+1 bytes.
 *             Word itself is evaluated exactly once.
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      All locals carry a "pnw_" prefix and a trailing underscore so that
 *      argument expressions such as `c' or `length' are not captured by
 *      the expansion.  No `register' storage class is used because it is
 *      not accepted by C++17 compilers.
 * ========================================================================= */
#define PARSE_NON_WORD(Word, s_in, end)                                     \
  do {                                                                      \
    u_char *pnw_word_ = (Word);                                             \
    u_char *pnw_out_ = pnw_word_ + 1;                                       \
    int pnw_len_ = 0;                                                       \
    unsigned short pnw_ch_ = 0;                                             \
    int pnw_chlen_ = parse_utf8_char((s_in), (end), &pnw_ch_);              \
                                                                            \
    while (pnw_len_ + pnw_chlen_ <= MAXWORDLEN && pnw_chlen_ > 0 &&         \
           !is_unicode_letdig(pnw_ch_)) {                                   \
      for (; pnw_chlen_ > 0; --pnw_chlen_) {                                \
        *pnw_out_++ = *(s_in)++;                                            \
        ++pnw_len_;                                                         \
      }                                                                     \
      pnw_chlen_ = parse_utf8_char((s_in), (end), &pnw_ch_);                \
    }                                                                       \
    *pnw_word_ = (u_char) pnw_len_; /* never exceeds MAXWORDLEN */          \
  } while (0)
Note: See TracBrowser for help on using the repository browser.