Context Navigation

source: trunk/mgpp/text/words.h@ 10912

Last change on this file since 10912 was 9613, checked in by kjdon, 19 years ago
added in x++ -> ++x changes submitted by Emanuel Dejanu
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 6.8 KB

Rev	Line
[3365]	1	/**************************************************************************
	2	*
	3	* words.h -- Macros for parsing out words from the source text
	4	* Copyright (C) 1994 Neil Sharman
	5	*
	6	* This program is free software; you can redistribute it and/or modify
	7	* it under the terms of the GNU General Public License as published by
	8	* the Free Software Foundation; either version 2 of the License, or
	9	* (at your option) any later version.
	10	*
	11	* This program is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	* GNU General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU General Public License
	17	* along with this program; if not, write to the Free Software
	18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	19	*
	20	**************************************************************************/
	21
	22	#include "sysfuncs.h"
	23
	24	#include "unitool.h"
	25	#include "UCArray.h"
	26
	27
	28	/*
	29	* This has been cleaned up by Tim Shimmin.
	30	*/
	31
	32	/*
	33	* ---NOTE---
	34	*
	35	* "WORD" refers to a word in the compressed text.
	36	* "STEM" or "STEM_WORD" refers to a word for indexing on
	37	*
	38	*/
	39
	40	#define MAXWORDLEN 15
	41	/* Maximum length in bytes of any word or non-word. Note that
	42	variations to MAXWORDLEN may have dramatic effects on the rest
	43	of the program, as the length and the prefix match are packed
	44	together into a four bit nibble (15 is the largest value four
	45	bits can hold), and there is no check that this is possible,
	45	i.e., leave MAXWORDLEN alone... PARSE_WORD and PARSE_NON_WORD
	45	below stop copying once this many bytes have been taken. */
	46
	47	#define MAXSTEMLEN 255
	48	/* Maximum length in bytes of any stem. Note that
	49	variations to MAXSTEMLEN may have dramatic effects on the rest
	50	of the program, i.e., leave MAXSTEMLEN alone... */
	51
	52	#define MAXNUMERIC 4
	53	/* Maximum number of numeric characters permitted in a word.
	54	This avoids long sequences of numbers creating just one
	55	word occurrence for each number. At most 10,000 all numeric
	56	words will be permitted. PARSE_WORD below enforces this limit
	56	by counting the digits it accepts. */
	57
	58	/* [RPAP - Jan 97: Stem Index Change] */
	59	#define MAXPARAMLEN 20
	60	/* Maximum number of bytes to read for a parameter value for a
	61	term in a query. */
	62	#define WEIGHTPARAM '/' /* NOTE(review): presumably the character that introduces a weight parameter on a query term -- confirm against the query parser */
	63	#define STEMPARAM '#' /* NOTE(review): presumably the character that introduces a stemming parameter on a query term -- confirm against the query parser */
	64
	65	/* [RJM 07/97: Ranked Required Terms] */
	66	#define MUSTMATCHPARAM '+' /* NOTE(review): presumably marks a query term that must match -- confirm against the query parser */
	67
	68
	#define PESINAWORD(c) (isalnum(c) || ((c) >= 0x80 && (c) <= 0xff))
	/* The definition of what characters are permitted in a word.
	   This macro is pessimistic: you cannot tell from a particular
	   byte at or above 0x80 whether it is part of a word character
	   or not, so every such byte is accepted. It is needed by various
	   functions relating to huffman coding where frequency counts need
	   to be primed; it should not be used in parsing the UTF-8 encoded
	   input.
	   The argument is evaluated up to three times, so it must not have
	   side effects. It is handed straight to isalnum(), so it must be a
	   value in the unsigned char range (or EOF), not a negative char. */
	76
	77	#ifdef __cplusplus
[8692]	78	extern "C" {
[3365]	79	#endif
	80	int inaword (const u_char here, const u_char end);
	81	/* Takes the place of the old INAWORD macro. It determines
	82	whether a given place in a UTF-8 encoded Unicode string
	83	is part of a word. */
	84
[8692]	85	int isaspace (const u_char here, const u_char end);
	86	/* It determines whether a given place in a UTF-8 encoded Unicode string is a unicode space. */
[3365]	87
[8692]	88	u_char skipspace(u_char here, u_char *end);
	89	/* Return a the UTF-8 encoded Unicode string with begining unicode spaces skippend. */
	90
	91	#ifdef __cplusplus
	92	}
	93	#endif
	94
[3365]	95	const unsigned char ParseIndexWord (const unsigned char textHere,
	96	const unsigned char *textEnd,
	97	UCArray &word);
	98	const unsigned char ParseIndexMGWord (const unsigned char textHere,
	99	const unsigned char *textEnd,
	100	unsigned char *mgWord);
	101	const unsigned char ParseNonindexWord (const unsigned char textHere,
	102	const unsigned char *textEnd);
	103
	104
	105
	/* =========================================================================
	 * Macro: PARSE_WORD
	 * Description:
	 *   Extract a word out for compressing text. Characters are decoded one
	 *   at a time with parse_utf8_char() and their bytes copied for as long
	 *   as the character is a Unicode letter, or a Unicode digit while no
	 *   more than MAXNUMERIC digits have been taken, and the copied length
	 *   stays within MAXWORDLEN bytes.
	 * Input:
	 *   s_in = string start in buffer (must be a modifiable pointer lvalue)
	 *   end  = string end in buffer
	 * Output:
	 *   Word = extracted word with length in 1st byte; up to MAXWORDLEN
	 *          further bytes are written after the length byte
	 *   s_in = ptr to next character in buffer yet to be processed
	 * Notes:
	 *   - parse_utf8_char() is used as returning the number of bytes of the
	 *     decoded character, with a value <= 0 ending the word.
	 *   - The arguments are expanded more than once and the expansion
	 *     declares locals named charlength, wptr, length, numeric and c:
	 *     do not pass expressions with side effects or using those names.
	 * ========================================================================= */
	#define PARSE_WORD(Word, s_in, end)                                        \
	  do {                                                                     \
	    int charlength = 0;                                                    \
	    u_char *wptr = (Word)+1;                                               \
	    int length = 0;                                                        \
	    int numeric = 0;                                                       \
	    unsigned short c;                                                      \
	                                                                           \
	    charlength = parse_utf8_char((s_in),(end),&c);                         \
	                                                                           \
	    /* c is only inspected after charlength > 0 has been established */    \
	    while (length+charlength <= MAXWORDLEN && charlength > 0 &&            \
	           (is_unicode_letter(c) || (is_unicode_digit(c) &&                \
	                                     ++numeric <= MAXNUMERIC))) {          \
	      /* copy every byte of the accepted character */                      \
	      while (charlength-- > 0) {                                           \
	        *wptr++ = *(s_in)++; ++length;                                     \
	      }                                                                    \
	      charlength = parse_utf8_char((s_in),(end),&c);                       \
	    }                                                                      \
	    *(Word) = length;                                                      \
	  } while(0)
	137
	138
	139
	/* =========================================================================
	 * Macro: PARSE_NON_WORD
	 * Description:
	 *   Extract a non-word out for storing compressed text. Characters are
	 *   decoded one at a time with parse_utf8_char() and their bytes copied
	 *   for as long as the character is not a Unicode letter or digit and
	 *   the copied length stays within MAXWORDLEN bytes.
	 * Input:
	 *   s_in = string start in buffer (must be a modifiable pointer lvalue)
	 *   end  = string end in buffer
	 * Output:
	 *   Word = extracted non-word with length in 1st byte; up to MAXWORDLEN
	 *          further bytes are written after the length byte
	 *   s_in = ptr to next character in buffer yet to be processed
	 * Notes:
	 *   - parse_utf8_char() is used as returning the number of bytes of the
	 *     decoded character, with a value <= 0 ending the non-word.
	 *   - The arguments are expanded more than once and the expansion
	 *     declares locals named charlength, wptr, length and c: do not pass
	 *     expressions with side effects or using those names.
	 * ========================================================================= */
	#define PARSE_NON_WORD(Word, s_in, end)                                    \
	  do {                                                                     \
	    int charlength = 0;                                                    \
	    u_char *wptr = (Word)+1;                                               \
	    int length = 0;                                                        \
	    unsigned short c;                                                      \
	                                                                           \
	    charlength = parse_utf8_char((s_in),(end),&c);                         \
	                                                                           \
	    /* c is only inspected after charlength > 0 has been established */    \
	    while (length+charlength <= MAXWORDLEN && charlength > 0 &&            \
	           !is_unicode_letdig(c)) {                                        \
	      /* copy every byte of the accepted character */                      \
	      while (charlength-- > 0) {                                           \
	        *wptr++ = *(s_in)++; ++length;                                     \
	      }                                                                    \
	      charlength = parse_utf8_char((s_in),(end),&c);                       \
	    }                                                                      \
	    *(Word) = length;                                                      \
	  } while(0)

Note: See TracBrowser for help on using the repository browser.

Download in other formats: