Context Navigation

source: trunk/indexers/mg/src/text/words.h@ 13660

Last change on this file since 13660 was 13660, checked in by kjdon, 17 years ago
added some x++ -> ++x changes submitted by Emanuel Dejanu
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 18.6 KB

Rev	Line
[3745]	1	/**************************************************************************
	2	*
	3	* words.h -- Macros for parsing out words from the source text
	4	* Copyright (C) 1994 Neil Sharman
	5	*
	6	* This program is free software; you can redistribute it and/or modify
	7	* it under the terms of the GNU General Public License as published by
	8	* the Free Software Foundation; either version 2 of the License, or
	9	* (at your option) any later version.
	10	*
	11	* This program is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	* GNU General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU General Public License
	17	* along with this program; if not, write to the Free Software
	18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	19	*
	20	* $Id: words.h 13660 2007-01-16 23:13:03Z kjdon $
	21	*
	22	**************************************************************************/
	23
[7228]	24
[3745]	25	#include "sysfuncs.h"
	26	#include "unitool.h"
	27
	28	/*
	29	* This has been cleaned up by Tim Shimmin.
	30	*/
	31
	32	/*
	33	* ---NOTE---
	34	*
	35	* "WORD" refers to a word in the compressed text.
	36	* "STEM" or "STEM_WORD" refers to a word for indexing on
	37	*
	38	*/
	39
	40	#define MAXWORDLEN 15
	41	/* Maximum length in bytes of any word or non-word. Note that
	42	variations to MAXWORDLEN may have dramatic effects on the rest
	43	of the program, as the length and the prefix match are packed
	44	together into a four bit nibble, and there is not check that
	45	this is possible, i.e., leave MAXWORDLEN alone... */
	46
	47	#define MAXSTEMLEN 255
	48	/* Maximum length in bytes of any stem. Note that
	49	variations to MAXSTEMLEN may have dramatic effects on the rest
	50	of the program, , i.e., leave MAXSTEMLEN alone... */
	51
[7228]	52	/#define MAXNUMERIC 4/
	53
[3745]	54	/* Maximum number of numeric characters permitted in a word.
	55	This avoids long sequences of numbers creating just one
	56	word occurrence for each number. At most 10,000 all numeric
	57	words will be permitted. */
	58
	59	/* [RPAP - Jan 97: Stem Index Change] */
	60	#define MAXPARAMLEN 20
	61	/* Maximum number of bytes to read for a parameter value for a
	62	term in a query. */
	63	#define WEIGHTPARAM '/'
	64	#define STEMPARAM '#'
	65
	66	/* [RJM 07/97: Ranked Required Terms] */
	67	#define MUSTMATCHPARAM '+'
	68
	69
	70	#define PESINAWORD(c) (isalnum(c) \|\| ((c) >= 0x80 && (c) <= 0xff))
	71	/* The definition of what characters are permitted in a word.
	72	This macro is pessimistic, you cannot tell from a particular
	73	byte above 0x80 whether it is a character or not. This function
	74	is needed by various functions relating to huffman coding
	75	where frequency counts need to be primed, it should not be
	76	used in parsing the UTF-8 encoded input. */
	77
	78	int inaword (const u_char here, const u_char end);
	79	/* Takes the place of the old INAWORD macro. It determines
	80	whether a given place in a UTF-8 encoded Unicode string
	81	is part of a word. */
	82
[8694]	83	int isaspace (const u_char here, const u_char end);
	84	/* It determines whether a given place in a UTF-8 encoded
	85	Unicode string is a unicode space. */
[13660]	86
[8694]	87	u_char skipspace(u_char here, u_char *end);
[13660]	88	/* Return a the UTF-8 encoded Unicode string with beginning
	89	unicode spaces skipped. */
	90
[8694]	91
/* =========================================================================
 * Macro: PARSE_WORD
 * Description:
 *      Extract a word out for compressing text.
 *      UTF-8 characters are copied while each one is a unicode letter,
 *      or a unicode digit that does not take the digit count of this
 *      word above the "maxnumeric" setting (default 4), and while the
 *      whole character still fits within MAXWORDLEN bytes.
 * Input:
 *      s_in = string start in buffer
 *      end  = string end in buffer
 * Output:
 *      Word = extracted word with length in 1st byte
 *             (needs room for MAXWORDLEN+1 bytes)
 *      s_in = ptr to next character in buffer yet to be processed
 * Note:
 *      The arguments are evaluated several times; s_in must be a
 *      modifiable pointer lvalue.
 * ========================================================================= */
#define PARSE_WORD(Word, s_in, end)                                     \
  do {                                                                  \
    register int charlength = 0;                                        \
    register u_char *wptr = (Word)+1;                                   \
    register int length = 0;                                            \
    register int numeric = 0;                                           \
    unsigned short c;                                                   \
    register int maxnumeric = IntEnv (GetEnv ("maxnumeric"), 4);        \
                                                                        \
    charlength = parse_utf8_char((s_in),(end),&c);                      \
                                                                        \
    while (length+charlength <= MAXWORDLEN && charlength > 0 &&         \
           (is_unicode_letter(c) || (is_unicode_digit(c) &&             \
                                     ++numeric <= maxnumeric))) {       \
      /* copy every byte of the character that was just accepted */     \
      while (charlength-- > 0) {                                        \
        *wptr++ = *(s_in)++; ++length;                                  \
      }                                                                 \
      charlength = parse_utf8_char((s_in),(end),&c);                    \
    }                                                                   \
    *(Word) = length;                                                   \
  } while(0)
	124
	125	/*
	126	#define PARSE_WORD(Word, s_in, end) \
	127	do { \
	128	register u_char *wptr = (Word)+1; \
	129	register int length = 0; \
	130	register int c = *(s_in); \
	131	register int numeric = 0; \
	132	\
	133	while( length < MAXWORDLEN && INAWORD(c) && (s_in)<=(end)) \
	134	{ \
	135	if ((numeric += INNUMBER(c)) > MAXNUMERIC) \
	136	break; \
	137	*wptr++ = c; \
[13660]	138	++length; \
[3745]	139	c = *++(s_in); \
	140	} \
	141	*(Word) = length; \
	142	}while(0)
	143	*/
	144
	145
	146	/* =========================================================================
	147	* Macro: PARSE_NON_WORD
	148	* Description:
	149	* Extract a non-word out for storing compressed text
	150	* Input: as above
	151	* Output: as above
	152	* ========================================================================= */
	153	#define PARSE_NON_WORD(Word, s_in, end) \
	154	do { \
	155	register int charlength = 0; \
	156	register u_char *wptr = (Word)+1; \
	157	register int length = 0; \
	158	unsigned short c; \
	159	\
	160	charlength = parse_utf8_char((s_in),(end),&c); \
	161	\
	162	while (length+charlength <= MAXWORDLEN && charlength > 0 && \
	163	!is_unicode_letdig(c)) { \
	164	while (charlength-- > 0) { \
[13660]	165	wptr++ = (s_in)++; ++length; \
[3745]	166	} \
	167	charlength = parse_utf8_char((s_in),(end),&c); \
	168	} \
	169	*(Word) = length; \
	170	}while(0)
	171
	172	/*
	173	#define PARSE_NON_WORD(Word, s_in, end) \
	174	do { \
	175	register u_char *wptr = (Word)+1; \
	176	register int length = 0; \
	177	register int c = *(s_in); \
	178	\
	179	while( length < MAXWORDLEN && !INAWORD(c) && (s_in)<=(end) ) \
	180	{ \
	181	*wptr++ = c; \
[13660]	182	++length; \
[3745]	183	c = *++(s_in); \
	184	} \
	185	*(Word) = length; \
	186	}while(0)
	187	*/
	188
	189
	190	/* =========================================================================
	191	* Macro: PARSE_STEM_WORD
	192	* Description:
	193	* Extracts out Word.
	194	* Input:
	195	* s_in points to 1st letter in buffer to test
	196	* end points to last letter in buffer
	197	* Output:
	198	* s_in is modified to move to next word
	199	* Returns Word filled in with length in 1st byte.
	200	* ========================================================================= */
	201	#define PARSE_STEM_WORD(Word, s_in, end) \
	202	do { \
	203	register int charlength = 0; \
	204	register u_char *wptr = (Word)+1; \
	205	register int length = 0; \
	206	register int numeric = 0; \
	207	unsigned short c; \
[7228]	208	register int maxnumeric = IntEnv (GetEnv ("maxnumeric"), 4); \
[3745]	209	\
	210	charlength = parse_utf8_char((s_in),(end),&c); \
	211	\
	212	while (length+charlength <= MAXSTEMLEN && charlength > 0 && \
	213	(is_unicode_letter(c) \|\| (is_unicode_digit(c) && \
[7228]	214	++numeric <= maxnumeric))) { \
[3745]	215	while (charlength-- > 0) { \
[13660]	216	wptr++ = (s_in)++; ++length; \
[3745]	217	} \
	218	charlength = parse_utf8_char((s_in),(end),&c); \
	219	} \
	220	*(Word) = length; \
	221	}while(0)
	222	/*
	223	#define PARSE_STEM_WORD(Word, s_in, end) \
	224	do \
	225	{ \
	226	register u_char *wptr = (Word)+1; \
	227	register int length = 0; \
	228	register int c = *(s_in); \
	229	register int numeric = 0; \
	230	\
	231	while ( length < MAXSTEMLEN && INAWORD(c) && (s_in)<=(end)) \
	232	{ \
	233	if ((numeric += INNUMBER(c)) > MAXNUMERIC) \
	234	break; \
	235	*wptr++ = c; \
[13660]	236	++length; \
[3745]	237	c = *++(s_in); \
	238	} \
	239	*(Word) = length; \
	240	}while(0)
	241	*/
	242
	243
	244	/* =========================================================================
	245	* Macro: PARSE_NON_STEM_WORD
	246	* Description:
	247	* Eat up non-word. Do not store non-word.
	248	* It is not needed in index only in text !
	249	*
	250	* Input: as above but no Word needed
	251	* Output: as above
	252	* ========================================================================= */
	253	#define PARSE_NON_STEM_WORD(s_in, end) \
	254	do { \
	255	register int charlength = 0; \
	256	unsigned short c; \
	257	\
	258	charlength = parse_utf8_char((s_in),(end),&c); \
	259	\
	260	while (charlength > 0 && !is_unicode_letdig(c)) { \
	261	(s_in) += charlength; \
	262	charlength = parse_utf8_char((s_in),(end),&c); \
	263	} \
	264	}while(0)
	265
	266	/*
	267	#define PARSE_NON_STEM_WORD(s_in, end) \
	268	do \
	269	{ \
	270	while (!INAWORD(*(s_in)) && (s_in)<=(end)) \
	271	(s_in)++; \
	272	}while(0)
	273	*/
	274
	275
	276	/* =========================================================================
	277	* Macro: PARSE_NON_STEM_WORD_OR_SGML_TAG
	278	* Description:
	279	* Like PARSE_NON_STEM_WORD but also eats up SGML tags
	280	* Input: as above
	281	* Output: as above
	282	* ========================================================================= */
	283	#define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end) \
	284	do { \
	285	register int charlength = 0; \
	286	unsigned short c; \
	287	\
	288	charlength = parse_utf8_char((s_in),(end),&c); \
	289	\
	290	while (charlength > 0 && !is_unicode_letdig(c)) { \
	291	if (c == '<') { \
	292	while (charlength > 0 && c != '>') { \
	293	(s_in) += charlength; \
	294	charlength = parse_utf8_char((s_in),(end),&c); \
	295	} \
	296	} \
	297	(s_in) += charlength; \
	298	charlength = parse_utf8_char((s_in),(end),&c); \
	299	} \
	300	}while(0)
	301
	302	/*
	303	#define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end) \
	304	do \
	305	{ \
	306	register int c = *(s_in); \
	307	\
	308	while (!INAWORD(c) && (s_in)<=(end)) \
	309	{ \
	310	if (c == '<') \
	311	{ \
	312	while (c != '>' && (s_in)<=(end)) \
	313	c = *++(s_in); \
	314	} \
	315	if ((s_in)<=(end)) \
	316	c = *++(s_in); \
	317	} \
	318	}while(0)
	319	*/
	320
	321
	322	/* =========================================================================
	323	* Macro: PARSE_OPT_TERM_PARAM [RPAP - Jan 97: Stem Index Change]
	324	* Description:
	325	* Extracts out optional paramater for query term.
	326	* Needed only in parsing the query line !
	327	*
	328	* Note: that this function has not been converted to use UTF-8
	329	* as it should still work as it is (only uses ascii
	330	* characters)
	331	*
	332	* Input: as above but no Word needed
	333	* Output: as above
	334	* ========================================================================= */
	335	#define PARSE_OPT_TERM_PARAM(Param, type, s_in, end) \
	336	do \
	337	{ \
	338	register u_char *wptr = (Param); \
	339	register int length = 0; \
	340	register int c = *(s_in); \
	341	\
	342	if (c == WEIGHTPARAM \|\| c == STEMPARAM) \
	343	{ \
	344	type = c; \
	345	c = *++(s_in); \
	346	while( length < MAXPARAMLEN && isdigit(c) && (s_in)<=(end)) \
	347	{ \
	348	*wptr++ = c; \
[13660]	349	++length; \
[3745]	350	c = *++(s_in); \
	351	} \
	352	*wptr = '\0'; \
	353	for (; isdigit(c) && (s_in)<=(end); c = *++(s_in)) \
	354	; \
	355	} \
	356	}while(0)
	357
	358	/* =========================================================================
	359	* Macro: PARSE_RANKED_NON_STEM_WORD [RJM 07/97: Ranked Required Terms]
	360	* Description:
	361	* Eat up non-word. Do not store non-word.
	362	* If come across a match requirement store it in require_match
	363	* It is not needed in index only in text !
	364	*
	365	* Input: as above
	366	* Output: the requirement mode for the next term. -1=must not match,
	367	* 0=optional match, 1=must match
	368	* ========================================================================= */
	369	#define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end) \
	370	do { \
	371	register int charlength = 0; \
	372	unsigned short c; \
	373	(require_match) = 0; \
	374	\
	375	charlength = parse_utf8_char((s_in),(end),&c); \
	376	\
	377	while (charlength > 0 && !is_unicode_letdig(c)) { \
	378	if (c == MUSTMATCHPARAM) (require_match) = 1; \
	379	(s_in) += charlength; \
	380	charlength = parse_utf8_char((s_in),(end),&c); \
	381	} \
	382	}while(0)
	383
	384	/*
	385	#define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end) \
	386	do { \
	387	(require_match) = 0; \
	388	while (!INAWORD(*(s_in)) && (s_in)<=(end)) { \
	389	if (*(s_in) == MUSTMATCHPARAM) { \
	390	(require_match) = 1; \
	391	} \
	392	(s_in)++; \
	393	} \
	394	} while (0)
	395	*/

Note: See TracBrowser for help on using the repository browser.

Download in other formats: