Context Navigation

source: trunk/indexers/mgpp/text/words.h@ 12331

Last change on this file since 12331 was 12321, checked in by kjdon, 18 years ago
made MAXNUMERIC a global variable instead of a #define. It's now a command line arg to mgpp_passes, and an option in Queryer, and a parameter to ParseQuery
Property svn:executable set to `*`; Property svn:keywords set to `Author Date Id Revision`
File size: 6.8 KB

Rev	Line
[3365]	1	/**************************************************************************
	2	*
	3	* words.h -- Macros for parsing out words from the source text
	4	* Copyright (C) 1994 Neil Sharman
	5	*
	6	* This program is free software; you can redistribute it and/or modify
	7	* it under the terms of the GNU General Public License as published by
	8	* the Free Software Foundation; either version 2 of the License, or
	9	* (at your option) any later version.
	10	*
	11	* This program is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	* GNU General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU General Public License
	17	* along with this program; if not, write to the Free Software
	18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	19	*
	20	**************************************************************************/
[12321]	21	#ifndef H_WORDS
	22	#define H_WORDS
[3365]	23
[12321]	24
[3365]	25	#include "sysfuncs.h"
	26
	27	#include "unitool.h"
	28	#include "UCArray.h"
	29
	30
	31	/*
	32	* This has been cleaned up by Tim Shimmin.
	33	*/
	34
	35	/*
	36	* ---NOTE---
	37	*
	38	* "WORD" refers to a word in the compressed text.
	39	* "STEM" or "STEM_WORD" refers to a word for indexing on
	40	*
	41	*/
	42
	43	#define MAXWORDLEN 15
	44	/* Maximum length in bytes of any word or non-word. Note that
	45	variations to MAXWORDLEN may have dramatic effects on the rest
	46	of the program, as the length and the prefix match are packed
	47	together into a four bit nibble, and there is no check that
	48	this is possible, i.e., leave MAXWORDLEN alone... */
	49
	50	#define MAXSTEMLEN 255
	51	/* Maximum length in bytes of any stem. Note that
	52	variations to MAXSTEMLEN may have dramatic effects on the rest
	53	of the program, i.e., leave MAXSTEMLEN alone... */
	54
[12321]	55	//#define MAXNUMERIC 4
[3365]	56	/* Maximum number of numeric characters permitted in a word.
	57	This avoids long sequences of numbers creating just one
	58	word occurrence for each number. At most 10,000 all numeric
	59	words will be permitted. */
	60
[12321]	61	extern int MAXNUMERIC;
	62
[3365]	63	/* [RPAP - Jan 97: Stem Index Change] */
	64	#define MAXPARAMLEN 20
	65	/* Maximum number of bytes to read for a parameter value for a
	66	term in a query. */
	67	#define WEIGHTPARAM '/'
	68	#define STEMPARAM '#'
	69
	70	/* [RJM 07/97: Ranked Required Terms] */
	71	#define MUSTMATCHPARAM '+'
	72
	73
	74	#define PESINAWORD(c) (isalnum(c) || ((c) >= 0x80 && (c) <= 0xff))
	75	/* The definition of what characters are permitted in a word.
	76	This macro is pessimistic, you cannot tell from a particular
	77	byte above 0x80 whether it is a character or not. This function
	78	is needed by various functions relating to huffman coding
	79	where frequency counts need to be primed, it should not be
	80	used in parsing the UTF-8 encoded input. */
	81
	82	#ifdef __cplusplus
[8692]	83	extern "C" {
[3365]	84	#endif
	85	int inaword (const u_char *here, const u_char *end);
	86	/* Takes the place of the old INAWORD macro. It determines
	87	whether a given place in a UTF-8 encoded Unicode string
	88	is part of a word. */
	89
[8692]	90	int isaspace (const u_char *here, const u_char *end);
	91	/* It determines whether a given place in a UTF-8 encoded Unicode string is a unicode space. */
[3365]	92
[8692]	93	u_char *skipspace(u_char *here, u_char *end);
	94	/* Return the UTF-8 encoded Unicode string with beginning unicode spaces skipped. */
	95
	96	#ifdef __cplusplus
	97	}
	98	#endif
	99
[3365]	100	const unsigned char *ParseIndexWord (const unsigned char *textHere,
	101	const unsigned char *textEnd,
	102	UCArray &word);
	103	const unsigned char *ParseIndexMGWord (const unsigned char *textHere,
	104	const unsigned char *textEnd,
	105	unsigned char *mgWord);
	106	const unsigned char *ParseNonindexWord (const unsigned char *textHere,
	107	const unsigned char *textEnd);
	108
	109
	110
	111	/* =========================================================================
	112	* Macro: PARSE_WORD
	113	* Description:
	114	* Extract a word (Unicode letters plus at most MAXNUMERIC digits, at most MAXWORDLEN bytes) for compressing text
	115	* Input:
	116	* s_in = string start in buffer (must be a modifiable pointer; it is advanced and evaluated more than once)
	117	* end = string end in buffer
	118	* Output:
	119	* Word = extracted word with length in 1st byte (up to MAXWORDLEN+1 bytes are written)
	120	* s_in = ptr to next character in buffer yet to be processed
	121	* ========================================================================= */
	122	#define PARSE_WORD(Word, s_in, end) \
	123	do { \
	124	register int charlength = 0; \
	125	register u_char *wptr = (Word)+1; \
	126	register int length = 0; \
	127	register int numeric = 0; \
	128	unsigned short c; \
	129	\
	130	charlength = parse_utf8_char((s_in),(end),&c); \
	131	\
	132	while (length+charlength <= MAXWORDLEN && charlength > 0 && \
	133	(is_unicode_letter(c) || (is_unicode_digit(c) && \
	134	++numeric <= MAXNUMERIC))) { \
	135	while (charlength-- > 0) { \
[9613]	136	*wptr++ = *(s_in)++; ++length; \
[3365]	137	} \
	138	charlength = parse_utf8_char((s_in),(end),&c); \
	139	} \
	140	*(Word) = length; \
	141	}while(0)
	142
	143
	144
	145	/* =========================================================================
	146	* Macro: PARSE_NON_WORD
	147	* Description:
	148	* Extract a non-word (characters for which is_unicode_letdig() is false, at most MAXWORDLEN bytes) for storing compressed text
	149	* Input: s_in = string start in buffer (modifiable pointer, advanced by the macro), end = string end in buffer
	150	* Output: Word = extracted non-word with length in 1st byte, s_in = ptr to next character yet to be processed
	151	* ========================================================================= */
	152	#define PARSE_NON_WORD(Word, s_in, end) \
	153	do { \
	154	register int charlength = 0; \
	155	register u_char *wptr = (Word)+1; \
	156	register int length = 0; \
	157	unsigned short c; \
	158	\
	159	charlength = parse_utf8_char((s_in),(end),&c); \
	160	\
	161	while (length+charlength <= MAXWORDLEN && charlength > 0 && \
	162	!is_unicode_letdig(c)) { \
	163	while (charlength-- > 0) { \
[9613]	164	*wptr++ = *(s_in)++; ++length; \
[3365]	165	} \
	166	charlength = parse_utf8_char((s_in),(end),&c); \
	167	} \
	168	*(Word) = length; \
	169	}while(0)
[12321]	170
	171	#endif

Note: See TracBrowser for help on using the repository browser.

Download in other formats: