Context Navigation

source: trunk/indexers/mgpp/text/words.cpp@ 13715

Last change on this file since 13715 was 13715, checked in by kjdon, 17 years ago
renamed inaword, isaspace and skipspace so they don't conflict with mg versions
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 3.7 KB

Rev	Line
[3365]	1	/**************************************************************************
	2	*
	3	* words.cpp -- Functions for parsing out words from the source text
	4	* Copyright (C) 1999 Rodger McNab
	5	*
	6	* This program is free software; you can redistribute it and/or modify
	7	* it under the terms of the GNU General Public License as published by
	8	* the Free Software Foundation; either version 2 of the License, or
	9	* (at your option) any later version.
	10	*
	11	* This program is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	* GNU General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU General Public License
	17	* along with this program; if not, write to the Free Software
	18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	19	*
	20	**************************************************************************/
	21
	22	// need this to avoid bizarre compiler problems under VC++ 6.0
	23	#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
	24	# include <iostream>
	25	#endif
	26
	27	#include "words.h"
	28
/* Upper bound on the number of digit characters accepted into a single
   index word by ParseIndexWord and ParseIndexMGWord.  Kept as a plain
   (non-const) global so that it keeps external linkage and stays
   assignable, exactly as in the original definition. */
int MAXNUMERIC = 4;
[3365]	30
	31	/* Takes the place of the old INAWORD macro. It determines
	32	whether a given place in a UTF-8 encoded Unicode string
	33	is part of a word. */
[13715]	34	int inaword_mgpp (const u_char here, const u_char end) {
[3365]	35	unsigned short c;
	36	if (parse_utf8_char(here, end, &c) > 0) return is_unicode_letdig (c);
	37	return 0;
	38	}
	39
[8692]	40	/* It determines whether a given place in a UTF-8 encoded Unicode string is a unicode space. */
[13715]	41	int isaspace_mgpp (const u_char here, const u_char end)
[8692]	42	{
	43	unsigned short c;
	44	if (parse_utf8_char(here, end, &c) > 0) return is_unicode_space(c);
	45	return 0;
	46	}
	47
	48	/* Return a the UTF-8 encoded Unicode string with begining
	49	unicode spaces skippend. */
[13715]	50	u_char skipspace_mgpp(u_char here, u_char *end)
[8692]	51	{
	52	unsigned short c;
	53	int length;
	54	while(here != end) {
	55	length = parse_utf8_char(here, end, &c);
	56	if (length == 0 \|\| !is_unicode_space(c)) break;
	57	here += length;
	58	}
	59	return here;
	60	}
	61
[3365]	62	const unsigned char ParseIndexWord (const unsigned char textHere,
	63	const unsigned char *textEnd,
	64	UCArray &word) {
	65	word.erase (word.begin(), word.end());
	66
	67	register int charlength = 0;
	68	register int length = 0;
	69	register int numeric = 0;
	70	unsigned short c;
	71
	72	charlength = parse_utf8_char (textHere, textEnd, &c);
	73
	74	while (length+charlength <= MAXSTEMLEN && charlength > 0 &&
	75	(is_unicode_letter(c) \|\| (is_unicode_digit(c) &&
	76	++numeric <= MAXNUMERIC))) {
	77	while (charlength-- > 0) {
[8692]	78	word.push_back (*textHere++); ++length;
[3365]	79	}
	80	charlength = parse_utf8_char (textHere, textEnd, &c);
	81	}
	82
	83	return textHere;
	84	}
	85
	86	const unsigned char ParseIndexMGWord (const unsigned char textHere,
	87	const unsigned char *textEnd,
	88	unsigned char *mgWord) {
	89	register int charlength = 0;
	90	register int length = 0;
	91	register int numeric = 0;
	92	unsigned short c;
	93
	94	charlength = parse_utf8_char (textHere, textEnd, &c);
	95
	96	while (length+charlength <= MAXSTEMLEN && charlength > 0 &&
	97	(is_unicode_letter(c) \|\| (is_unicode_digit(c) &&
	98	++numeric <= MAXNUMERIC))) {
	99	while (charlength-- > 0) {
	100	mgWord[++length] = *textHere++;
	101	}
	102	charlength = parse_utf8_char (textHere, textEnd, &c);
	103	}
	104
	105	mgWord[0] = length;
	106
	107	return textHere;
	108	}
	109
	110	const unsigned char ParseNonindexWord (const unsigned char textHere,
	111	const unsigned char *textEnd) {
	112	register int charlength = 0;
	113	unsigned short c;
	114
	115	charlength = parse_utf8_char(textHere, textEnd, &c);
	116
	117	while (charlength > 0 && !is_unicode_letdig(c)) {
	118	textHere += charlength;
	119	charlength = parse_utf8_char (textHere, textEnd, &c);
	120	}
	121
	122	return textHere;
	123	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: