Context Navigation

source: trunk/indexers/mgpp/text/words.cpp@ 13715

Last change on this file since 13715 was 13715, checked in by kjdon, 17 years ago
renamed inaword, isaspace and skipspace so they don't conflict with mg versions
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 3.7 KB

Line
1	/**************************************************************************
2	*
3	* words.cpp -- Functions for parsing out words from the source text
4	* Copyright (C) 1999 Rodger McNab
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	**************************************************************************/
21
22	// need this to avoid bizarre compiler problems under VC++ 6.0
23	#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
24	# include <iostream>
25	#endif
26
27	#include "words.h"
28
/* Upper bound on the number of digit characters ParseIndexWord and
   ParseIndexMGWord will accept into a single word. The count covers
   every digit seen in the word, not just a consecutive run. Left
   non-const so it stays an ordinary assignable global. */
int MAXNUMERIC = 4;
30
31	/* Takes the place of the old INAWORD macro. It determines
32	whether a given place in a UTF-8 encoded Unicode string
33	is part of a word. */
34	int inaword_mgpp (const u_char here, const u_char end) {
35	unsigned short c;
36	if (parse_utf8_char(here, end, &c) > 0) return is_unicode_letdig (c);
37	return 0;
38	}
39
40	/* It determines whether a given place in a UTF-8 encoded Unicode string is a unicode space. */
41	int isaspace_mgpp (const u_char here, const u_char end)
42	{
43	unsigned short c;
44	if (parse_utf8_char(here, end, &c) > 0) return is_unicode_space(c);
45	return 0;
46	}
47
48	/* Return a the UTF-8 encoded Unicode string with begining
49	unicode spaces skippend. */
50	u_char skipspace_mgpp(u_char here, u_char *end)
51	{
52	unsigned short c;
53	int length;
54	while(here != end) {
55	length = parse_utf8_char(here, end, &c);
56	if (length == 0 \|\| !is_unicode_space(c)) break;
57	here += length;
58	}
59	return here;
60	}
61
62	const unsigned char ParseIndexWord (const unsigned char textHere,
63	const unsigned char *textEnd,
64	UCArray &word) {
65	word.erase (word.begin(), word.end());
66
67	register int charlength = 0;
68	register int length = 0;
69	register int numeric = 0;
70	unsigned short c;
71
72	charlength = parse_utf8_char (textHere, textEnd, &c);
73
74	while (length+charlength <= MAXSTEMLEN && charlength > 0 &&
75	(is_unicode_letter(c) \|\| (is_unicode_digit(c) &&
76	++numeric <= MAXNUMERIC))) {
77	while (charlength-- > 0) {
78	word.push_back (*textHere++); ++length;
79	}
80	charlength = parse_utf8_char (textHere, textEnd, &c);
81	}
82
83	return textHere;
84	}
85
86	const unsigned char ParseIndexMGWord (const unsigned char textHere,
87	const unsigned char *textEnd,
88	unsigned char *mgWord) {
89	register int charlength = 0;
90	register int length = 0;
91	register int numeric = 0;
92	unsigned short c;
93
94	charlength = parse_utf8_char (textHere, textEnd, &c);
95
96	while (length+charlength <= MAXSTEMLEN && charlength > 0 &&
97	(is_unicode_letter(c) \|\| (is_unicode_digit(c) &&
98	++numeric <= MAXNUMERIC))) {
99	while (charlength-- > 0) {
100	mgWord[++length] = *textHere++;
101	}
102	charlength = parse_utf8_char (textHere, textEnd, &c);
103	}
104
105	mgWord[0] = length;
106
107	return textHere;
108	}
109
110	const unsigned char ParseNonindexWord (const unsigned char textHere,
111	const unsigned char *textEnd) {
112	register int charlength = 0;
113	unsigned short c;
114
115	charlength = parse_utf8_char(textHere, textEnd, &c);
116
117	while (charlength > 0 && !is_unicode_letdig(c)) {
118	textHere += charlength;
119	charlength = parse_utf8_char (textHere, textEnd, &c);
120	}
121
122	return textHere;
123	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: