source: trunk/gsdl/src/mgpp/text/words.cpp@ 879

Last change on this file since 879 was 856, checked in by sjboddie, 24 years ago

Rodger's new C++ mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 2.9 KB
Line 
1/**************************************************************************
2 *
3 * words.cpp -- Functions for parsing out words from the source text
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: words.cpp 856 2000-01-14 02:26:25Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "words.h"
25
26
27/* Takes the place of the old INAWORD macro. It determines
28 whether a given place in a UTF-8 encoded Unicode string
29 is part of a word. */
30int inaword (const u_char *here, const u_char *end) {
31 unsigned short c;
32 if (parse_utf8_char(here, end, &c) > 0) return is_unicode_letdig (c);
33 return 0;
34}
35
36const unsigned char *ParseIndexWord (const unsigned char *textHere,
37 const unsigned char *textEnd,
38 UCArray &word) {
39 word.erase (word.begin(), word.end());
40
41 register int charlength = 0;
42 register int length = 0;
43 register int numeric = 0;
44 unsigned short c;
45
46 charlength = parse_utf8_char (textHere, textEnd, &c);
47
48 while (length+charlength <= MAXSTEMLEN && charlength > 0 &&
49 (is_unicode_letter(c) || (is_unicode_digit(c) &&
50 ++numeric <= MAXNUMERIC))) {
51 while (charlength-- > 0) {
52 word.push_back (*textHere++); length++;
53 }
54 charlength = parse_utf8_char (textHere, textEnd, &c);
55 }
56
57 return textHere;
58}
59
60const unsigned char *ParseIndexMGWord (const unsigned char *textHere,
61 const unsigned char *textEnd,
62 unsigned char *mgWord) {
63 register int charlength = 0;
64 register int length = 0;
65 register int numeric = 0;
66 unsigned short c;
67
68 charlength = parse_utf8_char (textHere, textEnd, &c);
69
70 while (length+charlength <= MAXSTEMLEN && charlength > 0 &&
71 (is_unicode_letter(c) || (is_unicode_digit(c) &&
72 ++numeric <= MAXNUMERIC))) {
73 while (charlength-- > 0) {
74 mgWord[++length] = *textHere++;
75 }
76 charlength = parse_utf8_char (textHere, textEnd, &c);
77 }
78
79 mgWord[0] = length;
80
81 return textHere;
82}
83
84const unsigned char *ParseNonindexWord (const unsigned char *textHere,
85 const unsigned char *textEnd) {
86 register int charlength = 0;
87 unsigned short c;
88
89 charlength = parse_utf8_char(textHere, textEnd, &c);
90
91 while (charlength > 0 && !is_unicode_letdig(c)) {
92 textHere += charlength;
93 charlength = parse_utf8_char (textHere, textEnd, &c);
94 }
95
96 return textHere;
97}
Note: See TracBrowser for help on using the repository browser.