source: trunk/indexers/mg/src/text/stemmer.c@ 7627

Last change on this file since 7627 was 7627, checked in by kjdon, 20 years ago

make stemmerdescription a const

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 4.0 KB
Line 
1/**************************************************************************
2 *
3 * stemmer.c -- The stemmer/case folder
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: stemmer.c 7627 2004-06-22 04:17:06Z kjdon $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25#include "stemmer.h"
26
27#include "lovinstem.h"
28#include "simplefrenchstem.h"
29#include "unitool.h"
30
31/*
32 $Log$
33 Revision 1.2 2004/06/22 04:17:06 kjdon
34 make stemmerdescription a const
35
36 Revision 1.1 2003/02/20 21:18:24 mdewsnip
37 Addition of MG package for search and retrieval
38
39 Revision 1.1 1999/08/10 21:18:23 sjboddie
40 renamed mg-1.3d directory mg
41
42 Revision 1.3 1998/12/17 09:12:54 rjmcnab
43
44 Altered mg to process utf-8 encoded Unicode. The main changes
45 are in the parsing of the input, the casefolding, and the stemming.
46
47 Revision 1.2 1998/11/25 07:55:51 rjmcnab
48
49 Modified mg so that you can specify the stemmer you want
50 to use via a command line option. You specify it to
51 mg_passes during the build process. The number of the
52 stemmer that you used is stored within the inverted
53 dictionary header and the stemmed dictionary header so
54 the correct stemmer is used in later stages of building
55 and querying.
56
57 Revision 1.1 1998/11/17 09:35:42 rjmcnab
58 *** empty log message ***
59
60 * Revision 1.3 1994/10/20 03:57:05 tes
61 * I have rewritten the boolean query optimiser and abstracted out the
62 * components of the boolean query.
63 *
64 * Revision 1.2 1994/09/20 04:42:10 tes
65 * For version 1.1
66 *
67 */
68
/* Revision identification string; its contents are the expanded
 * $Id$ keyword for this file.  It is not referenced elsewhere in the
 * code visible here. */
69static char *RCSID = "$Id: stemmer.c 7627 2004-06-22 04:17:06Z kjdon $";
70
71
/* Stemmer numbers: the values returned by stemmernumber() and
 * dispatched on by stemmer() below. */
72#define LOVINSTEMMER 0
73#define SIMPLEFRENCHSTEMMER 1
74
75
76/* decode the utf-8 encoded unicode, casefold and then recode
77 * making sure the final length doesn't exceed the original
78 * length */
79static void unicode_casefold (u_char *word) {
80 unsigned short out[256]; /* temp space */
81 int i;
82 int len;
83
84 /* decode */
85 utf8_word_to_unicode (word, out, 255);
86 len = out[0];
87
88 /* casefold and simplify-fold */
89 for (i=0; i<len; i++) {
90 out[i+1] = unicode_tosimplified(unicode_tolower(out[i+1]));
91 }
92
93 /* re-code */
94 unicode_to_utf8_word (out, word, word[0]+1);
95}
96
97
98int stemmernumber (const u_char *stemmerdescription) {
99 u_char descript[MAX_STEM_DESCRIPTION_LEN];
100 int i;
101
102 /* copy and case-fold the description */
103 for (i=0; (stemmerdescription[i] != '\0') &&
104 (i < MAX_STEM_DESCRIPTION_LEN-1); i++)
105 descript[i] = tolower (stemmerdescription[i]);
106 descript[i] = '\0';
107
108 /* map the description to its number */
109
110 if ((strcmp (descript, "0") == 0) ||
111 (strcmp (descript, "english") == 0) ||
112 (strcmp (descript, "lovin") == 0))
113 return LOVINSTEMMER;
114
115 if ((strcmp (descript, "1") == 1) ||
116 (strcmp (descript, "french") == 0) ||
117 (strcmp (descript, "simplefrench") == 0))
118 return SIMPLEFRENCHSTEMMER;
119
120 return -1;
121}
122
123
124
125/*
126 * Method 0 - Do not stem or case fold.
127 * Method 1 - Case fold.
128 * Method 2 - Stem.
129 * Method 3 - Case fold and stem.
130 *
131 * The stemmer number should be obtained using
132 * the stemmernumber function above.
133 */
134void
135stemmer (int method, int stemmer, u_char *word) {
136 if (method & 1) {
137 unicode_casefold (word);
138 }
139
140 if (method & 2) {
141 switch (stemmer) {
142 case LOVINSTEMMER: lovinstem (word);
143 break;
144 case SIMPLEFRENCHSTEMMER: simplefrenchstem (word);
145 break;
146 }
147 }
148}
Note: See TracBrowser for help on using the repository browser.