source: trunk/indexers/mg/src/text/stemmer.c@ 3745

Last change on this file since 3745 was 3745, checked in by mdewsnip, 21 years ago

Addition of MG package for search and retrieval

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 3.9 KB
Line 
1/**************************************************************************
2 *
3 * stemmer.c -- The stemmer/case folder
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: stemmer.c 3745 2003-02-20 21:20:24Z mdewsnip $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25#include "stemmer.h"
26
27#include "lovinstem.h"
28#include "simplefrenchstem.h"
29#include "unitool.h"
30
31/*
32 $Log$
33 Revision 1.1 2003/02/20 21:18:24 mdewsnip
34 Addition of MG package for search and retrieval
35
36 Revision 1.1 1999/08/10 21:18:23 sjboddie
37 renamed mg-1.3d directory mg
38
39 Revision 1.3 1998/12/17 09:12:54 rjmcnab
40
41 Altered mg to process utf-8 encoded Unicode. The main changes
42 are in the parsing of the input, the casefolding, and the stemming.
43
44 Revision 1.2 1998/11/25 07:55:51 rjmcnab
45
46 Modified mg so that you can specify the stemmer you want
47 to use via a command line option. You specify it to
48 mg_passes during the build process. The number of the
49 stemmer that you used is stored within the inverted
50 dictionary header and the stemmed dictionary header so
51 the correct stemmer is used in later stages of building
52 and querying.
53
54 Revision 1.1 1998/11/17 09:35:42 rjmcnab
55 *** empty log message ***
56
57 * Revision 1.3 1994/10/20 03:57:05 tes
58 * I have rewritten the boolean query optimiser and abstracted out the
59 * components of the boolean query.
60 *
61 * Revision 1.2 1994/09/20 04:42:10 tes
62 * For version 1.1
63 *
64 */
65
/* Revision identification string; the $Id$ keyword is filled in by the
 * version-control system.  It is not referenced anywhere else in this
 * file. */
static char *RCSID = "$Id: stemmer.c 3745 2003-02-20 21:20:24Z mdewsnip $";


/* Stemmer numbers: the values returned by stemmernumber() and
 * dispatched on by stemmer(). */
#define LOVINSTEMMER 0
#define SIMPLEFRENCHSTEMMER 1
71
72
73/* decode the utf-8 encoded unicode, casefold and then recode
74 * making sure the final length doesn't exceed the original
75 * length */
76static void unicode_casefold (u_char *word) {
77 unsigned short out[256]; /* temp space */
78 int i;
79 int len;
80
81 /* decode */
82 utf8_word_to_unicode (word, out, 255);
83 len = out[0];
84
85 /* casefold and simplify-fold */
86 for (i=0; i<len; i++) {
87 out[i+1] = unicode_tosimplified(unicode_tolower(out[i+1]));
88 }
89
90 /* re-code */
91 unicode_to_utf8_word (out, word, word[0]+1);
92}
93
94
95int stemmernumber (u_char *stemmerdescription) {
96 u_char descript[MAX_STEM_DESCRIPTION_LEN];
97 int i;
98
99 /* copy and case-fold the description */
100 for (i=0; (stemmerdescription[i] != '\0') &&
101 (i < MAX_STEM_DESCRIPTION_LEN-1); i++)
102 descript[i] = tolower (stemmerdescription[i]);
103 descript[i] = '\0';
104
105 /* map the description to its number */
106
107 if ((strcmp (descript, "0") == 0) ||
108 (strcmp (descript, "english") == 0) ||
109 (strcmp (descript, "lovin") == 0))
110 return LOVINSTEMMER;
111
112 if ((strcmp (descript, "1") == 1) ||
113 (strcmp (descript, "french") == 0) ||
114 (strcmp (descript, "simplefrench") == 0))
115 return SIMPLEFRENCHSTEMMER;
116
117 return -1;
118}
119
120
121
122/*
123 * Method 0 - Do not stem or case fold.
124 * Method 1 - Case fold.
125 * Method 2 - Stem.
126 * Method 3 - Case fold and stem.
127 *
128 * The stemmer number should be obtained using
129 * the stemmernumber function above.
130 */
131void
132stemmer (int method, int stemmer, u_char *word) {
133 if (method & 1) {
134 unicode_casefold (word);
135 }
136
137 if (method & 2) {
138 switch (stemmer) {
139 case LOVINSTEMMER: lovinstem (word);
140 break;
141 case SIMPLEFRENCHSTEMMER: simplefrenchstem (word);
142 break;
143 }
144 }
145}
Note: See TracBrowser for help on using the repository browser.