source: trunk/gsdl/packages/mg/src/text/stemmer.c@ 1014

Last change on this file since 1014 was 439, checked in by sjboddie, 25 years ago

renamed mg-1.3d directory mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 3.8 KB
Line 
1/**************************************************************************
2 *
3 * stemmer.c -- The stemmer/case folder
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: stemmer.c 439 1999-08-10 21:23:37Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25#include "stemmer.h"
26
27#include "lovinstem.h"
28#include "simplefrenchstem.h"
29#include "unitool.h"
30
31/*
32 $Log$
33 Revision 1.1 1999/08/10 21:18:23 sjboddie
34 renamed mg-1.3d directory mg
35
36 Revision 1.3 1998/12/17 09:12:54 rjmcnab
37
38 Altered mg to process utf-8 encoded Unicode. The main changes
39 are in the parsing of the input, the casefolding, and the stemming.
40
41 Revision 1.2 1998/11/25 07:55:51 rjmcnab
42
43 Modified mg so that you can specify the stemmer you want
44 to use via a command line option. You specify it to
45 mg_passes during the build process. The number of the
46 stemmer that you used is stored within the inverted
47 dictionary header and the stemmed dictionary header so
48 the correct stemmer is used in later stages of building
49 and querying.
50
51 Revision 1.1 1998/11/17 09:35:42 rjmcnab
52 *** empty log message ***
53
54 * Revision 1.3 1994/10/20 03:57:05 tes
55 * I have rewritten the boolean query optimiser and abstracted out the
56 * components of the boolean query.
57 *
58 * Revision 1.2 1994/09/20 04:42:10 tes
59 * For version 1.1
60 *
61 */
62
/* RCS/SVN identification string for this file; it is not referenced by
 * any of the code visible in this file. */
static char *RCSID = "$Id: stemmer.c 439 1999-08-10 21:23:37Z sjboddie $";


/* Stemmer numbers: the values returned by stemmernumber() and accepted
 * as the `stemmer' argument of stemmer() to select the algorithm. */
#define LOVINSTEMMER 0
#define SIMPLEFRENCHSTEMMER 1
68
69
70/* decode the utf-8 encoded unicode, casefold and then recode
71 * making sure the final length doesn't exceed the original
72 * length */
73static void unicode_casefold (u_char *word) {
74 unsigned short out[256]; /* temp space */
75 int i;
76 int len;
77
78 /* decode */
79 utf8_word_to_unicode (word, out, 255);
80 len = out[0];
81
82 /* casefold and simplify-fold */
83 for (i=0; i<len; i++) {
84 out[i+1] = unicode_tosimplified(unicode_tolower(out[i+1]));
85 }
86
87 /* re-code */
88 unicode_to_utf8_word (out, word, word[0]+1);
89}
90
91
92int stemmernumber (u_char *stemmerdescription) {
93 u_char descript[MAX_STEM_DESCRIPTION_LEN];
94 int i;
95
96 /* copy and case-fold the description */
97 for (i=0; (stemmerdescription[i] != '\0') &&
98 (i < MAX_STEM_DESCRIPTION_LEN-1); i++)
99 descript[i] = tolower (stemmerdescription[i]);
100 descript[i] = '\0';
101
102 /* map the description to its number */
103
104 if ((strcmp (descript, "0") == 0) ||
105 (strcmp (descript, "english") == 0) ||
106 (strcmp (descript, "lovin") == 0))
107 return LOVINSTEMMER;
108
109 if ((strcmp (descript, "1") == 1) ||
110 (strcmp (descript, "french") == 0) ||
111 (strcmp (descript, "simplefrench") == 0))
112 return SIMPLEFRENCHSTEMMER;
113
114 return -1;
115}
116
117
118
119/*
120 * Method 0 - Do not stem or case fold.
121 * Method 1 - Case fold.
122 * Method 2 - Stem.
123 * Method 3 - Case fold and stem.
124 *
125 * The stemmer number should be obtained using
126 * the stemmernumber function above.
127 */
128void
129stemmer (int method, int stemmer, u_char *word) {
130 if (method & 1) {
131 unicode_casefold (word);
132 }
133
134 if (method & 2) {
135 switch (stemmer) {
136 case LOVINSTEMMER: lovinstem (word);
137 break;
138 case SIMPLEFRENCHSTEMMER: simplefrenchstem (word);
139 break;
140 }
141 }
142}
Note: See TracBrowser for help on using the repository browser.