/************************************************************************** * * stemmer.cpp -- The stemmer/case folder * Copyright (C) 1994 Neil Sharman * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * $Id: stemmer.cpp 856 2000-01-14 02:26:25Z sjboddie $ * **************************************************************************/ #include "sysfuncs.h" #include "stemmer.h" #include "lovinstem.h" #include "simplefrenchstem.h" #include "unitool.h" /* $Log$ Revision 1.1 2000/01/14 02:26:22 sjboddie Rodgers new C++ mg Revision 1.1 1999/10/11 02:58:34 cs025 Base install of MG-PP Revision 1.1 1999/08/10 21:18:23 sjboddie renamed mg-1.3d directory mg Revision 1.3 1998/12/17 09:12:54 rjmcnab Altered mg to process utf-8 encoded Unicode. The main changes are in the parsing of the input, the casefolding, and the stemming. Revision 1.2 1998/11/25 07:55:51 rjmcnab Modified mg so that you can specify the stemmer you want to use via a command line option. You specify it to mg_passes during the build process. The number of the stemmer that you used is stored within the inverted dictionary header and the stemmed dictionary header so the correct stemmer is used in later stages of building and querying. 
Revision 1.1 1998/11/17 09:35:42 rjmcnab *** empty log message *** * Revision 1.3 1994/10/20 03:57:05 tes * I have rewritten the boolean query optimiser and abstracted out the * components of the boolean query. * * Revision 1.2 1994/09/20 04:42:10 tes * For version 1.1 * */ #define LOVINSTEMMER 0 #define SIMPLEFRENCHSTEMMER 1 /* decode the utf-8 encoded unicode, casefold and then recode * making sure the final length doesn't exceed the original * length */ static void unicode_casefold (u_char *word) { unsigned short out[256]; /* temp space */ int i; int len; /* decode */ utf8_word_to_unicode (word, out, 255); len = out[0]; /* casefold and simplify-fold */ for (i=0; i