Context Navigation

source: trunk/indexers/mg/src/text/stemmer.c@ 7627

Last change on this file since 7627 was 7627, checked in by kjdon, 20 years ago
make stemmerdescription a const
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 4.0 KB

Rev	Line
[3745]	1	/**************************************************************************
	2	*
	3	* stemmer.c -- The stemmer/case folder
	4	* Copyright (C) 1994 Neil Sharman
	5	*
	6	* This program is free software; you can redistribute it and/or modify
	7	* it under the terms of the GNU General Public License as published by
	8	* the Free Software Foundation; either version 2 of the License, or
	9	* (at your option) any later version.
	10	*
	11	* This program is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	* GNU General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU General Public License
	17	* along with this program; if not, write to the Free Software
	18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	19	*
	20	* $Id: stemmer.c 7627 2004-06-22 04:17:06Z kjdon $
	21	*
	22	**************************************************************************/
	23
	24	#include "sysfuncs.h"
	25	#include "stemmer.h"
	26
	27	#include "lovinstem.h"
	28	#include "simplefrenchstem.h"
	29	#include "unitool.h"
	30
	31	/*
	32	$Log$
[7627]	33	Revision 1.2 2004/06/22 04:17:06 kjdon
	34	make stemmerdescription a const
	35
[3745]	36	Revision 1.1 2003/02/20 21:18:24 mdewsnip
	37	Addition of MG package for search and retrieval
	38
	39	Revision 1.1 1999/08/10 21:18:23 sjboddie
	40	renamed mg-1.3d directory mg
	41
	42	Revision 1.3 1998/12/17 09:12:54 rjmcnab
	43
	44	Altered mg to process utf-8 encoded Unicode. The main changes
	45	are in the parsing of the input, the casefolding, and the stemming.
	46
	47	Revision 1.2 1998/11/25 07:55:51 rjmcnab
	48
	49	Modified mg so that you can specify the stemmer you want
	50	to use via a command line option. You specify it to
	51	mg_passes during the build process. The number of the
	52	stemmer that you used is stored within the inverted
	53	dictionary header and the stemmed dictionary header so
	54	the correct stemmer is used in later stages of building
	55	and querying.
	56
	57	Revision 1.1 1998/11/17 09:35:42 rjmcnab
	58	* empty log message *
	59
	60	* Revision 1.3 1994/10/20 03:57:05 tes
	61	* I have rewritten the boolean query optimiser and abstracted out the
	62	* components of the boolean query.
	63	*
	64	* Revision 1.2 1994/09/20 04:42:10 tes
	65	* For version 1.1
	66	*
	67	*/
	68
	69	static char *RCSID = "$Id: stemmer.c 7627 2004-06-22 04:17:06Z kjdon $"; /* version-control $Id$ keyword string for this file */
	70
	71
	72	#define LOVINSTEMMER 0 /* stemmer number that makes stemmer() call lovinstem() */
	73	#define SIMPLEFRENCHSTEMMER 1 /* stemmer number that makes stemmer() call simplefrenchstem() */
	74
	75
	76	/* decode the utf-8 encoded unicode, casefold and then recode
	77	* making sure the final length doesn't exceed the original
	78	* length */
	79	static void unicode_casefold (u_char *word) {
	80	unsigned short out[256]; /* temp space */
	81	int i;
	82	int len;
	83
	84	/* decode */
	85	utf8_word_to_unicode (word, out, 255);
	86	len = out[0];
	87
	88	/* casefold and simplify-fold */
	89	for (i=0; i<len; i++) {
	90	out[i+1] = unicode_tosimplified(unicode_tolower(out[i+1]));
	91	}
	92
	93	/* re-code */
	94	unicode_to_utf8_word (out, word, word[0]+1);
	95	}
	96
	97
[7627]	98	int stemmernumber (const u_char *stemmerdescription) {
[3745]	99	u_char descript[MAX_STEM_DESCRIPTION_LEN];
	100	int i;
	101
	102	/* copy and case-fold the description */
	103	for (i=0; (stemmerdescription[i] != '\0') &&
	104	(i < MAX_STEM_DESCRIPTION_LEN-1); i++)
	105	descript[i] = tolower (stemmerdescription[i]);
	106	descript[i] = '\0';
	107
	108	/* map the description to its number */
	109
	110	if ((strcmp (descript, "0") == 0) \|\|
	111	(strcmp (descript, "english") == 0) \|\|
	112	(strcmp (descript, "lovin") == 0))
	113	return LOVINSTEMMER;
	114
	115	if ((strcmp (descript, "1") == 1) \|\|
	116	(strcmp (descript, "french") == 0) \|\|
	117	(strcmp (descript, "simplefrench") == 0))
	118	return SIMPLEFRENCHSTEMMER;
	119
	120	return -1;
	121	}
	122
	123
	124
	125	/*
	126	* Method 0 - Do not stem or case fold.
	127	* Method 1 - Case fold.
	128	* Method 2 - Stem.
	129	* Method 3 - Case fold and stem.
	130	*
	131	* The stemmer number should be obtained using
	132	* the stemmernumber function above.
	133	*/
	134	void
	135	stemmer (int method, int stemmer, u_char *word) {
	136	if (method & 1) {
	137	unicode_casefold (word);
	138	}
	139
	140	if (method & 2) {
	141	switch (stemmer) {
	142	case LOVINSTEMMER: lovinstem (word);
	143	break;
	144	case SIMPLEFRENCHSTEMMER: simplefrenchstem (word);
	145	break;
	146	}
	147	}
	148	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: