Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: gsdl/trunk/trunk/mgpp/text/stemmer.cpp@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago
Undoing change committed in r16582
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 3.9 KB

Rev	Line
[3365]	1	/**************************************************************************
	2	*
	3	* stemmer.cpp -- The stemmer/case folder
	4	* Copyright (C) 1994 Neil Sharman
	5	*
	6	* This program is free software; you can redistribute it and/or modify
	7	* it under the terms of the GNU General Public License as published by
	8	* the Free Software Foundation; either version 2 of the License, or
	9	* (at your option) any later version.
	10	*
	11	* This program is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	* GNU General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU General Public License
	17	* along with this program; if not, write to the Free Software
	18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	19	*
	20	**************************************************************************/
	21
	22	#include "sysfuncs.h"
	23	#include "stemmer.h"
	24	#include "lovinstem.h"
	25	#include "simplefrenchstem.h"
	26	#include "unitool.h"
	27
[13477]	28	#ifdef ENABLE_ACCENTFOLD
	29	/* [JFG - Mar 06: Accent folding patch] */
	30	#include "unac.h"
	31	#endif
	32
[3365]	33	#define LOVINSTEMMER 0
	34	#define SIMPLEFRENCHSTEMMER 1
	35
	36
	37	/* decode the utf-8 encoded unicode, casefold and then recode
	38	* making sure the final length doesn't exceed the original
	39	* length */
[13477]	40	static void mgpp_unicode_casefold (u_char *word) {
[3365]	41	unsigned short out[256]; /* temp space */
	42	int i;
	43	int len;
	44
	45	/* decode */
	46	utf8_word_to_unicode (word, out, 255);
	47	len = out[0];
	48
	49	/* casefold and simplify-fold */
[9613]	50	for (i=0; i<len; ++i) {
[3365]	51	out[i+1] = unicode_tosimplified(unicode_tolower(out[i+1]));
	52	}
	53
	54	/* re-code */
	55	unicode_to_utf8_word (out, word, word[0]+1);
	56	}
	57
[13477]	58	#ifdef ENABLE_ACCENTFOLD
	59	/* [JFG - Mar 06: Accent folding patch] */
	60	/* =========================================================================
	61	* Function: unicode_accentfold
	62
	63	* Description: remove accents from characters
	64	* Input: a word string with the length in the first byte
	65	* Output: the unaccented word
	66	* ========================================================================= */
	67	void mgpp_unicode_accentfold (unsigned char *word) {
	68	size_t unac_size = 0;
	69	char *unac = NULL;
[3365]	70
[13477]	71
	72	unac_string("utf-8", (char*)word+1, word[0], &unac, &unac_size);
	73	strncpy((char*)word+1, unac, word[0]+1);
	74	word[0] = unac_size;
	75
	76	free(unac);
	77	return;
	78	}
	79	#endif
	80
	81	int mgpp_stemmernumber (u_char *stemmerdescription) {
[3365]	82	u_char descript[MAX_STEM_DESCRIPTION_LEN];
	83	int i;
	84
	85	/* copy and case-fold the description */
	86	for (i=0; (stemmerdescription[i] != '\0') &&
[9613]	87	(i < MAX_STEM_DESCRIPTION_LEN-1); ++i)
[3365]	88	descript[i] = tolower (stemmerdescription[i]);
	89	descript[i] = '\0';
	90
	91	/* map the description to its number */
	92
	93	if ((strcmp ((char *) descript, "0") == 0) \|\|
	94	(strcmp ((char *) descript, "english") == 0) \|\|
	95	(strcmp ((char *) descript, "lovin") == 0))
	96	return LOVINSTEMMER;
	97
	98	if ((strcmp ((char *) descript, "1") == 1) \|\|
	99	(strcmp ((char *) descript, "french") == 0) \|\|
	100	(strcmp ((char *) descript, "simplefrench") == 0))
	101	return SIMPLEFRENCHSTEMMER;
	102
	103	return -1;
	104	}
	105
	106
	107
	108	/*
	109	* Method 0 - Do not stem or case fold.
	110	* Method 1 - Case fold.
	111	* Method 2 - Stem.
	112	* Method 3 - Case fold and stem.
[13477]	113	* Method 4 - Accent fold
	114	* Method 5 - Accent fold and case fold
	115	* Method 6 - Accent fold and stem
	116	* Method 7 - Accent fold, stem and case fold
	117
[3365]	118	* The stemmer number should be obtained using
	119	* the stemmernumber function above.
	120	*/
	121	void
[13477]	122	mgpp_stemmer (int method, int stemmer, u_char *word) {
	123	if (method & STEM_CaseFolding) {
	124	mgpp_unicode_casefold (word);
[3365]	125	}
	126
[13477]	127	#ifdef ENABLE_ACCENTFOLD
	128	if (method & STEM_AccentFolding) {
	129	mgpp_unicode_accentfold (word);
	130	}
	131	#endif
	132
	133	if (method & STEM_Stemming) {
[3365]	134	switch (stemmer) {
	135	case LOVINSTEMMER: lovinstem (word);
	136	break;
	137	case SIMPLEFRENCHSTEMMER: simplefrenchstem (word);
	138	break;
	139	}
	140	}
	141	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: