Changeset 12879


Ignore:
Timestamp:
2006-09-28T10:23:36+12:00 (18 years ago)
Author:
kjdon
Message:

Accent folding patch, thanks to Juan Grigera. mgpp_stemmer now also performs accent folding when requested. The stemmer and stemmernumber functions have been renamed to mgpp_stemmer and mgpp_stemmernumber, respectively, so that they don't clash with the mg versions when linked into the library.

Location:
trunk/gsdl/src/mgpp/text
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/mgpp/text/stemmer.cpp

    r9611 r12879  
    2222#include "sysfuncs.h"
    2323#include "stemmer.h"
    24 
     24#include "mg_files.h"
    2525#include "lovinstem.h"
    2626#include "simplefrenchstem.h"
    2727#include "unitool.h"
     28
     29/* [JFG - Mar 06: Accent folding patch] */
     30#include "unac.h"
    2831
    2932#define LOVINSTEMMER        0
     
    3437 * making sure the final length doesn't exceed the original
    3538 * length */
    36 static void unicode_casefold (u_char *word) {
     39static void mgpp_unicode_casefold (u_char *word) {
    3740  unsigned short out[256]; /* temp space */
    3841  int i;
     
    5255}
    5356
     57/* [JFG - Mar 06: Accent folding patch] */
     58/* =========================================================================
     59 * Function: unicode_accentfold
     60 
     61 * Description: remove accents from characters
     62 * Input: a word string with the length in the first byte
     63 * Output: the unaccented word
     64 * ========================================================================= */
     65void mgpp_unicode_accentfold (unsigned char *word) {     
     66  size_t unac_size = 0;
     67  char *unac = NULL;
    5468
    55 int stemmernumber (u_char *stemmerdescription) {
     69
     70  unac_string("utf-8", (char*)word+1, word[0], &unac, &unac_size);
     71  strncpy((char*)word+1, unac, word[0]+1);
     72  word[0] = unac_size;
     73 
     74  free(unac);
     75  return;
     76}
     77
     78     
     79int mgpp_stemmernumber (u_char *stemmerdescription) {
    5680  u_char descript[MAX_STEM_DESCRIPTION_LEN];
    5781  int i;
     
    85109 * Method 2 - Stem.
    86110 * Method 3 - Case fold and stem.
    87  *
     111 * Method 4 - Accent fold
     112 * Method 5 - Accent fold and case fold
     113 * Method 6 - Accent fold and stem
     114 * Method 7 - Accent fold, stem and case fold
     115
    88116 * The stemmer number should be obtained using
    89117 * the stemmernumber function above.
    90118 */
    91119void
    92 stemmer (int method, int stemmer, u_char *word) {
    93   if (method & 1) {
    94     unicode_casefold (word);
     120mgpp_stemmer (int method, int stemmer, u_char *word) {
     121  if (method & STEM_CaseFolding) {
     122    mgpp_unicode_casefold (word);
    95123  }
    96124
    97   if (method & 2) {
     125  if (method & STEM_AccentFolding) {
     126    mgpp_unicode_accentfold (word);
     127  }
     128
     129  if (method & STEM_Stemming) {
    98130    switch (stemmer) {
    99131    case LOVINSTEMMER: lovinstem (word);
  • trunk/gsdl/src/mgpp/text/stemmer.h

    r2468 r12879  
    4747 * stemmer description.
    4848 */
    49 int stemmernumber (u_char *stemmerdescription);
     49int mgpp_stemmernumber (u_char *stemmerdescription);
    5050
    5151/*
     
    5454 * Method 2 - Stem.
    5555 * Method 3 - Case fold and stem.
    56  *
     56 * Method 4 - Accent fold
     57 * Method 5 - Case fold and accent fold
     58 * Method 6 - Stem and accent fold
     59 * Method 7 - Case fold, stem and accent fold
    5760 * The stemmer number should be obtained using function
    5861 * stemmernumber above.
     
    6164extern "C"
    6265#endif
    63 void stemmer (int method, int stemmer, u_char * word);
     66void mgpp_stemmer (int method, int stemmer, u_char * word);
    6467
    6568#endif
Note: See TracChangeset for help on using the changeset viewer.