Changeset 13477 for trunk/indexers


Ignore:
Timestamp:
2006-12-11T11:22:20+13:00 (17 years ago)
Author:
shaoqun
Message:

added code for accentfolding

Location:
trunk/indexers/mgpp/text
Files:
9 edited

Legend:

Unmodified
Added
Removed
  • trunk/indexers/mgpp/text/IndexData.cpp

    r8692 r13477  
    3030  dictFile = NULL;
    3131
    32   stem1File = NULL;
    33   stem2File = NULL;
    34   stem3File = NULL;
     32  /* [JFG - Mar 06: Accent folding patch] */
     33  for(int i=STEM_MIN;i <= STEM_MAX;i++)
     34    stemFile[i-1] = NULL;
    3535
    3636  invfFile = NULL;
     
    7979  if (!ReadBlockIdx (dictFile, biTags)) { UnloadData (); return false; }
    8080
    81   // blocked stem index 1
    82   stem1File = open_file (filename, INVF_DICT_BLOCKED_1_SUFFIX,
    83              "rb", MAGIC_STEM_1, MG_ABORT);
    84   if (!sih1.Read (stem1File)) { UnloadData (); return false; }
    85 
    86   fseek (stem1File, sih1.block_idx_start, SEEK_SET);
    87   if (!ReadBlockIdx (stem1File, sii1)) { UnloadData (); return false; }
    88  
    89   // blocked stem index 2
    90   stem2File = open_file (filename, INVF_DICT_BLOCKED_2_SUFFIX,
    91              "rb", MAGIC_STEM_2, MG_ABORT);
    92   if (!sih2.Read (stem2File)) { UnloadData (); return false; }
    93 
    94   fseek (stem2File, sih2.block_idx_start, SEEK_SET);
    95   if (!ReadBlockIdx (stem2File, sii2)) { UnloadData (); return false; }
    96  
    97   // blocked stem index 3
    98   stem3File = open_file (filename, INVF_DICT_BLOCKED_3_SUFFIX,
    99              "rb", MAGIC_STEM_3, MG_ABORT);
    100   if (!sih3.Read (stem3File)) { UnloadData (); return false; }
    101 
    102   fseek (stem3File, sih3.block_idx_start, SEEK_SET);
    103   if (!ReadBlockIdx (stem3File, sii3)) { UnloadData (); return false; }
    104 
     81  /* [JFG - Mar 06: Accent folding patch] */
     82  // read stem indexes
     83  // [KJD - optional stemming patch]
     84  // allow no stem indexes
     85  for(int stem = STEM_MIN; stem <= STEM_MAX; stem++) {
     86    char *suffix = make_suffix (INVF_DICT_BLOCKED_SUFFIX_PAT, stem, NULL);
     87    stemFile[stem-1] = open_file (filename, suffix,
     88                  "rb", MAGIC_STEM_GEN(stem + '0'), MG_MESSAGE);
     89    if (stemFile[stem-1]!= NULL) {
     90      if (!sih[stem-1].Read (stemFile[stem-1])) {
     91    fclose (stemFile[stem-1]);
     92    stemFile[stem-1] = NULL;
     93    //UnloadData (); return false;
     94      }
     95     
     96      fseek (stemFile[stem-1], sih[stem-1].block_idx_start, SEEK_SET);
     97      if (!ReadBlockIdx (stemFile[stem-1], sii[stem-1])) {
     98    fclose (stemFile[stem-1]);
     99    stemFile[stem-1] = NULL;
     100    //UnloadData (); return false;
     101      }
     102    }
     103  }
     104 
     105 
    105106  // inverted file
    106107  invfFile = open_file (filename, INVF_SUFFIX, "rb",
     
    125126  }
    126127
    127   if (stem1File != NULL) {
    128     fclose (stem1File); stem1File = NULL;
    129   }
    130   if (stem2File != NULL) {
    131     fclose (stem2File); stem2File = NULL;
    132   }
    133   if (stem3File != NULL) {
    134     fclose (stem3File); stem3File = NULL;
     128  for(int i=STEM_MIN;i <= STEM_MAX;i++) {
     129    if (stemFile[i-1] != NULL) {
     130      fclose (stemFile[i-1]); stemFile[i-1] = NULL;
     131    }
    135132  }
    136133
  • trunk/indexers/mgpp/text/IndexData.h

    r4205 r13477  
    2828#include "FragLevelConvert.h"
    2929#include "Weights.h"
    30 
     30#include "stemmer.h"
    3131
    3232class IndexData {
     
    4343  block_idx biTags;
    4444
     45  /* [JFG - Mar 06: Accent folding patch] */
    4546  // stem indexes
    46   FILE *stem1File;
    47   FILE *stem2File;
    48   FILE *stem3File;
    49   stem_idx_header sih1;
    50   stem_idx_header sih2;
    51   stem_idx_header sih3;
    52   block_idx sii1;
    53   block_idx sii2;
    54   block_idx sii3;
    55 
     47  FILE *stemFile[STEM_MAX];
     48  stem_idx_header sih[STEM_MAX];
     49  block_idx sii[STEM_MAX];
     50 
    5651  // inverted file
    5752  FILE *invfFile;
  • trunk/indexers/mgpp/text/Queryer.cpp

    r12321 r13477  
    5050       << "\t.c0/.c1\t\tcasefolding off/on\n"
    5151       << "\t.s0/.s1\t\tstemming off/on\n"
     52#ifdef ENABLE_ACCENTFOLD
     53       << "\t.a0/.a1\t\taccentfolding off/on\n"
     54#endif
    5255       << "\t.o0/.o1\t\tshort output off/on\n"
    5356       << "\t.m\t\tset maxnumeric (enter the number at the prompt)\n\n"
     
    9194 
    9295  // init the text system
    93     TextData textData;
     96  TextData textData;
    9497  if (!textData.LoadData (basePath, textfilename)) {
    9598    FatalError (1, "Couldn't load text information for \"%s\"", textfilename);
     
    134137  //SetCStr(level, "");
    135138   
    136   int defaultStemMethod = 0; // uncasefolded, unstemmed
     139  int defaultStemMethod = 0; // uncasefolded, unstemmed, unaccentfolded
    137140  int defaultBoolCombine = 0; // OR
    138141  bool shortOutput = false;
     
    207210      }
    208211      else if (queryArray[1] == 'c') { // casefolding - on/off
    209     if (queryArray[2] == '1') defaultStemMethod |= 1;
    210     else if (queryArray[2] == '0') defaultStemMethod &= 0xe;
     212    if (queryArray[2] == '1') defaultStemMethod |= STEM_CaseFolding;
     213    else if (queryArray[2] == '0') defaultStemMethod &= (~STEM_CaseFolding);
    211214    else {
    212215      cout << "Error: please enter .c0 (case sensitive) or .c1 (casefolded)\n";
     
    214217      }
    215218      else if (queryArray[1] == 's') { // stemming - on/off
    216     if (queryArray[2] == '1') defaultStemMethod |=2;
    217     else if (queryArray[2] == '0') defaultStemMethod &=0xd;
     219    if (queryArray[2] == '1') defaultStemMethod |= STEM_Stemming;
     220    else if (queryArray[2] == '0') defaultStemMethod &= (~STEM_Stemming);
    218221    else {
    219222      cout << "Error: please enter .s0 (unstemmed) or .s1 (stemmed)\n";
    220223    }
    221224      }     
     225#ifdef ENABLE_ACCENTFOLD
     226      else if (queryArray[1] == 'a') { // accentfolding - on/off
     227    if (queryArray[2] == '1') defaultStemMethod |= STEM_AccentFolding;
     228    else if (queryArray[2] == '0') defaultStemMethod &= (~STEM_AccentFolding);
     229    else {
     230      cout << "Error: please enter .a0 (accent sensitive) or .a1 (accentfolded)\n";
     231    }
     232      }
     233#endif
    222234      else if (queryArray[1] == 'o') { // output - short/long
    223235    if (queryArray[2] == '1') shortOutput = true;
     
    274286
    275287  // clean up, everybody clean up
    276     textData.UnloadData ();
     288  textData.UnloadData ();
    277289  indexData.UnloadData ();
    278290
  • trunk/indexers/mgpp/text/Terms.cpp

    r8692 r13477  
    210210              vector<unsigned long> &equivWords) {
    211211  equivWords.erase (equivWords.begin(), equivWords.end());
    212  
    213   if (stemMethod == 0 || stemMethod==4 || stemMethod==5) {
     212
     213  // if the stem method specified is not a valid one (i.e. there was no appropriate stem index, then we set it to 0)
     214  // unless we have partial matching, in which case we are not doing stem indexes anyway.
     215  if (!(stemMethod & STEM_PARTIAL_MATCH) && indexData.stemFile[stemMethod-1] == NULL) {
     216    cerr << "Stem index for method "<<stemMethod<< " was not built, so not doing stemming\n";
     217    stemMethod = 0;
     218  }
     219  /* [JFG - Mar 06: Accent folding patch] */
     220  /* use flag PARTIAL_MATCH */ 
     221  if (stemMethod == 0 || (stemMethod & STEM_PARTIAL_MATCH)) {
    214222    // don't need to stem the word,
    215223    // find the word number(s) for this term
     
    218226    word_block_dict_el wordDictEl;
    219227    wordDictEl.SetNumLevels (numLevels);
    220     if (stemMethod ==0) {
     228    if (stemMethod == 0) {
    221229      if (SearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
    222230                 indexData.bdh.entries_per_wblk,
     
    228236    } else {
    229237      // partial matching,
    230       PartialMatchSearchWordBlockDictEl (indexData.dictFile, indexData.biWords, indexData.bdh.entries_per_wblk, indexData.bdh.word_dict_size, numLevels, term, wordDictEl, equivWords, (stemMethod==5?true:false) );
     238      PartialMatchSearchWordBlockDictEl (indexData.dictFile, indexData.biWords, indexData.bdh.entries_per_wblk, indexData.bdh.word_dict_size, numLevels, term, wordDictEl, equivWords, (stemMethod & STEM_CaseFolding)? true : false);
     239      // TODO: Accent Folding is not handled here!!
    231240      return;
    232241    }
     
    234243             
    235244  // need to stem this word and find it in the blocked stem index
    236  
    237   unsigned char  mgWord[MAXSTEMLEN + 1];
     245  unsigned char mgWord[MAXSTEMLEN + 1];
    238246  UCArray stemTerm;
    239247  unsigned long stemmerNum = 0;
    240   if (stemMethod == 1) stemmerNum = indexData.sih1.stemmer_num;
    241   else if (stemMethod == 2) stemmerNum = indexData.sih2.stemmer_num;
    242   else if (stemMethod == 3) stemmerNum = indexData.sih3.stemmer_num;
    243    
     248
     249  /* [JFG - Mar 06: Accent folding patch] */
     250  if(stemMethod > STEM_MAX) {
     251    return;
     252    //TODO: throw an error here
     253  }
     254
     255  stemmerNum = indexData.sih[stemMethod-1].stemmer_num;
     256 
    244257  // convert the word to an "mg word"
    245258  mgWord[0] = term.size();
     
    247260 
    248261  // stem the word
    249   stemmer (stemMethod, stemmerNum, mgWord);
    250 
     262  mgpp_stemmer (stemMethod, stemmerNum, mgWord);
    251263  // convert the result back to a UCArray
    252264  stemTerm.insert (stemTerm.end(), &mgWord[1], &mgWord[1] + mgWord[0]);
     
    256268  unsigned long stemElNum;
    257269  bool result = false;
    258   if (stemMethod == 1) {
    259     result = SearchStemBlockDictEl (indexData.stem1File,
    260                indexData.sii1,
    261                indexData.sih1.entries_per_block,
    262                indexData.sih1.dict_size,
     270 
     271  /* [JFG - Mar 06: Accent folding patch] */
     272  result = SearchStemBlockDictEl (indexData.stemFile[stemMethod-1],
     273               indexData.sii[stemMethod-1],
     274               indexData.sih[stemMethod-1].entries_per_block,
     275               indexData.sih[stemMethod-1].dict_size,
    263276               stemTerm,
    264277               stemDictEl,
    265278               stemElNum);
    266 
    267   } else if (stemMethod == 2) {
    268     result = SearchStemBlockDictEl (indexData.stem2File,
    269                indexData.sii2,
    270                indexData.sih2.entries_per_block,
    271                indexData.sih2.dict_size,
    272                stemTerm,
    273                stemDictEl,
    274                stemElNum);
    275 
    276   } else if (stemMethod == 3) {
    277     result = SearchStemBlockDictEl (indexData.stem3File,
    278                indexData.sii3,
    279                indexData.sih3.entries_per_block,
    280                indexData.sih3.dict_size,
    281                stemTerm,
    282                stemDictEl,
    283                stemElNum);
    284   }
    285 
     279 
    286280  if (result) {
    287281    equivWords = stemDictEl.equivWords; 
  • trunk/indexers/mgpp/text/mg_files.cpp

    r8692 r13477  
    9393}
    9494
     95
     96/* [JFG - Mar 06: Accent folding patch] */
     97/* This generates a suffixe for a file name. It places the name in the
     98   buffer specified or if that is NULL, it uses a static buffer.
     99   Please do not specify buffers under 512 or the data to be written. */
     100char *
     101make_suffix (const char *suffix_format, const char suffix_arg, char *buffer)
     102{
     103  static char suffix[512];
     104  if (!buffer)
     105    buffer = suffix;
     106  sprintf (buffer, suffix_format, suffix_arg);
     107  return buffer;
     108}
     109 
    95110
    96111
  • trunk/indexers/mgpp/text/mg_files.h

    r3365 r13477  
    5050#define MAGIC_WGHT_APPROX   GEN_MAGIC('M','G','w', 0 )
    5151#define MAGIC_PARAGRAPH     GEN_MAGIC('M','G','P', 0 )
    52 /* [RPAP - Jan 97: Stem Index Change] */
    53 #define MAGIC_STEM_1            GEN_MAGIC('M','G','s','1')
    54 #define MAGIC_STEM_2            GEN_MAGIC('M','G','s','2')
    55 #define MAGIC_STEM_3            GEN_MAGIC('M','G','s','3')
    56 
     52#define MAGIC_STEM_GEN(x)   GEN_MAGIC('M', 'G', 's', x)
    5753#define IS_MAGIC(a) ((((u_long)(a)) & 0xffff0000) == MAGIC_XXXX)
    5854
     
    219215
    220216/* [RPAP - Jan 97: Stem Index Change] */
     217/* [JFG - Mar 06: Accent folding patch]
     218 * Use the pattern with make_suffix */
    221219/* The casefolded index into the stemmed dictionary */
    222220#ifdef SHORT_SUFFIX
    223 # define INVF_DICT_BLOCKED_1_SUFFIX ".ib1"
    224 #else
    225 # define INVF_DICT_BLOCKED_1_SUFFIX ".invf.dict.blocked.1"
    226 #endif
    227 
    228 /* [RPAP - Jan 97: Stem Index Change] */
    229 /* The stemmed index into the stemmed dictionary */
    230 #ifdef SHORT_SUFFIX
    231 # define INVF_DICT_BLOCKED_2_SUFFIX ".ib2"
    232 #else
    233 # define INVF_DICT_BLOCKED_2_SUFFIX ".invf.dict.blocked.2"
    234 #endif
    235 
    236 /* [RPAP - Jan 97: Stem Index Change] */
    237 /* The casefolded and stemmed  index into the stemmed dictionary */
    238 #ifdef SHORT_SUFFIX
    239 # define INVF_DICT_BLOCKED_3_SUFFIX ".ib3"
    240 #else
    241 # define INVF_DICT_BLOCKED_3_SUFFIX ".invf.dict.blocked.3"
    242 #endif
     221# define INVF_DICT_BLOCKED_SUFFIX_PAT ".ib%d"
     222#else
     223# define INVF_DICT_BLOCKED_SUFFIX_PAT ".invf.dict.blocked.%d"
     224#endif
     225
    243226
    244227/* [RPAP - Feb 97: WIN32 Port] */
     
    264247char *make_name (const char *name, const char *suffix, char *buffer);
    265248
    266 
     249/* [JFG - Mar 06: Accent folding patch] */
     250/* This generates the suffix of a file. It places the name in the buffer
     251   specified or if that is NULL it uses a static buffer. */
     252char *make_suffix (const char *suffix_format, const char suffix_arg, char *buffer);
    267253
    268254
  • trunk/indexers/mgpp/text/mgpp_stem_idx.cpp

    r9613 r13477  
    8080
    8181    // stem the word
    82     stemmer (stemMethod, stemmerNum, mgWord);
     82    mgpp_stemmer (stemMethod, stemmerNum, mgWord);
    8383
    8484    // convert the result back to a UCArray
     
    101101            int stemmerNum,
    102102            unsigned long entriesPerBlock) {
     103 
     104  /* [JFG - Mar 06: Accent folding patch] */
    103105  // Create appropriate stem index file
    104106  FILE *stemDictFile = NULL;
    105   if (stemMethod == 1) {
    106     stemDictFile = create_file (filename, INVF_DICT_BLOCKED_1_SUFFIX,
    107                 "wb", MAGIC_STEM_1, MG_ABORT);
    108   } else if (stemMethod == 2) {
    109     stemDictFile = create_file (filename, INVF_DICT_BLOCKED_2_SUFFIX,
    110                 "wb", MAGIC_STEM_2, MG_ABORT);
    111   } else if (stemMethod == 3) {
    112     stemDictFile = create_file (filename, INVF_DICT_BLOCKED_3_SUFFIX,
    113                 "wb", MAGIC_STEM_3, MG_ABORT);
    114   } else {
     107  if (stemMethod >= STEM_MIN && stemMethod <= STEM_MAX) {
     108    char *suffix = make_suffix (INVF_DICT_BLOCKED_SUFFIX_PAT, stemMethod, NULL);
     109    stemDictFile = create_file (filename, suffix,
     110                "wb", MAGIC_STEM_GEN(stemMethod + '0'), MG_ABORT); 
     111  }
     112  else {
    115113    FatalError (1, "Unknown stem method %d", stemMethod);
    116114  }
     
    213211      break;
    214212    case 'a':
    215       stemmerNum = stemmernumber ((unsigned char *) optarg);
     213      stemmerNum = mgpp_stemmernumber ((unsigned char *) optarg);
    216214      break;
    217215    case 'h':
    218216    case '?':
    219217      fprintf (stderr, "usage: %s [-d directory] "
    220            "[-b entries-per-block] [-h] -s 1|2|3 "
    221            "[-a stemmer-method] -f name\n", argv[0]);
     218           "[-b entries-per-block] [-h] -s 1|2|3", argv[0]);
     219#ifdef ENABLE_ACCENTFOLD
     220      fprintf (stderr, "|4|5|6|7");
     221#endif
     222      fprintf (stderr, " [-a stemmer-method] -f name\n");
    222223      exit (1);
    223224    }
    224225  }
    225226 
    226   if (stemMethod < 1 || stemMethod > 3)
    227     FatalError (1, "Stem method must be 1, 2 or 3");
    228 
     227  /* [JFG - Mar 06: Accent folding patch] */
     228  if (stemMethod < STEM_MIN || stemMethod > STEM_MAX)
     229    FatalError (1, "Stem method must be between %d and %d", STEM_MIN, STEM_MAX);
     230#ifndef ENABLE_ACCENTFOLD
     231  if (stemMethod & STEM_AccentFolding) {
     232    // accent folding not enabled
     233    return -1;
     234  }
     235#endif
    229236  // read in the dictionary and create the in memory dictionary
    230237  StemMapDict stemDict;
  • trunk/indexers/mgpp/text/stemmer.cpp

    r9613 r13477  
    2222#include "sysfuncs.h"
    2323#include "stemmer.h"
    24 
    2524#include "lovinstem.h"
    2625#include "simplefrenchstem.h"
    2726#include "unitool.h"
     27
     28#ifdef ENABLE_ACCENTFOLD
     29/* [JFG - Mar 06: Accent folding patch] */
     30#include "unac.h"
     31#endif
    2832
    2933#define LOVINSTEMMER        0
     
    3438 * making sure the final length doesn't exceed the original
    3539 * length */
    36 static void unicode_casefold (u_char *word) {
     40static void mgpp_unicode_casefold (u_char *word) {
    3741  unsigned short out[256]; /* temp space */
    3842  int i;
     
    5256}
    5357
     58#ifdef ENABLE_ACCENTFOLD
     59/* [JFG - Mar 06: Accent folding patch] */
     60/* =========================================================================
     61 * Function: mgpp_unicode_accentfold
     62 
     63 * Description: remove accents from characters
     64 * Input: a word string with the length in the first byte
     65 * Output: the unaccented word
     66 * ========================================================================= */
     67void mgpp_unicode_accentfold (unsigned char *word) {     
     68  size_t unac_size = 0;
     69  char *unac = NULL;
    5470
    55 int stemmernumber (u_char *stemmerdescription) {
     71
     72  unac_string("utf-8", (char*)word+1, word[0], &unac, &unac_size);
     73  strncpy((char*)word+1, unac, word[0]+1);
     74  word[0] = unac_size;
     75 
     76  free(unac);
     77  return;
     78}
     79#endif
     80     
     81int mgpp_stemmernumber (u_char *stemmerdescription) {
    5682  u_char descript[MAX_STEM_DESCRIPTION_LEN];
    5783  int i;
     
    85111 * Method 2 - Stem.
    86112 * Method 3 - Case fold and stem.
    87  *
     113 * Method 4 - Accent fold
     114 * Method 5 - Accent fold and case fold
     115 * Method 6 - Accent fold and stem
     116 * Method 7 - Accent fold, stem and case fold
     117
    88118 * The stemmer number should be obtained using
    89119 * the stemmernumber function above.
    90120 */
    91121void
    92 stemmer (int method, int stemmer, u_char *word) {
    93   if (method & 1) {
    94     unicode_casefold (word);
     122mgpp_stemmer (int method, int stemmer, u_char *word) {
     123  if (method & STEM_CaseFolding) {
     124    mgpp_unicode_casefold (word);
    95125  }
    96126
    97   if (method & 2) {
     127#ifdef ENABLE_ACCENTFOLD
     128  if (method & STEM_AccentFolding) {
     129    mgpp_unicode_accentfold (word);
     130  }
     131#endif
     132
     133  if (method & STEM_Stemming) {
    98134    switch (stemmer) {
    99135    case LOVINSTEMMER: lovinstem (word);
  • trunk/indexers/mgpp/text/stemmer.h

    r3365 r13477  
    2323#define STEMMER_H
    2424
    25 #include "sysfuncs.h"
     25/* [RPAP - Jan 97: Stem Index Change] */
     26/* [JFG - Mar 06: Accent folding patch] */
     27enum stemMethods {
     28    STEM_None       =   0,
     29    STEM_CaseFolding    =   0x1,
     30    STEM_Stemming       =   0x2,
     31    STEM_AccentFolding  =   0x4,
     32};
     33
     34/* This is for the QueryParser */
     35#define CHAR_FLAG_STEM_CaseFold    'i'  // ignore case
     36#define CHAR_FLAG_STEM_NoCaseFold  'c'  // case sensitive
     37#define CHAR_FLAG_STEM_Stemming    's'  // stem words
     38#define CHAR_FLAG_STEM_NoStemming  'u'  // do not stem words
     39#define CHAR_FLAG_STEM_AccentFold  'f'  // accent fold
     40#define CHAR_FLAG_STEM_NoAccentFold  'a'    // do no accent folding
     41#define CHAR_FLAG_STEM_Validator  "icsufa"  // all of the above
     42
     43
     44#define STEM_MIN 1
     45#define STEM_MAX (STEM_CaseFolding | STEM_Stemming | STEM_AccentFolding)
     46#define STEM_PARTIAL_MATCH  (STEM_MAX+1)
     47#define STEM_INVALID        (STEM_MAX+2)
    2648
    2749#define STEMMER_MASK 3
     
    4769 * stemmer description.
    4870 */
    49 int stemmernumber (u_char *stemmerdescription);
     71int mgpp_stemmernumber (unsigned char *stemmerdescription);
    5072
    5173/*
     
    5476 * Method 2 - Stem.
    5577 * Method 3 - Case fold and stem.
    56  *
     78 * Method 4 - Accent fold
     79 * Method 5 - Case fold and accent fold
     80 * Method 6 - Stem and accent fold
     81 * Method 7 - Case fold, stem and accent fold
    5782 * The stemmer number should be obtained using function
    5883 * stemmernumber above.
     
    6186extern "C"
    6287#endif
    63 void stemmer (int method, int stemmer, u_char * word);
     88void mgpp_stemmer (int method, int stemmer, unsigned char * word);
    6489
    6590#endif
Note: See TracChangeset for help on using the changeset viewer.