Changeset 13477
- Timestamp:
- 2006-12-11T11:22:20+13:00 (17 years ago)
- Location:
- trunk
- Files:
-
- 18 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/indexers/mgpp/text/IndexData.cpp
r8692 r13477 30 30 dictFile = NULL; 31 31 32 stem1File = NULL;33 stem2File = NULL;34 stem3File= NULL;32 /* [JFG - Mar 06: Accent folding patch] */ 33 for(int i=STEM_MIN;i <= STEM_MAX;i++) 34 stemFile[i-1] = NULL; 35 35 36 36 invfFile = NULL; … … 79 79 if (!ReadBlockIdx (dictFile, biTags)) { UnloadData (); return false; } 80 80 81 // blocked stem index 1 82 stem1File = open_file (filename, INVF_DICT_BLOCKED_1_SUFFIX, 83 "rb", MAGIC_STEM_1, MG_ABORT); 84 if (!sih1.Read (stem1File)) { UnloadData (); return false; } 85 86 fseek (stem1File, sih1.block_idx_start, SEEK_SET); 87 if (!ReadBlockIdx (stem1File, sii1)) { UnloadData (); return false; } 88 89 // blocked stem index 2 90 stem2File = open_file (filename, INVF_DICT_BLOCKED_2_SUFFIX, 91 "rb", MAGIC_STEM_2, MG_ABORT); 92 if (!sih2.Read (stem2File)) { UnloadData (); return false; } 93 94 fseek (stem2File, sih2.block_idx_start, SEEK_SET); 95 if (!ReadBlockIdx (stem2File, sii2)) { UnloadData (); return false; } 96 97 // blocked stem index 3 98 stem3File = open_file (filename, INVF_DICT_BLOCKED_3_SUFFIX, 99 "rb", MAGIC_STEM_3, MG_ABORT); 100 if (!sih3.Read (stem3File)) { UnloadData (); return false; } 101 102 fseek (stem3File, sih3.block_idx_start, SEEK_SET); 103 if (!ReadBlockIdx (stem3File, sii3)) { UnloadData (); return false; } 104 81 /* [JFG - Mar 06: Accent folding patch] */ 82 // read stem indexes 83 // [KJD - optional stemming patch] 84 // allow no stem indexes 85 for(int stem = STEM_MIN; stem <= STEM_MAX; stem++) { 86 char *suffix = make_suffix (INVF_DICT_BLOCKED_SUFFIX_PAT, stem, NULL); 87 stemFile[stem-1] = open_file (filename, suffix, 88 "rb", MAGIC_STEM_GEN(stem + '0'), MG_MESSAGE); 89 if (stemFile[stem-1]!= NULL) { 90 if (!sih[stem-1].Read (stemFile[stem-1])) { 91 fclose (stemFile[stem-1]); 92 stemFile[stem-1] = NULL; 93 //UnloadData (); return false; 94 } 95 96 fseek (stemFile[stem-1], sih[stem-1].block_idx_start, SEEK_SET); 97 if (!ReadBlockIdx (stemFile[stem-1], sii[stem-1])) { 98 fclose 
(stemFile[stem-1]); 99 stemFile[stem-1] = NULL; 100 //UnloadData (); return false; 101 } 102 } 103 } 104 105 105 106 // inverted file 106 107 invfFile = open_file (filename, INVF_SUFFIX, "rb", … … 125 126 } 126 127 127 if (stem1File != NULL) { 128 fclose (stem1File); stem1File = NULL; 129 } 130 if (stem2File != NULL) { 131 fclose (stem2File); stem2File = NULL; 132 } 133 if (stem3File != NULL) { 134 fclose (stem3File); stem3File = NULL; 128 for(int i=STEM_MIN;i <= STEM_MAX;i++) { 129 if (stemFile[i-1] != NULL) { 130 fclose (stemFile[i-1]); stemFile[i-1] = NULL; 131 } 135 132 } 136 133 -
trunk/indexers/mgpp/text/IndexData.h
r4205 r13477 28 28 #include "FragLevelConvert.h" 29 29 #include "Weights.h" 30 30 #include "stemmer.h" 31 31 32 32 class IndexData { … … 43 43 block_idx biTags; 44 44 45 /* [JFG - Mar 06: Accent folding patch] */ 45 46 // stem indexes 46 FILE *stem1File; 47 FILE *stem2File; 48 FILE *stem3File; 49 stem_idx_header sih1; 50 stem_idx_header sih2; 51 stem_idx_header sih3; 52 block_idx sii1; 53 block_idx sii2; 54 block_idx sii3; 55 47 FILE *stemFile[STEM_MAX]; 48 stem_idx_header sih[STEM_MAX]; 49 block_idx sii[STEM_MAX]; 50 56 51 // inverted file 57 52 FILE *invfFile; -
trunk/indexers/mgpp/text/Queryer.cpp
r12321 r13477 50 50 << "\t.c0/.c1\t\tcasefolding off/on\n" 51 51 << "\t.s0/.s1\t\tstemming off/on\n" 52 #ifdef ENABLE_ACCENTFOLD 53 << "\t.a0/.a1\t\taccentfolding off/on\n" 54 #endif 52 55 << "\t.o0/.o1\t\tshort output off/on\n" 53 56 << "\t.m\t\tset maxnumeric (enter the number at the prompt)\n\n" … … 91 94 92 95 // init the text system 93 96 TextData textData; 94 97 if (!textData.LoadData (basePath, textfilename)) { 95 98 FatalError (1, "Couldn't load text information for \"%s\"", textfilename); … … 134 137 //SetCStr(level, ""); 135 138 136 int defaultStemMethod = 0; // uncasefolded, unstemmed 139 int defaultStemMethod = 0; // uncasefolded, unstemmed, unaccentfolded 137 140 int defaultBoolCombine = 0; // OR 138 141 bool shortOutput = false; … … 207 210 } 208 211 else if (queryArray[1] == 'c') { // casefolding - on/off 209 if (queryArray[2] == '1') defaultStemMethod |= 1;210 else if (queryArray[2] == '0') defaultStemMethod &= 0xe;212 if (queryArray[2] == '1') defaultStemMethod |= STEM_CaseFolding; 213 else if (queryArray[2] == '0') defaultStemMethod &= (~STEM_CaseFolding); 211 214 else { 212 215 cout << "Error: please enter .c0 (case sensitive) or .c1 (casefolded)\n"; … … 214 217 } 215 218 else if (queryArray[1] == 's') { // stemming - on/off 216 if (queryArray[2] == '1') defaultStemMethod |= 2;217 else if (queryArray[2] == '0') defaultStemMethod &= 0xd;219 if (queryArray[2] == '1') defaultStemMethod |= STEM_Stemming; 220 else if (queryArray[2] == '0') defaultStemMethod &= (~STEM_Stemming); 218 221 else { 219 222 cout << "Error: please enter .s0 (unstemmed) or .s1 (stemmed)\n"; 220 223 } 221 224 } 225 #ifdef ENABLE_ACCENTFOLD 226 else if (queryArray[1] == 'a') { // accentfolding - on/off 227 if (queryArray[2] == '1') defaultStemMethod |= STEM_AccentFolding; 228 else if (queryArray[2] == '0') defaultStemMethod &= (~STEM_AccentFolding); 229 else { 230 cout << "Error: please enter .a0 (accent sensitive) or .a1 (accentfolded)\n"; 231 } 232 } 233 #endif 222 234 else if 
(queryArray[1] == 'o') { // output - short/long 223 235 if (queryArray[2] == '1') shortOutput = true; … … 274 286 275 287 // clean up, everybody clean up 276 288 textData.UnloadData (); 277 289 indexData.UnloadData (); 278 290 -
trunk/indexers/mgpp/text/Terms.cpp
r8692 r13477 210 210 vector<unsigned long> &equivWords) { 211 211 equivWords.erase (equivWords.begin(), equivWords.end()); 212 213 if (stemMethod == 0 || stemMethod==4 || stemMethod==5) { 212 213 // if the stem method specified is not a valid one (i.e. there was no appropriate stem index, then we set it to 0) 214 // unless we have partial matching, in which case we are not doing stem indexes anyway. 215 if (!(stemMethod & STEM_PARTIAL_MATCH) && indexData.stemFile[stemMethod-1] == NULL) { 216 cerr << "Stem index for method "<<stemMethod<< " was not built, so not doing stemming\n"; 217 stemMethod = 0; 218 } 219 /* [JFG - Mar 06: Accent folding patch] */ 220 /* use flag PARTIAL_MATCH */ 221 if (stemMethod == 0 || (stemMethod & STEM_PARTIAL_MATCH)) { 214 222 // don't need to stem the word, 215 223 // find the word number(s) for this term … … 218 226 word_block_dict_el wordDictEl; 219 227 wordDictEl.SetNumLevels (numLevels); 220 if (stemMethod == 0) {228 if (stemMethod == 0) { 221 229 if (SearchWordBlockDictEl (indexData.dictFile, indexData.biWords, 222 230 indexData.bdh.entries_per_wblk, … … 228 236 } else { 229 237 // partial matching, 230 PartialMatchSearchWordBlockDictEl (indexData.dictFile, indexData.biWords, indexData.bdh.entries_per_wblk, indexData.bdh.word_dict_size, numLevels, term, wordDictEl, equivWords, (stemMethod==5?true:false) ); 238 PartialMatchSearchWordBlockDictEl (indexData.dictFile, indexData.biWords, indexData.bdh.entries_per_wblk, indexData.bdh.word_dict_size, numLevels, term, wordDictEl, equivWords, (stemMethod & STEM_CaseFolding)? true : false); 239 // TODO: Accent Folding is not handled here!! 
231 240 return; 232 241 } … … 234 243 235 244 // need to stem this word and find it in the blocked stem index 236 237 unsigned char mgWord[MAXSTEMLEN + 1]; 245 unsigned char mgWord[MAXSTEMLEN + 1]; 238 246 UCArray stemTerm; 239 247 unsigned long stemmerNum = 0; 240 if (stemMethod == 1) stemmerNum = indexData.sih1.stemmer_num; 241 else if (stemMethod == 2) stemmerNum = indexData.sih2.stemmer_num; 242 else if (stemMethod == 3) stemmerNum = indexData.sih3.stemmer_num; 243 248 249 /* [JFG - Mar 06: Accent folding patch] */ 250 if(stemMethod > STEM_MAX) { 251 return; 252 //TODO: throw an error here 253 } 254 255 stemmerNum = indexData.sih[stemMethod-1].stemmer_num; 256 244 257 // convert the word to an "mg word" 245 258 mgWord[0] = term.size(); … … 247 260 248 261 // stem the word 249 stemmer (stemMethod, stemmerNum, mgWord); 250 262 mgpp_stemmer (stemMethod, stemmerNum, mgWord); 251 263 // convert the result back to a UCArray 252 264 stemTerm.insert (stemTerm.end(), &mgWord[1], &mgWord[1] + mgWord[0]); … … 256 268 unsigned long stemElNum; 257 269 bool result = false; 258 if (stemMethod == 1) { 259 result = SearchStemBlockDictEl (indexData.stem1File, 260 indexData.sii1, 261 indexData.sih1.entries_per_block, 262 indexData.sih1.dict_size, 270 271 /* [JFG - Mar 06: Accent folding patch] */ 272 result = SearchStemBlockDictEl (indexData.stemFile[stemMethod-1], 273 indexData.sii[stemMethod-1], 274 indexData.sih[stemMethod-1].entries_per_block, 275 indexData.sih[stemMethod-1].dict_size, 263 276 stemTerm, 264 277 stemDictEl, 265 278 stemElNum); 266 267 } else if (stemMethod == 2) { 268 result = SearchStemBlockDictEl (indexData.stem2File, 269 indexData.sii2, 270 indexData.sih2.entries_per_block, 271 indexData.sih2.dict_size, 272 stemTerm, 273 stemDictEl, 274 stemElNum); 275 276 } else if (stemMethod == 3) { 277 result = SearchStemBlockDictEl (indexData.stem3File, 278 indexData.sii3, 279 indexData.sih3.entries_per_block, 280 indexData.sih3.dict_size, 281 stemTerm, 282 stemDictEl, 
283 stemElNum); 284 } 285 279 286 280 if (result) { 287 281 equivWords = stemDictEl.equivWords; -
trunk/indexers/mgpp/text/mg_files.cpp
r8692 r13477 93 93 } 94 94 95 96 /* [JFG - Mar 06: Accent folding patch] */ 97 /* This generates a suffix for a file name. It places the name in the 98 buffer specified or if that is NULL, it uses a static buffer. 99 Please do not specify buffers smaller than 512 bytes or smaller than the data to be written. */ 100 char * 101 make_suffix (const char *suffix_format, const char suffix_arg, char *buffer) 102 { 103 static char suffix[512]; 104 if (!buffer) 105 buffer = suffix; 106 sprintf (buffer, suffix_format, suffix_arg); 107 return buffer; 108 } 109 95 110 96 111 -
trunk/indexers/mgpp/text/mg_files.h
r3365 r13477 50 50 #define MAGIC_WGHT_APPROX GEN_MAGIC('M','G','w', 0 ) 51 51 #define MAGIC_PARAGRAPH GEN_MAGIC('M','G','P', 0 ) 52 /* [RPAP - Jan 97: Stem Index Change] */ 53 #define MAGIC_STEM_1 GEN_MAGIC('M','G','s','1') 54 #define MAGIC_STEM_2 GEN_MAGIC('M','G','s','2') 55 #define MAGIC_STEM_3 GEN_MAGIC('M','G','s','3') 56 52 #define MAGIC_STEM_GEN(x) GEN_MAGIC('M', 'G', 's', x) 57 53 #define IS_MAGIC(a) ((((u_long)(a)) & 0xffff0000) == MAGIC_XXXX) 58 54 … … 219 215 220 216 /* [RPAP - Jan 97: Stem Index Change] */ 217 /* [JFG - Mar 06: Accent folding patch] 218 * Use the pattern with make_suffix */ 221 219 /* The casefolded index into the stemmed dictionary */ 222 220 #ifdef SHORT_SUFFIX 223 # define INVF_DICT_BLOCKED_1_SUFFIX ".ib1" 224 #else 225 # define INVF_DICT_BLOCKED_1_SUFFIX ".invf.dict.blocked.1" 226 #endif 227 228 /* [RPAP - Jan 97: Stem Index Change] */ 229 /* The stemmed index into the stemmed dictionary */ 230 #ifdef SHORT_SUFFIX 231 # define INVF_DICT_BLOCKED_2_SUFFIX ".ib2" 232 #else 233 # define INVF_DICT_BLOCKED_2_SUFFIX ".invf.dict.blocked.2" 234 #endif 235 236 /* [RPAP - Jan 97: Stem Index Change] */ 237 /* The casefolded and stemmed index into the stemmed dictionary */ 238 #ifdef SHORT_SUFFIX 239 # define INVF_DICT_BLOCKED_3_SUFFIX ".ib3" 240 #else 241 # define INVF_DICT_BLOCKED_3_SUFFIX ".invf.dict.blocked.3" 242 #endif 221 # define INVF_DICT_BLOCKED_SUFFIX_PAT ".ib%d" 222 #else 223 # define INVF_DICT_BLOCKED_SUFFIX_PAT ".invf.dict.blocked.%d" 224 #endif 225 243 226 244 227 /* [RPAP - Feb 97: WIN32 Port] */ … … 264 247 char *make_name (const char *name, const char *suffix, char *buffer); 265 248 266 249 /* [JFG - Mar 06: Accent folding patch] */ 250 /* This generates the suffix of a file. It places the name in the buffer 251 specified or if that is NULL it uses a static buffer. */ 252 char *make_suffix (const char *suffix_format, const char suffix_arg, char *buffer); 267 253 268 254 -
trunk/indexers/mgpp/text/mgpp_stem_idx.cpp
r9613 r13477 80 80 81 81 // stem the word 82 stemmer (stemMethod, stemmerNum, mgWord);82 mgpp_stemmer (stemMethod, stemmerNum, mgWord); 83 83 84 84 // convert the result back to a UCArray … … 101 101 int stemmerNum, 102 102 unsigned long entriesPerBlock) { 103 104 /* [JFG - Mar 06: Accent folding patch] */ 103 105 // Create appropriate stem index file 104 106 FILE *stemDictFile = NULL; 105 if (stemMethod == 1) { 106 stemDictFile = create_file (filename, INVF_DICT_BLOCKED_1_SUFFIX, 107 "wb", MAGIC_STEM_1, MG_ABORT); 108 } else if (stemMethod == 2) { 109 stemDictFile = create_file (filename, INVF_DICT_BLOCKED_2_SUFFIX, 110 "wb", MAGIC_STEM_2, MG_ABORT); 111 } else if (stemMethod == 3) { 112 stemDictFile = create_file (filename, INVF_DICT_BLOCKED_3_SUFFIX, 113 "wb", MAGIC_STEM_3, MG_ABORT); 114 } else { 107 if (stemMethod >= STEM_MIN && stemMethod <= STEM_MAX) { 108 char *suffix = make_suffix (INVF_DICT_BLOCKED_SUFFIX_PAT, stemMethod, NULL); 109 stemDictFile = create_file (filename, suffix, 110 "wb", MAGIC_STEM_GEN(stemMethod + '0'), MG_ABORT); 111 } 112 else { 115 113 FatalError (1, "Unknown stem method %d", stemMethod); 116 114 } … … 213 211 break; 214 212 case 'a': 215 stemmerNum = stemmernumber ((unsigned char *) optarg);213 stemmerNum = mgpp_stemmernumber ((unsigned char *) optarg); 216 214 break; 217 215 case 'h': 218 216 case '?': 219 217 fprintf (stderr, "usage: %s [-d directory] " 220 "[-b entries-per-block] [-h] -s 1|2|3 " 221 "[-a stemmer-method] -f name\n", argv[0]); 218 "[-b entries-per-block] [-h] -s 1|2|3", argv[0]); 219 #ifdef ENABLE_ACCENTFOLD 220 fprintf (stderr, "|4|5|6|7"); 221 #endif 222 fprintf (stderr, " [-a stemmer-method] -f name\n"); 222 223 exit (1); 223 224 } 224 225 } 225 226 226 if (stemMethod < 1 || stemMethod > 3) 227 FatalError (1, "Stem method must be 1, 2 or 3"); 228 227 /* [JFG - Mar 06: Accent folding patch] */ 228 if (stemMethod < STEM_MIN || stemMethod > STEM_MAX) 229 FatalError (1, "Stem method must be between %d and %d", 
STEM_MIN, STEM_MAX); 230 #ifndef ENABLE_ACCENTFOLD 231 if (stemMethod & STEM_AccentFolding) { 232 // accent folding not enabled 233 return -1; 234 } 235 #endif 229 236 // read in the dictionary and create the in memory dictionary 230 237 StemMapDict stemDict; -
trunk/indexers/mgpp/text/stemmer.cpp
r9613 r13477 22 22 #include "sysfuncs.h" 23 23 #include "stemmer.h" 24 25 24 #include "lovinstem.h" 26 25 #include "simplefrenchstem.h" 27 26 #include "unitool.h" 27 28 #ifdef ENABLE_ACCENTFOLD 29 /* [JFG - Mar 06: Accent folding patch] */ 30 #include "unac.h" 31 #endif 28 32 29 33 #define LOVINSTEMMER 0 … … 34 38 * making sure the final length doesn't exceed the original 35 39 * length */ 36 static void unicode_casefold (u_char *word) {40 static void mgpp_unicode_casefold (u_char *word) { 37 41 unsigned short out[256]; /* temp space */ 38 42 int i; … … 52 56 } 53 57 58 #ifdef ENABLE_ACCENTFOLD 59 /* [JFG - Mar 06: Accent folding patch] */ 60 /* ========================================================================= 61 * Function: unicode_accentfold 62 63 * Description: remove accents from characters 64 * Input: a word string with the length in the first byte 65 * Output: the unaccented word 66 * ========================================================================= */ 67 void mgpp_unicode_accentfold (unsigned char *word) { 68 size_t unac_size = 0; 69 char *unac = NULL; 54 70 55 int stemmernumber (u_char *stemmerdescription) { 71 72 unac_string("utf-8", (char*)word+1, word[0], &unac, &unac_size); 73 strncpy((char*)word+1, unac, word[0]+1); 74 word[0] = unac_size; 75 76 free(unac); 77 return; 78 } 79 #endif 80 81 int mgpp_stemmernumber (u_char *stemmerdescription) { 56 82 u_char descript[MAX_STEM_DESCRIPTION_LEN]; 57 83 int i; … … 85 111 * Method 2 - Stem. 86 112 * Method 3 - Case fold and stem. 87 * 113 * Method 4 - Accent fold 114 * Method 5 - Accent fold and case fold 115 * Method 6 - Accent fold and stem 116 * Method 7 - Accent fold, stem and case fold 117 88 118 * The stemmer number should be obtained using 89 119 * the stemmernumber function above. 
90 120 */ 91 121 void 92 stemmer (int method, int stemmer, u_char *word) {93 if (method & 1) {94 unicode_casefold (word);122 mgpp_stemmer (int method, int stemmer, u_char *word) { 123 if (method & STEM_CaseFolding) { 124 mgpp_unicode_casefold (word); 95 125 } 96 126 97 if (method & 2) { 127 #ifdef ENABLE_ACCENTFOLD 128 if (method & STEM_AccentFolding) { 129 mgpp_unicode_accentfold (word); 130 } 131 #endif 132 133 if (method & STEM_Stemming) { 98 134 switch (stemmer) { 99 135 case LOVINSTEMMER: lovinstem (word); -
trunk/indexers/mgpp/text/stemmer.h
r3365 r13477 23 23 #define STEMMER_H 24 24 25 #include "sysfuncs.h" 25 /* [RPAP - Jan 97: Stem Index Change] */ 26 /* [JFG - Mar 06: Accent folding patch] */ 27 enum stemMethods { 28 STEM_None = 0, 29 STEM_CaseFolding = 0x1, 30 STEM_Stemming = 0x2, 31 STEM_AccentFolding = 0x4, 32 }; 33 34 /* This is for the QueryParser */ 35 #define CHAR_FLAG_STEM_CaseFold 'i' // ignore case 36 #define CHAR_FLAG_STEM_NoCaseFold 'c' // case sensitive 37 #define CHAR_FLAG_STEM_Stemming 's' // stem words 38 #define CHAR_FLAG_STEM_NoStemming 'u' // do not stem words 39 #define CHAR_FLAG_STEM_AccentFold 'f' // accent fold 40 #define CHAR_FLAG_STEM_NoAccentFold 'a' // do no accent folding 41 #define CHAR_FLAG_STEM_Validator "icsufa" // all of the above 42 43 44 #define STEM_MIN 1 45 #define STEM_MAX (STEM_CaseFolding | STEM_Stemming | STEM_AccentFolding) 46 #define STEM_PARTIAL_MATCH (STEM_MAX+1) 47 #define STEM_INVALID (STEM_MAX+2) 26 48 27 49 #define STEMMER_MASK 3 … … 47 69 * stemmer description. 48 70 */ 49 int stemmernumber (u_char *stemmerdescription);71 int mgpp_stemmernumber (unsigned char *stemmerdescription); 50 72 51 73 /* … … 54 76 * Method 2 - Stem. 55 77 * Method 3 - Case fold and stem. 56 * 78 * Method 4 - Accent fold 79 * Method 5 - Case fold and accent fold 80 * Method 6 - Stem and accent fold 81 * Method 7 - Case fold, stem and accent fold 57 82 * The stemmer number should be obtained using function 58 83 * stemmernumber above. … … 61 86 extern "C" 62 87 #endif 63 void stemmer (int method, int stemmer, u_char * word);88 void mgpp_stemmer (int method, int stemmer, unsigned char * word); 64 89 65 90 #endif -
trunk/mgpp/text/IndexData.cpp
r8692 r13477 30 30 dictFile = NULL; 31 31 32 stem1File = NULL;33 stem2File = NULL;34 stem3File= NULL;32 /* [JFG - Mar 06: Accent folding patch] */ 33 for(int i=STEM_MIN;i <= STEM_MAX;i++) 34 stemFile[i-1] = NULL; 35 35 36 36 invfFile = NULL; … … 79 79 if (!ReadBlockIdx (dictFile, biTags)) { UnloadData (); return false; } 80 80 81 // blocked stem index 1 82 stem1File = open_file (filename, INVF_DICT_BLOCKED_1_SUFFIX, 83 "rb", MAGIC_STEM_1, MG_ABORT); 84 if (!sih1.Read (stem1File)) { UnloadData (); return false; } 85 86 fseek (stem1File, sih1.block_idx_start, SEEK_SET); 87 if (!ReadBlockIdx (stem1File, sii1)) { UnloadData (); return false; } 88 89 // blocked stem index 2 90 stem2File = open_file (filename, INVF_DICT_BLOCKED_2_SUFFIX, 91 "rb", MAGIC_STEM_2, MG_ABORT); 92 if (!sih2.Read (stem2File)) { UnloadData (); return false; } 93 94 fseek (stem2File, sih2.block_idx_start, SEEK_SET); 95 if (!ReadBlockIdx (stem2File, sii2)) { UnloadData (); return false; } 96 97 // blocked stem index 3 98 stem3File = open_file (filename, INVF_DICT_BLOCKED_3_SUFFIX, 99 "rb", MAGIC_STEM_3, MG_ABORT); 100 if (!sih3.Read (stem3File)) { UnloadData (); return false; } 101 102 fseek (stem3File, sih3.block_idx_start, SEEK_SET); 103 if (!ReadBlockIdx (stem3File, sii3)) { UnloadData (); return false; } 104 81 /* [JFG - Mar 06: Accent folding patch] */ 82 // read stem indexes 83 // [KJD - optional stemming patch] 84 // allow no stem indexes 85 for(int stem = STEM_MIN; stem <= STEM_MAX; stem++) { 86 char *suffix = make_suffix (INVF_DICT_BLOCKED_SUFFIX_PAT, stem, NULL); 87 stemFile[stem-1] = open_file (filename, suffix, 88 "rb", MAGIC_STEM_GEN(stem + '0'), MG_MESSAGE); 89 if (stemFile[stem-1]!= NULL) { 90 if (!sih[stem-1].Read (stemFile[stem-1])) { 91 fclose (stemFile[stem-1]); 92 stemFile[stem-1] = NULL; 93 //UnloadData (); return false; 94 } 95 96 fseek (stemFile[stem-1], sih[stem-1].block_idx_start, SEEK_SET); 97 if (!ReadBlockIdx (stemFile[stem-1], sii[stem-1])) { 98 fclose 
(stemFile[stem-1]); 99 stemFile[stem-1] = NULL; 100 //UnloadData (); return false; 101 } 102 } 103 } 104 105 105 106 // inverted file 106 107 invfFile = open_file (filename, INVF_SUFFIX, "rb", … … 125 126 } 126 127 127 if (stem1File != NULL) { 128 fclose (stem1File); stem1File = NULL; 129 } 130 if (stem2File != NULL) { 131 fclose (stem2File); stem2File = NULL; 132 } 133 if (stem3File != NULL) { 134 fclose (stem3File); stem3File = NULL; 128 for(int i=STEM_MIN;i <= STEM_MAX;i++) { 129 if (stemFile[i-1] != NULL) { 130 fclose (stemFile[i-1]); stemFile[i-1] = NULL; 131 } 135 132 } 136 133 -
trunk/mgpp/text/IndexData.h
r4205 r13477 28 28 #include "FragLevelConvert.h" 29 29 #include "Weights.h" 30 30 #include "stemmer.h" 31 31 32 32 class IndexData { … … 43 43 block_idx biTags; 44 44 45 /* [JFG - Mar 06: Accent folding patch] */ 45 46 // stem indexes 46 FILE *stem1File; 47 FILE *stem2File; 48 FILE *stem3File; 49 stem_idx_header sih1; 50 stem_idx_header sih2; 51 stem_idx_header sih3; 52 block_idx sii1; 53 block_idx sii2; 54 block_idx sii3; 55 47 FILE *stemFile[STEM_MAX]; 48 stem_idx_header sih[STEM_MAX]; 49 block_idx sii[STEM_MAX]; 50 56 51 // inverted file 57 52 FILE *invfFile; -
trunk/mgpp/text/Queryer.cpp
r12321 r13477 50 50 << "\t.c0/.c1\t\tcasefolding off/on\n" 51 51 << "\t.s0/.s1\t\tstemming off/on\n" 52 #ifdef ENABLE_ACCENTFOLD 53 << "\t.a0/.a1\t\taccentfolding off/on\n" 54 #endif 52 55 << "\t.o0/.o1\t\tshort output off/on\n" 53 56 << "\t.m\t\tset maxnumeric (enter the number at the prompt)\n\n" … … 91 94 92 95 // init the text system 93 96 TextData textData; 94 97 if (!textData.LoadData (basePath, textfilename)) { 95 98 FatalError (1, "Couldn't load text information for \"%s\"", textfilename); … … 134 137 //SetCStr(level, ""); 135 138 136 int defaultStemMethod = 0; // uncasefolded, unstemmed 139 int defaultStemMethod = 0; // uncasefolded, unstemmed, unaccentfolded 137 140 int defaultBoolCombine = 0; // OR 138 141 bool shortOutput = false; … … 207 210 } 208 211 else if (queryArray[1] == 'c') { // casefolding - on/off 209 if (queryArray[2] == '1') defaultStemMethod |= 1;210 else if (queryArray[2] == '0') defaultStemMethod &= 0xe;212 if (queryArray[2] == '1') defaultStemMethod |= STEM_CaseFolding; 213 else if (queryArray[2] == '0') defaultStemMethod &= (~STEM_CaseFolding); 211 214 else { 212 215 cout << "Error: please enter .c0 (case sensitive) or .c1 (casefolded)\n"; … … 214 217 } 215 218 else if (queryArray[1] == 's') { // stemming - on/off 216 if (queryArray[2] == '1') defaultStemMethod |= 2;217 else if (queryArray[2] == '0') defaultStemMethod &= 0xd;219 if (queryArray[2] == '1') defaultStemMethod |= STEM_Stemming; 220 else if (queryArray[2] == '0') defaultStemMethod &= (~STEM_Stemming); 218 221 else { 219 222 cout << "Error: please enter .s0 (unstemmed) or .s1 (stemmed)\n"; 220 223 } 221 224 } 225 #ifdef ENABLE_ACCENTFOLD 226 else if (queryArray[1] == 'a') { // accentfolding - on/off 227 if (queryArray[2] == '1') defaultStemMethod |= STEM_AccentFolding; 228 else if (queryArray[2] == '0') defaultStemMethod &= (~STEM_AccentFolding); 229 else { 230 cout << "Error: please enter .a0 (accent sensitive) or .a1 (accentfolded)\n"; 231 } 232 } 233 #endif 222 234 else if 
(queryArray[1] == 'o') { // output - short/long 223 235 if (queryArray[2] == '1') shortOutput = true; … … 274 286 275 287 // clean up, everybody clean up 276 288 textData.UnloadData (); 277 289 indexData.UnloadData (); 278 290 -
trunk/mgpp/text/Terms.cpp
r8692 r13477 210 210 vector<unsigned long> &equivWords) { 211 211 equivWords.erase (equivWords.begin(), equivWords.end()); 212 213 if (stemMethod == 0 || stemMethod==4 || stemMethod==5) { 212 213 // if the stem method specified is not a valid one (i.e. there was no appropriate stem index, then we set it to 0) 214 // unless we have partial matching, in which case we are not doing stem indexes anyway. 215 if (!(stemMethod & STEM_PARTIAL_MATCH) && indexData.stemFile[stemMethod-1] == NULL) { 216 cerr << "Stem index for method "<<stemMethod<< " was not built, so not doing stemming\n"; 217 stemMethod = 0; 218 } 219 /* [JFG - Mar 06: Accent folding patch] */ 220 /* use flag PARTIAL_MATCH */ 221 if (stemMethod == 0 || (stemMethod & STEM_PARTIAL_MATCH)) { 214 222 // don't need to stem the word, 215 223 // find the word number(s) for this term … … 218 226 word_block_dict_el wordDictEl; 219 227 wordDictEl.SetNumLevels (numLevels); 220 if (stemMethod == 0) {228 if (stemMethod == 0) { 221 229 if (SearchWordBlockDictEl (indexData.dictFile, indexData.biWords, 222 230 indexData.bdh.entries_per_wblk, … … 228 236 } else { 229 237 // partial matching, 230 PartialMatchSearchWordBlockDictEl (indexData.dictFile, indexData.biWords, indexData.bdh.entries_per_wblk, indexData.bdh.word_dict_size, numLevels, term, wordDictEl, equivWords, (stemMethod==5?true:false) ); 238 PartialMatchSearchWordBlockDictEl (indexData.dictFile, indexData.biWords, indexData.bdh.entries_per_wblk, indexData.bdh.word_dict_size, numLevels, term, wordDictEl, equivWords, (stemMethod & STEM_CaseFolding)? true : false); 239 // TODO: Accent Folding is not handled here!! 
231 240 return; 232 241 } … … 234 243 235 244 // need to stem this word and find it in the blocked stem index 236 237 unsigned char mgWord[MAXSTEMLEN + 1]; 245 unsigned char mgWord[MAXSTEMLEN + 1]; 238 246 UCArray stemTerm; 239 247 unsigned long stemmerNum = 0; 240 if (stemMethod == 1) stemmerNum = indexData.sih1.stemmer_num; 241 else if (stemMethod == 2) stemmerNum = indexData.sih2.stemmer_num; 242 else if (stemMethod == 3) stemmerNum = indexData.sih3.stemmer_num; 243 248 249 /* [JFG - Mar 06: Accent folding patch] */ 250 if(stemMethod > STEM_MAX) { 251 return; 252 //TODO: throw an error here 253 } 254 255 stemmerNum = indexData.sih[stemMethod-1].stemmer_num; 256 244 257 // convert the word to an "mg word" 245 258 mgWord[0] = term.size(); … … 247 260 248 261 // stem the word 249 stemmer (stemMethod, stemmerNum, mgWord); 250 262 mgpp_stemmer (stemMethod, stemmerNum, mgWord); 251 263 // convert the result back to a UCArray 252 264 stemTerm.insert (stemTerm.end(), &mgWord[1], &mgWord[1] + mgWord[0]); … … 256 268 unsigned long stemElNum; 257 269 bool result = false; 258 if (stemMethod == 1) { 259 result = SearchStemBlockDictEl (indexData.stem1File, 260 indexData.sii1, 261 indexData.sih1.entries_per_block, 262 indexData.sih1.dict_size, 270 271 /* [JFG - Mar 06: Accent folding patch] */ 272 result = SearchStemBlockDictEl (indexData.stemFile[stemMethod-1], 273 indexData.sii[stemMethod-1], 274 indexData.sih[stemMethod-1].entries_per_block, 275 indexData.sih[stemMethod-1].dict_size, 263 276 stemTerm, 264 277 stemDictEl, 265 278 stemElNum); 266 267 } else if (stemMethod == 2) { 268 result = SearchStemBlockDictEl (indexData.stem2File, 269 indexData.sii2, 270 indexData.sih2.entries_per_block, 271 indexData.sih2.dict_size, 272 stemTerm, 273 stemDictEl, 274 stemElNum); 275 276 } else if (stemMethod == 3) { 277 result = SearchStemBlockDictEl (indexData.stem3File, 278 indexData.sii3, 279 indexData.sih3.entries_per_block, 280 indexData.sih3.dict_size, 281 stemTerm, 282 stemDictEl, 
283 stemElNum); 284 } 285 279 286 280 if (result) { 287 281 equivWords = stemDictEl.equivWords; -
trunk/mgpp/text/mg_files.cpp
r8692 r13477 93 93 } 94 94 95 96 /* [JFG - Mar 06: Accent folding patch] */ 97 /* This generates a suffix for a file name. It places the name in the 98 buffer specified or if that is NULL, it uses a static buffer. 99 Please do not specify buffers smaller than 512 bytes or smaller than the data to be written. */ 100 char * 101 make_suffix (const char *suffix_format, const char suffix_arg, char *buffer) 102 { 103 static char suffix[512]; 104 if (!buffer) 105 buffer = suffix; 106 sprintf (buffer, suffix_format, suffix_arg); 107 return buffer; 108 } 109 95 110 96 111 -
trunk/mgpp/text/mg_files.h
r3365 r13477 50 50 #define MAGIC_WGHT_APPROX GEN_MAGIC('M','G','w', 0 ) 51 51 #define MAGIC_PARAGRAPH GEN_MAGIC('M','G','P', 0 ) 52 /* [RPAP - Jan 97: Stem Index Change] */ 53 #define MAGIC_STEM_1 GEN_MAGIC('M','G','s','1') 54 #define MAGIC_STEM_2 GEN_MAGIC('M','G','s','2') 55 #define MAGIC_STEM_3 GEN_MAGIC('M','G','s','3') 56 52 #define MAGIC_STEM_GEN(x) GEN_MAGIC('M', 'G', 's', x) 57 53 #define IS_MAGIC(a) ((((u_long)(a)) & 0xffff0000) == MAGIC_XXXX) 58 54 … … 219 215 220 216 /* [RPAP - Jan 97: Stem Index Change] */ 217 /* [JFG - Mar 06: Accent folding patch] 218 * Use the pattern with make_suffix */ 221 219 /* The casefolded index into the stemmed dictionary */ 222 220 #ifdef SHORT_SUFFIX 223 # define INVF_DICT_BLOCKED_1_SUFFIX ".ib1" 224 #else 225 # define INVF_DICT_BLOCKED_1_SUFFIX ".invf.dict.blocked.1" 226 #endif 227 228 /* [RPAP - Jan 97: Stem Index Change] */ 229 /* The stemmed index into the stemmed dictionary */ 230 #ifdef SHORT_SUFFIX 231 # define INVF_DICT_BLOCKED_2_SUFFIX ".ib2" 232 #else 233 # define INVF_DICT_BLOCKED_2_SUFFIX ".invf.dict.blocked.2" 234 #endif 235 236 /* [RPAP - Jan 97: Stem Index Change] */ 237 /* The casefolded and stemmed index into the stemmed dictionary */ 238 #ifdef SHORT_SUFFIX 239 # define INVF_DICT_BLOCKED_3_SUFFIX ".ib3" 240 #else 241 # define INVF_DICT_BLOCKED_3_SUFFIX ".invf.dict.blocked.3" 242 #endif 221 # define INVF_DICT_BLOCKED_SUFFIX_PAT ".ib%d" 222 #else 223 # define INVF_DICT_BLOCKED_SUFFIX_PAT ".invf.dict.blocked.%d" 224 #endif 225 243 226 244 227 /* [RPAP - Feb 97: WIN32 Port] */ … … 264 247 char *make_name (const char *name, const char *suffix, char *buffer); 265 248 266 249 /* [JFG - Mar 06: Accent folding patch] */ 250 /* This generates the suffix of a file. It places the name in the buffer 251 specified or if that is NULL it uses a static buffer. */ 252 char *make_suffix (const char *suffix_format, const char suffix_arg, char *buffer); 267 253 268 254 -
trunk/mgpp/text/mgpp_stem_idx.cpp
r9613 r13477 80 80 81 81 // stem the word 82 stemmer (stemMethod, stemmerNum, mgWord);82 mgpp_stemmer (stemMethod, stemmerNum, mgWord); 83 83 84 84 // convert the result back to a UCArray … … 101 101 int stemmerNum, 102 102 unsigned long entriesPerBlock) { 103 104 /* [JFG - Mar 06: Accent folding patch] */ 103 105 // Create appropriate stem index file 104 106 FILE *stemDictFile = NULL; 105 if (stemMethod == 1) { 106 stemDictFile = create_file (filename, INVF_DICT_BLOCKED_1_SUFFIX, 107 "wb", MAGIC_STEM_1, MG_ABORT); 108 } else if (stemMethod == 2) { 109 stemDictFile = create_file (filename, INVF_DICT_BLOCKED_2_SUFFIX, 110 "wb", MAGIC_STEM_2, MG_ABORT); 111 } else if (stemMethod == 3) { 112 stemDictFile = create_file (filename, INVF_DICT_BLOCKED_3_SUFFIX, 113 "wb", MAGIC_STEM_3, MG_ABORT); 114 } else { 107 if (stemMethod >= STEM_MIN && stemMethod <= STEM_MAX) { 108 char *suffix = make_suffix (INVF_DICT_BLOCKED_SUFFIX_PAT, stemMethod, NULL); 109 stemDictFile = create_file (filename, suffix, 110 "wb", MAGIC_STEM_GEN(stemMethod + '0'), MG_ABORT); 111 } 112 else { 115 113 FatalError (1, "Unknown stem method %d", stemMethod); 116 114 } … … 213 211 break; 214 212 case 'a': 215 stemmerNum = stemmernumber ((unsigned char *) optarg);213 stemmerNum = mgpp_stemmernumber ((unsigned char *) optarg); 216 214 break; 217 215 case 'h': 218 216 case '?': 219 217 fprintf (stderr, "usage: %s [-d directory] " 220 "[-b entries-per-block] [-h] -s 1|2|3 " 221 "[-a stemmer-method] -f name\n", argv[0]); 218 "[-b entries-per-block] [-h] -s 1|2|3", argv[0]); 219 #ifdef ENABLE_ACCENTFOLD 220 fprintf (stderr, "|4|5|6|7"); 221 #endif 222 fprintf (stderr, " [-a stemmer-method] -f name\n"); 222 223 exit (1); 223 224 } 224 225 } 225 226 226 if (stemMethod < 1 || stemMethod > 3) 227 FatalError (1, "Stem method must be 1, 2 or 3"); 228 227 /* [JFG - Mar 06: Accent folding patch] */ 228 if (stemMethod < STEM_MIN || stemMethod > STEM_MAX) 229 FatalError (1, "Stem method must be between %d and %d", 
STEM_MIN, STEM_MAX); 230 #ifndef ENABLE_ACCENTFOLD 231 if (stemMethod & STEM_AccentFolding) { 232 // accent folding not enabled 233 return -1; 234 } 235 #endif 229 236 // read in the dictionary and create the in memory dictionary 230 237 StemMapDict stemDict; -
trunk/mgpp/text/stemmer.cpp
r9613 r13477 22 22 #include "sysfuncs.h" 23 23 #include "stemmer.h" 24 25 24 #include "lovinstem.h" 26 25 #include "simplefrenchstem.h" 27 26 #include "unitool.h" 27 28 #ifdef ENABLE_ACCENTFOLD 29 /* [JFG - Mar 06: Accent folding patch] */ 30 #include "unac.h" 31 #endif 28 32 29 33 #define LOVINSTEMMER 0 … … 34 38 * making sure the final length doesn't exceed the original 35 39 * length */ 36 static void unicode_casefold (u_char *word) {40 static void mgpp_unicode_casefold (u_char *word) { 37 41 unsigned short out[256]; /* temp space */ 38 42 int i; … … 52 56 } 53 57 58 #ifdef ENABLE_ACCENTFOLD 59 /* [JFG - Mar 06: Accent folding patch] */ 60 /* ========================================================================= 61 * Function: unicode_accentfold 62 63 * Description: remove accents from characters 64 * Input: a word string with the length in the first byte 65 * Output: the unaccented word 66 * ========================================================================= */ 67 void mgpp_unicode_accentfold (unsigned char *word) { 68 size_t unac_size = 0; 69 char *unac = NULL; 54 70 55 int stemmernumber (u_char *stemmerdescription) { 71 72 unac_string("utf-8", (char*)word+1, word[0], &unac, &unac_size); 73 strncpy((char*)word+1, unac, word[0]+1); 74 word[0] = unac_size; 75 76 free(unac); 77 return; 78 } 79 #endif 80 81 int mgpp_stemmernumber (u_char *stemmerdescription) { 56 82 u_char descript[MAX_STEM_DESCRIPTION_LEN]; 57 83 int i; … … 85 111 * Method 2 - Stem. 86 112 * Method 3 - Case fold and stem. 87 * 113 * Method 4 - Accent fold 114 * Method 5 - Accent fold and case fold 115 * Method 6 - Accent fold and stem 116 * Method 7 - Accent fold, stem and case fold 117 88 118 * The stemmer number should be obtained using 89 119 * the stemmernumber function above. 
90 120 */ 91 121 void 92 stemmer (int method, int stemmer, u_char *word) {93 if (method & 1) {94 unicode_casefold (word);122 mgpp_stemmer (int method, int stemmer, u_char *word) { 123 if (method & STEM_CaseFolding) { 124 mgpp_unicode_casefold (word); 95 125 } 96 126 97 if (method & 2) { 127 #ifdef ENABLE_ACCENTFOLD 128 if (method & STEM_AccentFolding) { 129 mgpp_unicode_accentfold (word); 130 } 131 #endif 132 133 if (method & STEM_Stemming) { 98 134 switch (stemmer) { 99 135 case LOVINSTEMMER: lovinstem (word); -
trunk/mgpp/text/stemmer.h
r3365 r13477 23 23 #define STEMMER_H 24 24 25 #include "sysfuncs.h" 25 /* [RPAP - Jan 97: Stem Index Change] */ 26 /* [JFG - Mar 06: Accent folding patch] */ 27 enum stemMethods { 28 STEM_None = 0, 29 STEM_CaseFolding = 0x1, 30 STEM_Stemming = 0x2, 31 STEM_AccentFolding = 0x4, 32 }; 33 34 /* This is for the QueryParser */ 35 #define CHAR_FLAG_STEM_CaseFold 'i' // ignore case 36 #define CHAR_FLAG_STEM_NoCaseFold 'c' // case sensitive 37 #define CHAR_FLAG_STEM_Stemming 's' // stem words 38 #define CHAR_FLAG_STEM_NoStemming 'u' // do not stem words 39 #define CHAR_FLAG_STEM_AccentFold 'f' // accent fold 40 #define CHAR_FLAG_STEM_NoAccentFold 'a' // do no accent folding 41 #define CHAR_FLAG_STEM_Validator "icsufa" // all of the above 42 43 44 #define STEM_MIN 1 45 #define STEM_MAX (STEM_CaseFolding | STEM_Stemming | STEM_AccentFolding) 46 #define STEM_PARTIAL_MATCH (STEM_MAX+1) 47 #define STEM_INVALID (STEM_MAX+2) 26 48 27 49 #define STEMMER_MASK 3 … … 47 69 * stemmer description. 48 70 */ 49 int stemmernumber (u_char *stemmerdescription);71 int mgpp_stemmernumber (unsigned char *stemmerdescription); 50 72 51 73 /* … … 54 76 * Method 2 - Stem. 55 77 * Method 3 - Case fold and stem. 56 * 78 * Method 4 - Accent fold 79 * Method 5 - Case fold and accent fold 80 * Method 6 - Stem and accent fold 81 * Method 7 - Case fold, stem and accent fold 57 82 * The stemmer number should be obtained using function 58 83 * stemmernumber above. … … 61 86 extern "C" 62 87 #endif 63 void stemmer (int method, int stemmer, u_char * word);88 void mgpp_stemmer (int method, int stemmer, unsigned char * word); 64 89 65 90 #endif
Note: See TracChangeset for help on using the changeset viewer.