Changeset 34
- Timestamp:
- 1998-11-25T20:55:52+13:00 (26 years ago)
- Location:
- trunk/gsdl/packages/mg-1.3d
- Files:
-
- 4 added
- 4 deleted
- 35 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/packages/mg-1.3d/lib/Makefile.in
r29 r34 76 76 77 77 HEADERS = \ 78 bitio_m_random.h filestats.h local_strings.h stem.h \78 bitio_m_random.h filestats.h local_strings.h lovinstem.h \ 79 79 bitio_m_stdio.h getopt.h memlib.h \ 80 80 bitio_gen.h bitio_mem.h heap.h messages.h \ … … 83 83 bitio_m_mems.h bitio_stdio.h huffman_stdio.h sptree.h \ 84 84 regex.h rx.h pathmax.h getpagesize.h \ 85 random.h frenchstem.h85 random.h simplefrenchstem.h 86 86 87 87 SOURCES = \ … … 90 90 bitio_gen.c filestats.c huffman_mem.c perf_hash.c \ 91 91 bitio_mem.c getopt.c huffman_stdio.c sptree.c \ 92 bitio_mems.c getopt1.c local_strings.c stem.c \92 bitio_mems.c getopt1.c local_strings.c lovinstem.c \ 93 93 timing.c regex.c rx.c \ 94 94 alloca.c error.c xmalloc.c strstr.c \ 95 95 gmalloc.c ftruncate.c strcasecmp.c random.c \ 96 frenchstem.c96 simplefrenchstem.c 97 97 98 98 OBJECTS = @ALLOCA@ @LIBOBJS@ \ … … 101 101 bitio_gen$o filestats$o huffman_mem$o perf_hash$o \ 102 102 bitio_mem$o getopt$o huffman_stdio$o sptree$o \ 103 bitio_mems$o getopt1$o local_strings$o stem$o \103 bitio_mems$o getopt1$o local_strings$o lovinstem$o \ 104 104 timing$o regex$o rx$o \ 105 105 error$o xmalloc$o \ 106 gmalloc$o random$o frenchstem$o106 gmalloc$o random$o simplefrenchstem$o 107 107 108 108 DISTFILES = Makefile.in $(HEADERS) $(SOURCES) -
trunk/gsdl/packages/mg-1.3d/lib/WIN32.MAK
r29 r34 47 47 48 48 HEADERS = \ 49 bitio_m_random.h filestats.h local_strings.h stem.h \49 bitio_m_random.h filestats.h local_strings.h lovinstem.h \ 50 50 bitio_m_stdio.h getopt.h memlib.h \ 51 51 bitio_gen.h bitio_mem.h heap.h messages.h \ … … 54 54 bitio_m_mems.h bitio_stdio.h huffman_stdio.h sptree.h \ 55 55 rx.h pathmax.h getpagesize.h \ 56 random.h win32in.h frenchstem.h56 random.h win32in.h simplefrenchstem.h 57 57 58 58 SOURCES = \ … … 61 61 bitio_gen.c filestats.c huffman_mem.c perf_hash.c \ 62 62 bitio_mem.c getopt.c huffman_stdio.c sptree.c \ 63 bitio_mems.c getopt1.c local_strings.c stem.c \63 bitio_mems.c getopt1.c local_strings.c lovinstem.c \ 64 64 timing.c rx.c \ 65 65 alloca.c error.c xmalloc.c strstr.c \ 66 66 ftruncate.c strcasecmp.c random.c win32in.c \ 67 frenchstem.c67 simplefrenchstem.c 68 68 69 69 OBJECTS = rx$o \ … … 72 72 bitio_gen$o filestats$o huffman_mem$o perf_hash$o \ 73 73 bitio_mem$o getopt$o huffman_stdio$o sptree$o \ 74 bitio_mems$o getopt1$o local_strings$o stem$o \74 bitio_mems$o getopt1$o local_strings$o lovinstem$o \ 75 75 timing$o \ 76 76 error$o xmalloc$o \ 77 random$o win32in$o frenchstem$o77 random$o win32in$o simplefrenchstem$o 78 78 79 79 DISTFILES = Makefile.in $(HEADERS) $(SOURCES) -
trunk/gsdl/packages/mg-1.3d/src/text/bool_parser.c
r26 r34 36 36 static bool_tree_node *tree_base = NULL; 37 37 static TermList **term_list; 38 static int stemmer_num; 38 39 static int stem_method; 39 40 /* [RPAP - Jan 97: Stem Index Change] */ … … 258 259 259 260 bcopy ((char *) word, (char *) sWord, *word + 1); 260 stemmer (stem_to_apply, s Word);261 stemmer (stem_to_apply, stemmer_num, sWord); 261 262 262 263 if (stem_to_apply == 0 || !indexed || p__sd == NULL) … … 380 381 bool_tree_node * 381 382 ParseBool(char *query_line, int query_len, 382 TermList **the_term_list, int the_stem_method, int *res, 383 TermList **the_term_list, int the_stemmer_num, int the_stem_method, 384 int *res, 383 385 stemmed_dict * the_sd, int is_indexed, /* [RPAP - Jan 97: Stem Index Change] */ 384 386 QueryTermList **the_query_term_list) /* [RPAP - Feb 97: Term Frequency] */ … … 386 388 /* global variables to be accessed by bison/yacc created parser */ 387 389 term_list = the_term_list; 390 stemmer_num = the_stemmer_num; 388 391 stem_method = the_stem_method; 389 392 ch_buf = query_line; … … 551 554 case 2: 552 555 #line 79 "bool_parser.y" 553 { yyval.node = CreateBoolTermNode(term_list, yyvsp[0].text, 1, word_num, count, doc_count, invf_ptr, invf_len ); }556 { yyval.node = CreateBoolTermNode(term_list, yyvsp[0].text, 1, word_num, count, doc_count, invf_ptr, invf_len, stemmer_num); } 554 557 break; 555 558 case 3: -
trunk/gsdl/packages/mg-1.3d/src/text/bool_parser.h
r13 r34 24 24 /* 25 25 $Log$ 26 Revision 1.2 1998/11/25 07:55:40 rjmcnab 27 28 Modified mg so that you can specify the stemmer you want 29 to use via a command line option. You specify it to 30 mg_passes during the build process. The number of the 31 stemmer that you used is stored within the inverted 32 dictionary header and the stemmed dictionary header so 33 the correct stemmer is used in later stages of building 34 and querying. 35 26 36 Revision 1.1 1998/11/17 09:34:26 rjmcnab 27 37 *** empty log message *** … … 47 57 48 58 bool_tree_node *ParseBool (char *query_line, int query_len, 49 TermList ** the_term_list, int the_stem_method, int *res, 59 TermList ** the_term_list, int the_stemmer_num, 60 int the_stem_method, int *res, 50 61 stemmed_dict * the_sd, int is_indexed, /* [RPAP - Jan 97: Stem Index Change] */ 51 62 QueryTermList ** the_query_term_list); /* [RPAP - Feb 97: Term Frequency] */ -
trunk/gsdl/packages/mg-1.3d/src/text/bool_query.c
r13 r34 313 313 int res = 0; 314 314 315 tree = ParseBool (Query, MAXLINEBUFFERLEN, &(qd->TL), stem_method, &res, 315 tree = ParseBool (Query, MAXLINEBUFFERLEN, &(qd->TL), 316 qd->sd->sdh.stemmer_num, stem_method, &res, 316 317 qd->sd, qd->sd->sdh.indexed, /* [RPAP - Jan 97: Stem Index Change] */ 317 318 &(qd->QTL)); /* [RPAP - Feb 97: Term Frequency] */ -
trunk/gsdl/packages/mg-1.3d/src/text/bool_tester.c
r13 r34 24 24 /* 25 25 $Log$ 26 Revision 1.2 1998/11/25 07:55:41 rjmcnab 27 28 Modified mg to that you can specify the stemmer you want 29 to use via a command line option. You specify it to 30 mg_passes during the build process. The number of the 31 stemmer that you used is stored within the inverted 32 dictionary header and the stemmed dictionary header so 33 the correct stemmer is used in later stages of building 34 and querying. 35 26 36 Revision 1.1 1998/11/17 09:34:29 rjmcnab 27 37 *** empty log message *** … … 57 67 58 68 #define MAX_LINE_LEN 255 69 #define STEMMER_NUM 0 /* Lovin's stemmer */ 59 70 #define STEM_METHOD 3 60 71 … … 74 85 * ========================================================================= */ 75 86 76 void 77 main (int argc, char *argv[]) 87 int main (int argc, char *argv[]) 78 88 { 79 89 bool_tree_node *tree = NULL; … … 93 103 len = strlen (line) - 1; /* -1 => ignore the \n */ 94 104 95 tree = ParseBool (line, len, &term_list, STEM _METHOD, &res,105 tree = ParseBool (line, len, &term_list, STEMMER_NUM, STEM_METHOD, &res, 96 106 NULL, 0, /* [RPAP - Jan 97: Stem Index Change] */ 97 107 &query_term_list); /* [RPAP - Feb 97: Term Frequency] */ … … 141 151 142 152 } 143 153 return 0; 144 154 } 145 155 -
trunk/gsdl/packages/mg-1.3d/src/text/bool_tree.c
r13 r34 56 56 bool_tree_node * 57 57 CreateBoolTermNode (TermList ** tl, char *text, int Count, int word_num, 58 u_long count, u_long doc_count, u_long invf_ptr, u_long invf_len) /* [RPAP - Feb 97: Term Frequency] */ 58 u_long count, u_long doc_count, u_long invf_ptr, u_long invf_len, /* [RPAP - Feb 97: Term Frequency] */ 59 int stemmer_num) 59 60 { 60 61 bool_tree_node *n = NULL; … … 63 64 n = CreateBoolNode (N_term); 64 65 65 BOOL_TERM (n) = AddTerm (tl, (u_char *) text, Count, word_num, count, doc_count, invf_ptr, invf_len); /* [RPAP - Feb 97: Term Frequency] */ 66 BOOL_TERM (n) = AddTerm (tl, (u_char *) text, Count, word_num, count, doc_count, 67 invf_ptr, invf_len, stemmer_num); /* [RPAP - Feb 97: Term Frequency] */ 66 68 67 69 return n; -
trunk/gsdl/packages/mg-1.3d/src/text/bool_tree.h
r13 r34 24 24 /* 25 25 $Log$ 26 Revision 1.2 1998/11/25 07:55:42 rjmcnab 27 28 Modified mg so that you can specify the stemmer you want 29 to use via a command line option. You specify it to 30 mg_passes during the build process. The number of the 31 stemmer that you used is stored within the inverted 32 dictionary header and the stemmed dictionary header so 33 the correct stemmer is used in later stages of building 34 and querying. 35 26 36 Revision 1.1 1998/11/17 09:34:31 rjmcnab 27 37 *** empty log message *** … … 86 96 bool_tree_node *CopyBoolTree (bool_tree_node * tree); 87 97 bool_tree_node *CreateBoolTermNode (TermList ** tl, char *text, int Count, int word_num, 88 u_long count, u_long doc_count, u_long invf_ptr, u_long invf_len); 98 u_long count, u_long doc_count, u_long invf_ptr, 99 u_long invf_len, int stemmer_num); 89 100 bool_tree_node *CreateBoolTreeNode (N_Tag tag, 90 101 bool_tree_node * left, bool_tree_node * right); -
trunk/gsdl/packages/mg-1.3d/src/text/build.h
r13 r34 156 156 */ 157 157 158 159 extern int stemmer_num; 160 /* 161 * The stemmer to use for stemming words for the inverted file. 162 * see stemmer.h 163 */ 164 158 165 extern int stem_method; 159 166 /* -
trunk/gsdl/packages/mg-1.3d/src/text/invf.h
r13 r34 38 38 unsigned long static_num_of_docs; 39 39 unsigned long num_of_words; 40 unsigned long stemmer_num; 40 41 unsigned long stem_method; 41 42 }; … … 51 52 unsigned long static_num_of_docs; 52 53 unsigned long num_of_words; 54 unsigned long stemmer_num; 53 55 unsigned long stem_method; 54 56 unsigned long indexed; /* [RPAP - Jan 97: Stem Index Change] */ -
trunk/gsdl/packages/mg-1.3d/src/text/ivf.pass1.c
r13 r34 44 44 /* 45 45 $Log$ 46 Revision 1.2 1998/11/25 07:55:43 rjmcnab 47 48 Modified mg to that you can specify the stemmer you want 49 to use via a command line option. You specify it to 50 mg_passes during the build process. The number of the 51 stemmer that you used is stored within the inverted 52 dictionary header and the stemmed dictionary header so 53 the correct stemmer is used in later stages of building 54 and querying. 55 46 56 Revision 1.1 1998/11/17 09:34:44 rjmcnab 47 57 *** empty log message *** … … 292 302 PARSE_STEM_WORD (Word, s_in, end); 293 303 294 stemmer (stem_method, Word);304 stemmer (stem_method, stemmer_num, Word); 295 305 if (SkipSGML) 296 306 PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end); … … 620 630 HTONUL2(callnum, idh.static_num_of_docs); 621 631 HTONUL2(words_read, idh.num_of_words); 632 HTONUL2(stemmer_num, idh.stemmer_num); 622 633 HTONUL2(stem_method, idh.stem_method); 623 634 -
trunk/gsdl/packages/mg-1.3d/src/text/ivf.pass2.c
r13 r34 24 24 /* 25 25 $Log$ 26 Revision 1.2 1998/11/25 07:55:43 rjmcnab 27 28 Modified mg to that you can specify the stemmer you want 29 to use via a command line option. You specify it to 30 mg_passes during the build process. The number of the 31 stemmer that you used is stored within the inverted 32 dictionary header and the stemmed dictionary header so 33 the correct stemmer is used in later stages of building 34 and querying. 35 26 36 Revision 1.1 1998/11/17 09:34:45 rjmcnab 27 37 *** empty log message *** … … 58 68 #include "words.h" 59 69 #include "hash.h" 60 #include "stemmer.h"61 70 62 71 /* [RPAP - Feb 97: WIN32 Port] */ … … 347 356 NTOHUL(idh.static_num_of_docs); 348 357 NTOHUL(idh.num_of_words); 358 NTOHUL(idh.stemmer_num); 349 359 NTOHUL(idh.stem_method); 350 360 … … 779 789 780 790 PARSE_STEM_WORD (Word, s_in, end); 781 stemmer (idh.stem_method, Word);791 stemmer (idh.stem_method, idh.stemmer_num, Word); 782 792 if (SkipSGML) 783 793 PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end); -
trunk/gsdl/packages/mg-1.3d/src/text/mg_compression_dict.c
r13 r34 45 45 /* 46 46 $Log$ 47 Revision 1.2 1998/11/25 07:55:44 rjmcnab 48 49 Modified mg to that you can specify the stemmer you want 50 to use via a command line option. You specify it to 51 mg_passes during the build process. The number of the 52 stemmer that you used is stored within the inverted 53 dictionary header and the stemmed dictionary header so 54 the correct stemmer is used in later stages of building 55 and querying. 56 47 57 Revision 1.1 1998/11/17 09:34:52 rjmcnab 48 58 *** empty log message *** … … 113 123 114 124 115 void 116 main (int argc, char **argv) 125 int main (int argc, char **argv) 117 126 { 118 127 ProgTime StartTime; … … 226 235 227 236 Message ("%s", ElapsedTime (&StartTime, NULL)); 228 exit (0);237 return 0; 229 238 } 230 239 -
trunk/gsdl/packages/mg-1.3d/src/text/mg_fast_comp_dict.c
r13 r34 25 25 /* 26 26 $Log$ 27 Revision 1.2 1998/11/25 07:55:44 rjmcnab 28 29 Modified mg to that you can specify the stemmer you want 30 to use via a command line option. You specify it to 31 mg_passes during the build process. The number of the 32 stemmer that you used is stored within the inverted 33 dictionary header and the stemmed dictionary header so 34 the correct stemmer is used in later stages of building 35 and querying. 36 27 37 Revision 1.1 1998/11/17 09:34:57 rjmcnab 28 38 *** empty log message *** … … 86 96 static u_long mem, fixup_mem; 87 97 88 void 89 main (int argc, char **argv) 98 int main (int argc, char **argv) 90 99 { 91 100 ProgTime StartTime; … … 188 197 189 198 Message ("%s", ElapsedTime (&StartTime, NULL)); 190 exit (0);199 return 0; 191 200 } 192 201 -
trunk/gsdl/packages/mg-1.3d/src/text/mg_hilite_words.1
r24 r34 15 15 [ 16 16 .B --stem_method [0-3] 17 ] 18 .if n .ti +9n 19 [ 20 .B --stemmer [english|lovin|french|simplefrench] 17 21 ] 18 22 .if n .ti +9n … … 47 51 This should match the way the inverted file index was created. 48 52 The default is 3, which is fold and stem. 53 .TP 54 .B --stemmer [english|lovin|french|simplefrench] 55 This option allows you to select the stemmer to use. The 56 default is the English stemmer. 49 57 .TP 50 58 .B --style [bold|underline|italic|emphasis|strong] -
trunk/gsdl/packages/mg-1.3d/src/text/mg_hilite_words.c
r29 r34 104 104 static short hilite_style = BOLD; 105 105 static char *pager = "less"; 106 static int stemmer_num = 0; /* Lovin's stemmer */ 106 107 static int stem_method = 3; /* fold & stem */ 107 108 static char **word_list; … … 458 459 PARSE_STEM_WORD (word, s_in, end); 459 460 460 stemmer (stem_method, word);461 stemmer (stem_method, stemmer_num, word); 461 462 462 463 if (set_member (word)) /* output with highlighting */ … … 567 568 {"pager", required_argument, 0, 'p'}, 568 569 {"stem_method", required_argument, 0, 'm'}, 570 {"stemmer", required_argument, 0, 'a'}, 569 571 {0, 0, 0, 0} 570 572 }; … … 577 579 578 580 opterr = 0; 579 while ((ch = getopt_long (argc, argv, "s:p:t:m: ", long_opts, (int *) 0)) != -1)581 while ((ch = getopt_long (argc, argv, "s:p:t:m:a:", long_opts, (int *) 0)) != -1) 580 582 { 581 583 switch (ch) … … 591 593 hilite_style = i; 592 594 } 595 break; 596 case 'a': 597 stemmer_num = stemmernumber (optarg); 593 598 break; 594 599 case 't': … … 604 609 FatalError (1, "Usage: \n" 605 610 "mg_hilite_words --stem_method [0-3]\n" 611 " --stemmer [english|lovin|french|simplefrench]\n" 606 612 " --style [bold|underline|italic|emphasis|strong]\n" 607 613 " --pager [less|more|html|???]\n"); -
trunk/gsdl/packages/mg-1.3d/src/text/mg_invf_dict.c
r13 r34 36 36 /* 37 37 $Log$ 38 Revision 1.2 1998/11/25 07:55:45 rjmcnab 39 40 Modified mg to that you can specify the stemmer you want 41 to use via a command line option. You specify it to 42 mg_passes during the build process. The number of the 43 stemmer that you used is stored within the inverted 44 dictionary header and the stemmed dictionary header so 45 the correct stemmer is used in later stages of building 46 and querying. 47 38 48 Revision 1.1 1998/11/17 09:35:03 rjmcnab 39 49 *** empty log message *** … … 59 69 static void process_files (char *filename); 60 70 61 void 62 main (int argc, char **argv) 71 int main (int argc, char **argv) 63 72 { 64 73 char *file_name = ""; … … 90 99 91 100 process_files (file_name); 92 exit (0);101 return 0; 93 102 } 94 103 … … 135 144 NTOHUL(idh.static_num_of_docs); 136 145 NTOHUL(idh.num_of_words); 146 NTOHUL(idh.stemmer_num); 137 147 NTOHUL(idh.stem_method); 138 148 … … 145 155 sdh.static_num_of_docs = idh.static_num_of_docs; 146 156 sdh.num_of_words = idh.num_of_words; 157 sdh.stemmer_num = idh.stemmer_num; 147 158 sdh.stem_method = idh.stem_method; 148 159 sdh.indexed = 0; /* [RPAP - Jan 97: Stem Index Change] */ … … 317 328 HTONUL(sdh.static_num_of_docs); 318 329 HTONUL(sdh.num_of_words); 330 HTONUL(sdh.stemmer_num); 319 331 HTONUL(sdh.stem_method); 320 332 HTONUL(sdh.indexed); -
trunk/gsdl/packages/mg-1.3d/src/text/mg_invf_dump.c
r13 r34 40 40 /* 41 41 $Log$ 42 Revision 1.2 1998/11/25 07:55:46 rjmcnab 43 44 Modified mg to that you can specify the stemmer you want 45 to use via a command line option. You specify it to 46 mg_passes during the build process. The number of the 47 stemmer that you used is stored within the inverted 48 dictionary header and the stemmed dictionary header so 49 the correct stemmer is used in later stages of building 50 and querying. 51 42 52 Revision 1.1 1998/11/17 09:35:05 rjmcnab 43 53 *** empty log message *** … … 62 72 63 73 64 void 65 main (int argc, char **argv) 74 int main (int argc, char **argv) 66 75 { 67 76 ProgTime start; … … 99 108 process_files (file_name); 100 109 Message ("%s\n", ElapsedTime (&start, NULL)); 101 exit (0);110 return 0; 102 111 } 103 112 … … 125 134 NTOHUL(idh.static_num_of_docs); 126 135 NTOHUL(idh.num_of_words); 136 NTOHUL(idh.stemmer_num); 127 137 NTOHUL(idh.stem_method); 128 138 -
trunk/gsdl/packages/mg-1.3d/src/text/mg_invf_merge.c
r13 r34 132 132 NTOHUL(m[OLD].idh.static_num_of_docs); 133 133 NTOHUL(m[OLD].idh.num_of_words); 134 NTOHUL(m[OLD].idh.stemmer_num); 134 135 NTOHUL(m[OLD].idh.stem_method); 135 136 … … 147 148 NTOHUL(m[NEW].idh.static_num_of_docs); 148 149 NTOHUL(m[NEW].idh.num_of_words); 150 NTOHUL(m[NEW].idh.stemmer_num); 149 151 NTOHUL(m[NEW].idh.stem_method); 150 152 … … 668 670 m[MERGE].idh.num_of_docs = m[MERGE].nDocs; 669 671 m[MERGE].idh.num_of_words = m[OLD].idh.num_of_words + m[NEW].idh.num_of_words; 672 m[MERGE].idh.stemmer_num = m[OLD].idh.stemmer_num; 670 673 m[MERGE].idh.stem_method = m[OLD].idh.stem_method; 671 674 … … 685 688 HTONUL(m[MERGE].idh.static_num_of_docs); 686 689 HTONUL(m[MERGE].idh.num_of_words); 690 HTONUL(m[MERGE].idh.stemmer_num); 687 691 HTONUL(m[MERGE].idh.stem_method); 688 692 -
trunk/gsdl/packages/mg-1.3d/src/text/mg_invf_rebuild.c
r13 r34 49 49 /* 50 50 $Log$ 51 Revision 1.2 1998/11/25 07:55:46 rjmcnab 52 53 Modified mg to that you can specify the stemmer you want 54 to use via a command line option. You specify it to 55 mg_passes during the build process. The number of the 56 stemmer that you used is stored within the inverted 57 dictionary header and the stemmed dictionary header so 58 the correct stemmer is used in later stages of building 59 and querying. 60 51 61 Revision 1.1 1998/11/17 09:35:10 rjmcnab 52 62 *** empty log message *** … … 96 106 97 107 98 void 99 main (int argc, char **argv) 108 int main (int argc, char **argv) 100 109 { 101 110 ProgTime start; … … 174 183 Message ("**** rebuild them with mg_stem_idx. ****\n"); 175 184 176 exit (0);185 return 0; 177 186 } 178 187 … … 268 277 NTOHUL(idh.static_num_of_docs); 269 278 NTOHUL(idh.num_of_words); 279 NTOHUL(idh.stemmer_num); 270 280 NTOHUL(idh.stem_method); 271 281 -
trunk/gsdl/packages/mg-1.3d/src/text/mg_passes.1
r24 r34 50 50 [ 51 51 .BI \-c " numchunks" 52 ] 53 [ 54 .BI \-a " stemmer" 52 55 ] 53 56 [ … … 202 205 .I stemmethod 203 206 does both case folding and stemming. 207 .TP 208 .BI \-a " stemmer" 209 This specifies the stemmer to use when stemming words. This 210 is a description of the language the stemmer is intended for 211 or a description of the stemmer. Valid options include: 212 english, lovin, french, and simplefrench. 204 213 .TP 205 214 .BI \-b " bufsize" -
trunk/gsdl/packages/mg-1.3d/src/text/mg_passes.c
r13 r34 41 41 /* 42 42 $Log$ 43 Revision 1.2 1998/11/25 07:55:47 rjmcnab 44 45 Modified mg to that you can specify the stemmer you want 46 to use via a command line option. You specify it to 47 mg_passes during the build process. The number of the 48 stemmer that you used is stored within the inverted 49 dictionary header and the stemmed dictionary header so 50 the correct stemmer is used in later stages of building 51 and querying. 52 43 53 Revision 1.1 1998/11/17 09:35:13 rjmcnab 44 54 *** empty log message *** … … 76 86 double bytes_processed = 0; /* [RJM 07/97: 4G limit] */ 77 87 double bytes_received = 0; /* [RJM 07/97: 4G limit] */ 88 int stemmer_num = 0; /* default to the lovin stemmer */ 78 89 int stem_method = 0; 79 90 … … 123 134 " %*s [-N2] [-W] [-S] [-b buffer-size] [-d dictionary-directory]\n" 124 135 " %*s [-t trace-point Mb] [-m invf-memory] [-c chunk-limit]\n" 125 " %*s [-n trace-name] [-C comp-stat-size] [-s stem_method] -f doc-collection-name\n"; 136 " %*s [-n trace-name] [-C comp-stat-size] [-s stem_method]\n" 137 " %*s [-a stemmer] -f doc-collection-name\n"; 126 138 127 139 … … 452 464 453 465 454 void 455 main (int argc, char **argv) 466 int main (int argc, char **argv) 456 467 { 457 468 int ch, in_fd; … … 462 473 463 474 opterr = 0; 464 while ((ch = getopt (argc, argv, "hC:WGSD123f:d:b:T:I:t:m:N:c:n:s: ")) != -1)475 while ((ch = getopt (argc, argv, "hC:WGSD123f:d:b:T:I:t:m:N:c:n:s:a:")) != -1) 465 476 { 466 477 switch (ch) … … 495 506 case 'd': 496 507 set_basepath (optarg); 508 break; 509 case 'a': 510 stemmer_num = stemmernumber (optarg); 497 511 break; 498 512 case 's': … … 601 615 fclose (Comp_Stats); 602 616 603 exit (0);617 return 0; 604 618 } -
trunk/gsdl/packages/mg-1.3d/src/text/mg_perf_hash_build.c
r13 r34 38 38 /* 39 39 $Log$ 40 Revision 1.2 1998/11/25 07:55:47 rjmcnab 41 42 Modified mg to that you can specify the stemmer you want 43 to use via a command line option. You specify it to 44 mg_passes during the build process. The number of the 45 stemmer that you used is stored within the inverted 46 dictionary header and the stemmed dictionary header so 47 the correct stemmer is used in later stages of building 48 and querying. 49 40 50 Revision 1.1 1998/11/17 09:35:15 rjmcnab 41 51 *** empty log message *** … … 60 70 int r = -1; 61 71 62 void 63 main (int argc, char **argv) 72 int main (int argc, char **argv) 64 73 { 65 74 ProgTime start; … … 90 99 process_files (file_name); 91 100 Message ("%s\n", ElapsedTime (&start, NULL)); 92 exit (0);101 return 0; 93 102 } 94 103 … … 124 133 NTOHUL(idh.static_num_of_docs); 125 134 NTOHUL(idh.num_of_words); 135 NTOHUL(idh.stemmer_num); 126 136 NTOHUL(idh.stem_method); 127 137 -
trunk/gsdl/packages/mg-1.3d/src/text/mg_stem_idx.c
r13 r34 238 238 NTOHUL(sd->sdh.static_num_of_docs); 239 239 NTOHUL(sd->sdh.num_of_words); 240 NTOHUL(sd->sdh.stemmer_num); 240 241 NTOHUL(sd->sdh.stem_method); 241 242 NTOHUL(sd->sdh.indexed); … … 409 410 /* Stem word */ 410 411 bcopy ((char *) prev, (char *) word, *prev + 1); 411 stemmer (stem_method, word);412 stemmer (stem_method, sd->sdh.stemmer_num, word); 412 413 413 414 /* Check if word follows straight on from previous word */ … … 828 829 829 830 /* Main */ 830 voidmain (int argc, char **argv)831 int main (int argc, char **argv) 831 832 { 832 833 File *idb; /* File to .invf.dict.blocked */ … … 887 888 UpdateStemDict (filename, stem_method); 888 889 889 exit (0);890 } 891 890 return 0; 891 } 892 -
trunk/gsdl/packages/mg-1.3d/src/text/mg_text_estimate.c
r13 r34 49 49 /* 50 50 $Log$ 51 Revision 1.2 1998/11/25 07:55:48 rjmcnab 52 53 Modified mg to that you can specify the stemmer you want 54 to use via a command line option. You specify it to 55 mg_passes during the build process. The number of the 56 stemmer that you used is stored within the inverted 57 dictionary header and the stemmed dictionary header so 58 the correct stemmer is used in later stages of building 59 and querying. 60 51 61 Revision 1.1 1998/11/17 09:35:19 rjmcnab 52 62 *** empty log message *** … … 85 95 86 96 87 void 88 main (int argc, char **argv) 97 int main (int argc, char **argv) 89 98 { 90 99 char *stats_dict = NULL, *comp_dict = NULL; … … 170 179 escape_count * 100.0 / huff_count); 171 180 Message ("%s", ElapsedTime (&StartTime, NULL)); 181 return 0; 172 182 } 173 183 -
trunk/gsdl/packages/mg-1.3d/src/text/mg_text_merge.c
r13 r34 209 209 * main() 210 210 *=======================================================================*/ 211 void 212 main (int argc, char *argv[]) 211 int main (int argc, char *argv[]) 213 212 { 214 213 char *progname; … … 247 246 done_merge_text (); 248 247 Message ("%s\n", ElapsedTime (&start, NULL)); 249 exit (0);250 } 248 return 0; 249 } -
trunk/gsdl/packages/mg-1.3d/src/text/mg_weights_build.c
r13 r34 42 42 /* 43 43 $Log$ 44 Revision 1.2 1998/11/25 07:55:49 rjmcnab 45 46 Modified mg to that you can specify the stemmer you want 47 to use via a command line option. You specify it to 48 mg_passes during the build process. The number of the 49 stemmer that you used is stored within the inverted 50 dictionary header and the stemmed dictionary header so 51 the correct stemmer is used in later stages of building 52 and querying. 53 44 54 Revision 1.1 1998/11/17 09:35:22 rjmcnab 45 55 *** empty log message *** … … 72 82 73 83 74 void 75 main (int argc, char **argv) 84 int main (int argc, char **argv) 76 85 { 77 86 ProgTime StartTime; … … 116 125 117 126 Message ("%s", ElapsedTime (&StartTime, NULL)); 118 exit (0); 127 128 return 0; 119 129 } 120 130 … … 202 212 NTOHUL(idh.static_num_of_docs); 203 213 NTOHUL(idh.num_of_words); 214 NTOHUL(idh.stemmer_num); 204 215 NTOHUL(idh.stem_method); 205 216 -
trunk/gsdl/packages/mg-1.3d/src/text/mgdictlist.c
r13 r34 37 37 /* 38 38 $Log$ 39 Revision 1.2 1998/11/25 07:55:49 rjmcnab 40 41 Modified mg to that you can specify the stemmer you want 42 to use via a command line option. You specify it to 43 mg_passes during the build process. The number of the 44 stemmer that you used is stored within the inverted 45 dictionary header and the stemmed dictionary header so 46 the correct stemmer is used in later stages of building 47 and querying. 48 39 49 Revision 1.1 1998/11/17 09:35:24 rjmcnab 40 50 *** empty log message *** … … 82 92 NTOHUL(idh.static_num_of_docs); 83 93 NTOHUL(idh.num_of_words); 94 NTOHUL(idh.stemmer_num); 84 95 NTOHUL(idh.stem_method); 85 96 … … 345 356 346 357 347 void 348 main (int argc, char **argv) 358 int main (int argc, char **argv) 349 359 { 350 360 FILE *fp; … … 389 399 } 390 400 fclose (fp); 391 exit (0);392 } 401 return 0; 402 } -
trunk/gsdl/packages/mg-1.3d/src/text/mgquery.c
r13 r34 24 24 /* 25 25 $Log$ 26 Revision 1.2 1998/11/25 07:55:49 rjmcnab 27 28 Modified mg to that you can specify the stemmer you want 29 to use via a command line option. You specify it to 30 mg_passes during the build process. The number of the 31 stemmer that you used is stored within the inverted 32 dictionary header and the stemmed dictionary header so 33 the correct stemmer is used in later stages of building 34 and querying. 35 26 36 Revision 1.1 1998/11/17 09:35:29 rjmcnab 27 37 *** empty log message *** … … 968 978 return; 969 979 } 970 sprintf (pager, "%s --style=%s --pager=%s --stem_method=%ld %s",980 sprintf (pager, "%s --style=%s --pager=%s --stem_method=%ld --stemmer=%ld %s", 971 981 HILITE_PAGER, 972 982 GetEnv ("hilite_style"), 973 983 GetEnv ("pager"), 974 984 qd->sd->sdh.stem_method, 985 qd->sd->sdh.stemmer_num, 975 986 terms_str); 976 987 … … 1369 1380 /* Initialises global variables based on command line switches, and opens */ 1370 1381 /* files. Then calls query () to perform the querying. */ 1371 void 1372 main (int argc, char **argv) 1382 int main (int argc, char **argv) 1373 1383 { 1374 1384 ProgTime StartTime; … … 1476 1486 1477 1487 UninitEnv (); 1478 exit (0);1479 } 1488 return 0; 1489 } -
trunk/gsdl/packages/mg-1.3d/src/text/mgstat.c
r13 r34 36 36 /* 37 37 $Log$ 38 Revision 1.2 1998/11/25 07:55:50 rjmcnab 39 40 Modified mg to that you can specify the stemmer you want 41 to use via a command line option. You specify it to 42 mg_passes during the build process. The number of the 43 stemmer that you used is stored within the inverted 44 dictionary header and the stemmed dictionary header so 45 the correct stemmer is used in later stages of building 46 and querying. 47 38 48 Revision 1.1 1998/11/17 09:35:31 rjmcnab 39 49 *** empty log message *** … … 59 69 60 70 61 void 62 main (int argc, char **argv) 71 int main (int argc, char **argv) 63 72 { 64 73 unsigned long sub_total; … … 152 161 printf ("\n"); 153 162 process_file (NULL, "TOTAL", exact); 154 exit (0); 163 164 return 0; 155 165 } 156 166 -
trunk/gsdl/packages/mg-1.3d/src/text/query.ranked.c
r13 r34 49 49 /* 50 50 $Log$ 51 Revision 1.2 1998/11/25 07:55:50 rjmcnab 52 53 Modified mg so that you can specify the stemmer you want 54 to use via a command line option. You specify it to 55 mg_passes during the build process. The number of the 56 stemmer that you used is stored within the inverted 57 dictionary header and the stemmed dictionary header so 58 the correct stemmer is used in later stages of building 59 and querying. 60 51 61 Revision 1.1 1998/11/17 09:35:34 rjmcnab 52 62 *** empty log message *** … … 319 329 320 330 bcopy ((char *) Word, (char *) sWord, *Word + 1); 321 stemmer (stem_to_apply, s Word);331 stemmer (stem_to_apply, sd->sdh.stemmer_num, sWord); 322 332 323 333 if (!indexed || stem_to_apply == 0) -
trunk/gsdl/packages/mg-1.3d/src/text/stemmer.c
r13 r34 25 25 #include "stemmer.h" 26 26 27 #include "lovinstem.h" 28 #include "simplefrenchstem.h" 29 27 30 /* 28 31 $Log$ 32 Revision 1.2 1998/11/25 07:55:51 rjmcnab 33 34 Modified mg to that you can specify the stemmer you want 35 to use via a command line option. You specify it to 36 mg_passes during the build process. The number of the 37 stemmer that you used is stored within the inverted 38 dictionary header and the stemmed dictionary header so 39 the correct stemmer is used in later stages of building 40 and querying. 41 29 42 Revision 1.1 1998/11/17 09:35:42 rjmcnab 30 43 *** empty log message *** … … 41 54 static char *RCSID = "$Id$"; 42 55 56 57 #define LOVINSTEMMER 0 58 #define SIMPLEFRENCHSTEMMER 1 59 60 43 61 /* [RJM 03/98: Extended ascii] */ 44 62 u_char exttolower (u_char c) { … … 51 69 } 52 70 71 72 int stemmernumber (u_char *stemmerdescription) { 73 u_char descript[MAX_STEM_DESCRIPTION_LEN]; 74 int i; 75 76 /* copy and case-fold the description */ 77 for (i=0; (stemmerdescription[i] != '\0') && 78 (i < MAX_STEM_DESCRIPTION_LEN-1); i++) 79 descript[i] = exttolower (stemmerdescription[i]); 80 descript[i] = '\0'; 81 82 /* map the description to its number */ 83 84 if ((strcmp (descript, "0") == 0) || 85 (strcmp (descript, "english") == 0) || 86 (strcmp (descript, "lovin") == 0)) 87 return LOVINSTEMMER; 88 89 if ((strcmp (descript, "1") == 1) || 90 (strcmp (descript, "french") == 0) || 91 (strcmp (descript, "simplefrench") == 0)) 92 return SIMPLEFRENCHSTEMMER; 93 94 return -1; 95 } 96 97 98 53 99 /* 54 100 * Method 0 - Do not stem or case fold. … … 56 102 * Method 2 - Stem. 57 103 * Method 3 - Case fold and stem. 104 * 105 * The stemmer number should be obtained using 106 * the stemmernumber function above. 
58 107 */ 59 108 void 60 stemmer (int method, u_char * word)109 stemmer (int method, int stemmer, u_char * word) 61 110 { 62 111 if (method & 1) … … 68 117 *w = exttolower (*w); 69 118 } 119 70 120 if (method & 2) { 71 #ifdef FRENCHSTEMMER 72 frenchstem (word);73 #else 74 stem (word);75 #endif 76 121 switch (stemmer) { 122 case LOVINSTEMMER: lovinstem (word); 123 break; 124 case SIMPLEFRENCHSTEMMER: simplefrenchstem (word); 125 break; 126 } 77 127 } 78 128 } -
trunk/gsdl/packages/mg-1.3d/src/text/stemmer.h
r29 r34 27 27 #include "sysfuncs.h" 28 28 29 #ifdef FRENCHSTEMMER 30 #include "frenchstem.h" 31 #else 32 #include "stem.h" 33 #endif 29 #define STEMMER_MASK 3 30 #define MAX_STEM_DESCRIPTION_LEN 16 34 31 35 #define STEMMER_MASK 3 32 /* stemmernumber will return the stemmer for 33 * a description of the stemmer. Stemmer descriptions 34 * are not case sensitive. Valid descriptions are: 35 * 36 * 'English' 37 * 'Lovin' 38 * 'French' 39 * 'SimpleFrench' 40 * 41 * More than one description might result in the same 42 * stemmer number (for example, for stemming 'English' 43 * we currently use the 'Lovin' stemmer). 44 * 45 * stemmerdescription is a normal C, null-terminated, 46 * string. 47 * 48 * stemmernumber will return -1 if it doesn't know the 49 * stemmer description. 50 */ 51 int stemmernumber (u_char *stemmerdescription); 36 52 37 53 /* … … 40 56 * Method 2 - Stem. 41 57 * Method 3 - Case fold and stem. 58 * 59 * The stemmer number should be obtained using function 60 * stemmernumber above. 42 61 */ 43 void stemmer (int method, u_char * word);62 void stemmer (int method, int stemmer, u_char * word); 44 63 45 64 #endif -
trunk/gsdl/packages/mg-1.3d/src/text/term_lists.c
r13 r34 24 24 /* 25 25 $Log$ 26 Revision 1.2 1998/11/25 07:55:52 rjmcnab 27 28 Modified mg to that you can specify the stemmer you want 29 to use via a command line option. You specify it to 30 mg_passes during the build process. The number of the 31 stemmer that you used is stored within the inverted 32 dictionary header and the stemmed dictionary header so 33 the correct stemmer is used in later stages of building 34 and querying. 35 26 36 Revision 1.1 1998/11/17 09:35:43 rjmcnab 27 37 *** empty log message *** … … 184 194 int 185 195 AddTerm (TermList ** query_term_list, u_char * Word, int Count, int word_num, 186 u_long count, u_long doc_count, u_long invf_ptr, u_long invf_len ) /* [RPAP - Feb 97: Term Frequency] */196 u_long count, u_long doc_count, u_long invf_ptr, u_long invf_len, int stemmer_num) /* [RPAP - Feb 97: Term Frequency] */ 187 197 { 188 198 int j; … … 221 231 if (!te.Stem) 222 232 FatalError (1, "Could NOT create memory to add term"); 223 stemmer (2, te.Stem);233 stemmer (2, stemmer_num, te.Stem); 224 234 225 235 te.require_match = 0; /* [RJM - 07/97: Ranked Required Terms] */ -
trunk/gsdl/packages/mg-1.3d/src/text/term_lists.h
r13 r34 24 24 /* 25 25 $Log$ 26 Revision 1.2 1998/11/25 07:55:52 rjmcnab 27 28 Modified mg so that you can specify the stemmer you want 29 to use via a command line option. You specify it to 30 mg_passes during the build process. The number of the 31 stemmer that you used is stored within the inverted 32 dictionary header and the stemmed dictionary header so 33 the correct stemmer is used in later stages of building 34 and querying. 35 26 36 Revision 1.1 1998/11/17 09:35:44 rjmcnab 27 37 *** empty log message *** … … 77 87 int AddTermEntry (TermList ** query_term_list, TermEntry * te); 78 88 int AddTerm (TermList ** query_term_list, u_char * Word, int Count, int word_num, 79 u_long count, u_long doc_count, u_long invf_ptr, u_long invf_len); /* [RPAP - Feb 97: Term Frequency] */ 89 u_long count, u_long doc_count, u_long invf_ptr, u_long invf_len, /* [RPAP - Feb 97: Term Frequency] */ 90 int stemmer_num); 80 91 void ResetTermList (TermList ** tl); 81 92 void FreeTermList (TermList ** the_tl);
Note:
See TracChangeset
for help on using the changeset viewer.