Changeset 34


Ignore:
Timestamp:
1998-11-25T20:55:52+13:00 (26 years ago)
Author:
rjmcnab
Message:

Modified mg so that you can specify the stemmer you want
to use via a command line option. You specify it to
mg_passes during the build process. The number of the
stemmer that you used is stored within the inverted
dictionary header and the stemmed dictionary header so
the correct stemmer is used in later stages of building
and querying.

Location:
trunk/gsdl/packages/mg-1.3d
Files:
4 added
4 deleted
35 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/packages/mg-1.3d/lib/Makefile.in

    r29 r34  
    7676 
    7777HEADERS = \
    78     bitio_m_random.h   filestats.h        local_strings.h    stem.h \
     78    bitio_m_random.h   filestats.h        local_strings.h    lovinstem.h \
    7979    bitio_m_stdio.h    getopt.h           memlib.h \
    8080    bitio_gen.h        bitio_mem.h        heap.h             messages.h \
     
    8383    bitio_m_mems.h     bitio_stdio.h      huffman_stdio.h    sptree.h \
    8484    regex.h            rx.h               pathmax.h          getpagesize.h \
    85     random.h           frenchstem.h
     85    random.h           simplefrenchstem.h
    8686
    8787SOURCES = \
     
    9090    bitio_gen.c       filestats.c       huffman_mem.c     perf_hash.c \
    9191    bitio_mem.c       getopt.c          huffman_stdio.c   sptree.c \
    92     bitio_mems.c      getopt1.c         local_strings.c   stem.c \
     92    bitio_mems.c      getopt1.c         local_strings.c   lovinstem.c \
    9393    timing.c          regex.c           rx.c              \
    9494    alloca.c          error.c           xmalloc.c         strstr.c \
    9595    gmalloc.c         ftruncate.c       strcasecmp.c      random.c \
    96     frenchstem.c
     96    simplefrenchstem.c
    9797
    9898OBJECTS = @ALLOCA@ @LIBOBJS@ \
     
    101101    bitio_gen$o       filestats$o       huffman_mem$o     perf_hash$o \
    102102    bitio_mem$o       getopt$o          huffman_stdio$o   sptree$o \
    103     bitio_mems$o      getopt1$o         local_strings$o   stem$o \
     103    bitio_mems$o      getopt1$o         local_strings$o   lovinstem$o \
    104104    timing$o          regex$o       rx$o \
    105105    error$o           xmalloc$o \
    106     gmalloc$o         random$o          frenchstem$o
     106    gmalloc$o         random$o          simplefrenchstem$o
    107107 
    108108DISTFILES = Makefile.in $(HEADERS) $(SOURCES)
  • trunk/gsdl/packages/mg-1.3d/lib/WIN32.MAK

    r29 r34  
    4747 
    4848HEADERS = \
    49     bitio_m_random.h   filestats.h        local_strings.h    stem.h \
     49    bitio_m_random.h   filestats.h        local_strings.h    lovinstem.h \
    5050    bitio_m_stdio.h    getopt.h           memlib.h \
    5151    bitio_gen.h        bitio_mem.h        heap.h             messages.h \
     
    5454    bitio_m_mems.h     bitio_stdio.h      huffman_stdio.h    sptree.h \
    5555                       rx.h               pathmax.h          getpagesize.h \
    56     random.h           win32in.h          frenchstem.h
     56    random.h           win32in.h          simplefrenchstem.h
    5757
    5858SOURCES = \
     
    6161    bitio_gen.c       filestats.c       huffman_mem.c     perf_hash.c \
    6262    bitio_mem.c       getopt.c          huffman_stdio.c   sptree.c \
    63     bitio_mems.c      getopt1.c         local_strings.c   stem.c \
     63    bitio_mems.c      getopt1.c         local_strings.c   lovinstem.c \
    6464    timing.c                            rx.c              \
    6565    alloca.c          error.c           xmalloc.c         strstr.c \
    6666    ftruncate.c       strcasecmp.c      random.c          win32in.c \
    67     frenchstem.c
     67    simplefrenchstem.c
    6868
    6969OBJECTS =   rx$o \
     
    7272    bitio_gen$o       filestats$o       huffman_mem$o     perf_hash$o \
    7373    bitio_mem$o       getopt$o          huffman_stdio$o   sptree$o \
    74     bitio_mems$o      getopt1$o         local_strings$o   stem$o \
     74    bitio_mems$o      getopt1$o         local_strings$o   lovinstem$o \
    7575    timing$o           \
    7676    error$o           xmalloc$o \
    77     random$o          win32in$o         frenchstem$o
     77    random$o          win32in$o         simplefrenchstem$o
    7878 
    7979DISTFILES = Makefile.in $(HEADERS) $(SOURCES)
  • trunk/gsdl/packages/mg-1.3d/src/text/bool_parser.c

    r26 r34  
    3636static bool_tree_node *tree_base = NULL;
    3737static TermList **term_list;
     38static int stemmer_num;
    3839static int stem_method;
    3940/* [RPAP - Jan 97: Stem Index Change] */
     
    258259
    259260      bcopy ((char *) word, (char *) sWord, *word + 1);
    260       stemmer (stem_to_apply, sWord);
     261      stemmer (stem_to_apply, stemmer_num, sWord);
    261262
    262263      if (stem_to_apply == 0 || !indexed || p__sd == NULL)
     
    380381bool_tree_node *
    381382ParseBool(char *query_line, int query_len,
    382           TermList **the_term_list, int the_stem_method, int *res,
     383          TermList **the_term_list, int the_stemmer_num, int the_stem_method,
     384      int *res,
    383385      stemmed_dict * the_sd, int is_indexed,   /* [RPAP - Jan 97: Stem Index Change] */
    384386      QueryTermList **the_query_term_list)  /* [RPAP - Feb 97: Term Frequency] */
     
    386388  /* global variables to be accessed by bison/yacc created parser */
    387389  term_list = the_term_list;
     390  stemmer_num = the_stemmer_num;
    388391  stem_method = the_stem_method;
    389392  ch_buf = query_line;
     
    551554case 2:
    552555#line 79 "bool_parser.y"
    553 { yyval.node = CreateBoolTermNode(term_list, yyvsp[0].text, 1, word_num, count, doc_count, invf_ptr, invf_len); }
     556{ yyval.node = CreateBoolTermNode(term_list, yyvsp[0].text, 1, word_num, count, doc_count, invf_ptr, invf_len, stemmer_num); }
    554557break;
    555558case 3:
  • trunk/gsdl/packages/mg-1.3d/src/text/bool_parser.h

    r13 r34  
    2424/*
    2525   $Log$
     26   Revision 1.2  1998/11/25 07:55:40  rjmcnab
     27
     28   Modified mg to that you can specify the stemmer you want
     29   to use via a command line option. You specify it to
     30   mg_passes during the build process. The number of the
     31   stemmer that you used is stored within the inverted
     32   dictionary header and the stemmed dictionary header so
     33   the correct stemmer is used in later stages of building
     34   and querying.
     35
    2636   Revision 1.1  1998/11/17 09:34:26  rjmcnab
    2737   *** empty log message ***
     
    4757
    4858bool_tree_node *ParseBool (char *query_line, int query_len,
    49                TermList ** the_term_list, int the_stem_method, int *res,
     59               TermList ** the_term_list, int the_stemmer_num,
     60               int the_stem_method, int *res,
    5061               stemmed_dict * the_sd, int is_indexed,  /* [RPAP - Jan 97: Stem Index Change] */
    5162               QueryTermList ** the_query_term_list);  /* [RPAP - Feb 97: Term Frequency] */
  • trunk/gsdl/packages/mg-1.3d/src/text/bool_query.c

    r13 r34  
    313313  int res = 0;
    314314
    315   tree = ParseBool (Query, MAXLINEBUFFERLEN, &(qd->TL), stem_method, &res,
     315  tree = ParseBool (Query, MAXLINEBUFFERLEN, &(qd->TL),
     316            qd->sd->sdh.stemmer_num, stem_method, &res,
    316317            qd->sd, qd->sd->sdh.indexed,         /* [RPAP - Jan 97: Stem Index Change] */
    317318            &(qd->QTL));                         /* [RPAP - Feb 97: Term Frequency] */
  • trunk/gsdl/packages/mg-1.3d/src/text/bool_tester.c

    r13 r34  
    2424/*
    2525   $Log$
     26   Revision 1.2  1998/11/25 07:55:41  rjmcnab
     27
     28   Modified mg to that you can specify the stemmer you want
     29   to use via a command line option. You specify it to
     30   mg_passes during the build process. The number of the
     31   stemmer that you used is stored within the inverted
     32   dictionary header and the stemmed dictionary header so
     33   the correct stemmer is used in later stages of building
     34   and querying.
     35
    2636   Revision 1.1  1998/11/17 09:34:29  rjmcnab
    2737   *** empty log message ***
     
    5767
    5868#define MAX_LINE_LEN  255
     69#define STEMMER_NUM 0 /* Lovin's stemmer */
    5970#define STEM_METHOD 3
    6071
     
    7485 * ========================================================================= */
    7586
    76 void
    77 main (int argc, char *argv[])
     87int main (int argc, char *argv[])
    7888{
    7989  bool_tree_node *tree = NULL;
     
    93103      len = strlen (line) - 1;  /* -1 => ignore the \n */
    94104
    95       tree = ParseBool (line, len, &term_list, STEM_METHOD, &res,
     105      tree = ParseBool (line, len, &term_list, STEMMER_NUM, STEM_METHOD, &res,
    96106            NULL, 0,             /* [RPAP - Jan 97: Stem Index Change] */
    97107            &query_term_list);   /* [RPAP - Feb 97: Term Frequency] */
     
    141151
    142152    }
    143 
     153  return 0;
    144154}
    145155
  • trunk/gsdl/packages/mg-1.3d/src/text/bool_tree.c

    r13 r34  
    5656bool_tree_node *
    5757CreateBoolTermNode (TermList ** tl, char *text, int Count, int word_num,
    58             u_long count, u_long doc_count, u_long invf_ptr, u_long invf_len)  /* [RPAP - Feb 97: Term Frequency] */
     58            u_long count, u_long doc_count, u_long invf_ptr, u_long invf_len,  /* [RPAP - Feb 97: Term Frequency] */
     59            int stemmer_num)
    5960{
    6061  bool_tree_node *n = NULL;
     
    6364  n = CreateBoolNode (N_term);
    6465
    65   BOOL_TERM (n) = AddTerm (tl, (u_char *) text, Count, word_num, count, doc_count, invf_ptr, invf_len);  /* [RPAP - Feb 97: Term Frequency] */
     66  BOOL_TERM (n) = AddTerm (tl, (u_char *) text, Count, word_num, count, doc_count,
     67               invf_ptr, invf_len, stemmer_num);  /* [RPAP - Feb 97: Term Frequency] */
    6668
    6769  return n;
  • trunk/gsdl/packages/mg-1.3d/src/text/bool_tree.h

    r13 r34  
    2424/*
    2525   $Log$
     26   Revision 1.2  1998/11/25 07:55:42  rjmcnab
     27
     28   Modified mg to that you can specify the stemmer you want
     29   to use via a command line option. You specify it to
     30   mg_passes during the build process. The number of the
     31   stemmer that you used is stored within the inverted
     32   dictionary header and the stemmed dictionary header so
     33   the correct stemmer is used in later stages of building
     34   and querying.
     35
    2636   Revision 1.1  1998/11/17 09:34:31  rjmcnab
    2737   *** empty log message ***
     
    8696bool_tree_node *CopyBoolTree (bool_tree_node * tree);
    8797bool_tree_node *CreateBoolTermNode (TermList ** tl, char *text, int Count, int word_num,
    88                     u_long count, u_long doc_count, u_long invf_ptr, u_long invf_len);
     98                    u_long count, u_long doc_count, u_long invf_ptr,
     99                    u_long invf_len, int stemmer_num);
    89100bool_tree_node *CreateBoolTreeNode (N_Tag tag,
    90101                 bool_tree_node * left, bool_tree_node * right);
  • trunk/gsdl/packages/mg-1.3d/src/text/build.h

    r13 r34  
    156156 */
    157157
     158
     159extern int stemmer_num;
     160/*
     161 * The stemmer to use for stemming words for the inverted file.
     162 * see stemmer.h
     163 */
     164
    158165extern int stem_method;
    159166/*
  • trunk/gsdl/packages/mg-1.3d/src/text/invf.h

    r13 r34  
    3838    unsigned long static_num_of_docs;
    3939    unsigned long num_of_words;
     40    unsigned long stemmer_num;
    4041    unsigned long stem_method;
    4142  };
     
    5152    unsigned long static_num_of_docs;
    5253    unsigned long num_of_words;
     54    unsigned long stemmer_num;
    5355    unsigned long stem_method;
    5456    unsigned long indexed;       /* [RPAP - Jan 97: Stem Index Change] */
  • trunk/gsdl/packages/mg-1.3d/src/text/ivf.pass1.c

    r13 r34  
    4444/*
    4545   $Log$
     46   Revision 1.2  1998/11/25 07:55:43  rjmcnab
     47
     48   Modified mg to that you can specify the stemmer you want
     49   to use via a command line option. You specify it to
     50   mg_passes during the build process. The number of the
     51   stemmer that you used is stored within the inverted
     52   dictionary header and the stemmed dictionary header so
     53   the correct stemmer is used in later stages of building
     54   and querying.
     55
    4656   Revision 1.1  1998/11/17 09:34:44  rjmcnab
    4757   *** empty log message ***
     
    292302      PARSE_STEM_WORD (Word, s_in, end);
    293303     
    294       stemmer (stem_method, Word);
     304      stemmer (stem_method, stemmer_num, Word);
    295305      if (SkipSGML)
    296306    PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
     
    620630  HTONUL2(callnum, idh.static_num_of_docs);
    621631  HTONUL2(words_read, idh.num_of_words);
     632  HTONUL2(stemmer_num, idh.stemmer_num);
    622633  HTONUL2(stem_method, idh.stem_method);
    623634
  • trunk/gsdl/packages/mg-1.3d/src/text/ivf.pass2.c

    r13 r34  
    2424/*
    2525   $Log$
     26   Revision 1.2  1998/11/25 07:55:43  rjmcnab
     27
     28   Modified mg to that you can specify the stemmer you want
     29   to use via a command line option. You specify it to
     30   mg_passes during the build process. The number of the
     31   stemmer that you used is stored within the inverted
     32   dictionary header and the stemmed dictionary header so
     33   the correct stemmer is used in later stages of building
     34   and querying.
     35
    2636   Revision 1.1  1998/11/17 09:34:45  rjmcnab
    2737   *** empty log message ***
     
    5868#include "words.h"
    5969#include "hash.h"
    60 #include "stemmer.h"
    6170
    6271/* [RPAP - Feb 97: WIN32 Port] */
     
    347356  NTOHUL(idh.static_num_of_docs);
    348357  NTOHUL(idh.num_of_words);
     358  NTOHUL(idh.stemmer_num);
    349359  NTOHUL(idh.stem_method);
    350360
     
    779789
    780790      PARSE_STEM_WORD (Word, s_in, end);
    781       stemmer (idh.stem_method, Word);
     791      stemmer (idh.stem_method, idh.stemmer_num, Word);
    782792      if (SkipSGML)
    783793    PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
  • trunk/gsdl/packages/mg-1.3d/src/text/mg_compression_dict.c

    r13 r34  
    4545/*
    4646   $Log$
     47   Revision 1.2  1998/11/25 07:55:44  rjmcnab
     48
     49   Modified mg to that you can specify the stemmer you want
     50   to use via a command line option. You specify it to
     51   mg_passes during the build process. The number of the
     52   stemmer that you used is stored within the inverted
     53   dictionary header and the stemmed dictionary header so
     54   the correct stemmer is used in later stages of building
     55   and querying.
     56
    4757   Revision 1.1  1998/11/17 09:34:52  rjmcnab
    4858   *** empty log message ***
     
    113123
    114124
    115 void
    116 main (int argc, char **argv)
     125int main (int argc, char **argv)
    117126{
    118127  ProgTime StartTime;
     
    226235
    227236  Message ("%s", ElapsedTime (&StartTime, NULL));
    228   exit (0);
     237  return 0;
    229238}
    230239
  • trunk/gsdl/packages/mg-1.3d/src/text/mg_fast_comp_dict.c

    r13 r34  
    2525/*
    2626   $Log$
     27   Revision 1.2  1998/11/25 07:55:44  rjmcnab
     28
     29   Modified mg to that you can specify the stemmer you want
     30   to use via a command line option. You specify it to
     31   mg_passes during the build process. The number of the
     32   stemmer that you used is stored within the inverted
     33   dictionary header and the stemmed dictionary header so
     34   the correct stemmer is used in later stages of building
     35   and querying.
     36
    2737   Revision 1.1  1998/11/17 09:34:57  rjmcnab
    2838   *** empty log message ***
     
    8696static u_long mem, fixup_mem;
    8797
    88 void
    89 main (int argc, char **argv)
     98int main (int argc, char **argv)
    9099{
    91100  ProgTime StartTime;
     
    188197
    189198  Message ("%s", ElapsedTime (&StartTime, NULL));
    190   exit (0);
     199  return 0;
    191200}
    192201
  • trunk/gsdl/packages/mg-1.3d/src/text/mg_hilite_words.1

    r24 r34  
    1515[
    1616.B --stem_method [0-3]
     17]
     18.if n .ti +9n
     19[
     20.B --stemmer [english|lovin|french|simplefrench]
    1721]
    1822.if n .ti +9n
     
    4751This should match the way the inverted file index was created.
    4852The default is 3, which is fold and stem.
     53.TP
     54.B --stemmer [english|lovin|french|simplefrench]
     55This option allows you to select the stemmer to use.  The
     56default is the English stemmer.
    4957.TP
    5058.B --style [bold|underline|italic|emphasis|strong]
  • trunk/gsdl/packages/mg-1.3d/src/text/mg_hilite_words.c

    r29 r34  
    104104static short hilite_style = BOLD;
    105105static char *pager = "less";
     106static int stemmer_num = 0; /* Lovin's stemmer */
    106107static int stem_method = 3; /* fold & stem */
    107108static char **word_list;
     
    458459      PARSE_STEM_WORD (word, s_in, end);
    459460
    460       stemmer (stem_method, word);
     461      stemmer (stem_method, stemmer_num, word);
    461462
    462463      if (set_member (word))    /* output with highlighting */
     
    567568  {"pager", required_argument, 0, 'p'},
    568569  {"stem_method", required_argument, 0, 'm'},
     570  {"stemmer", required_argument, 0, 'a'},
    569571  {0, 0, 0, 0}
    570572};
     
    577579
    578580  opterr = 0;
    579   while ((ch = getopt_long (argc, argv, "s:p:t:m:", long_opts, (int *) 0)) != -1)
     581  while ((ch = getopt_long (argc, argv, "s:p:t:m:a:", long_opts, (int *) 0)) != -1)
    580582    {
    581583      switch (ch)
     
    591593          hilite_style = i;
    592594      }
     595      break;
     596    case 'a':
     597      stemmer_num = stemmernumber (optarg);
    593598      break;
    594599    case 't':
     
    604609      FatalError (1, "Usage: \n"
    605610              "mg_hilite_words --stem_method [0-3]\n"
     611      "                --stemmer [english|lovin|french|simplefrench]\n"
    606612      "                --style [bold|underline|italic|emphasis|strong]\n"
    607613              "                --pager [less|more|html|???]\n");
  • trunk/gsdl/packages/mg-1.3d/src/text/mg_invf_dict.c

    r13 r34  
    3636/*
    3737   $Log$
     38   Revision 1.2  1998/11/25 07:55:45  rjmcnab
     39
     40   Modified mg to that you can specify the stemmer you want
     41   to use via a command line option. You specify it to
     42   mg_passes during the build process. The number of the
     43   stemmer that you used is stored within the inverted
     44   dictionary header and the stemmed dictionary header so
     45   the correct stemmer is used in later stages of building
     46   and querying.
     47
    3848   Revision 1.1  1998/11/17 09:35:03  rjmcnab
    3949   *** empty log message ***
     
    5969static void process_files (char *filename);
    6070
    61 void
    62 main (int argc, char **argv)
     71int main (int argc, char **argv)
    6372{
    6473  char *file_name = "";
     
    9099
    91100  process_files (file_name);
    92   exit (0);
     101  return 0;
    93102}
    94103
     
    135144  NTOHUL(idh.static_num_of_docs);
    136145  NTOHUL(idh.num_of_words);
     146  NTOHUL(idh.stemmer_num);
    137147  NTOHUL(idh.stem_method);
    138148
     
    145155  sdh.static_num_of_docs = idh.static_num_of_docs;
    146156  sdh.num_of_words = idh.num_of_words;
     157  sdh.stemmer_num = idh.stemmer_num;
    147158  sdh.stem_method = idh.stem_method;
    148159  sdh.indexed = 0;  /* [RPAP - Jan 97: Stem Index Change] */
     
    317328  HTONUL(sdh.static_num_of_docs);
    318329  HTONUL(sdh.num_of_words);
     330  HTONUL(sdh.stemmer_num);
    319331  HTONUL(sdh.stem_method);
    320332  HTONUL(sdh.indexed);
  • trunk/gsdl/packages/mg-1.3d/src/text/mg_invf_dump.c

    r13 r34  
    4040/*
    4141   $Log$
     42   Revision 1.2  1998/11/25 07:55:46  rjmcnab
     43
     44   Modified mg to that you can specify the stemmer you want
     45   to use via a command line option. You specify it to
     46   mg_passes during the build process. The number of the
     47   stemmer that you used is stored within the inverted
     48   dictionary header and the stemmed dictionary header so
     49   the correct stemmer is used in later stages of building
     50   and querying.
     51
    4252   Revision 1.1  1998/11/17 09:35:05  rjmcnab
    4353   *** empty log message ***
     
    6272
    6373
    64 void
    65 main (int argc, char **argv)
     74int main (int argc, char **argv)
    6675{
    6776  ProgTime start;
     
    99108  process_files (file_name);
    100109  Message ("%s\n", ElapsedTime (&start, NULL));
    101   exit (0);
     110  return 0;
    102111}
    103112
     
    125134  NTOHUL(idh.static_num_of_docs);
    126135  NTOHUL(idh.num_of_words);
     136  NTOHUL(idh.stemmer_num);
    127137  NTOHUL(idh.stem_method);
    128138
  • trunk/gsdl/packages/mg-1.3d/src/text/mg_invf_merge.c

    r13 r34  
    132132  NTOHUL(m[OLD].idh.static_num_of_docs);
    133133  NTOHUL(m[OLD].idh.num_of_words);
     134  NTOHUL(m[OLD].idh.stemmer_num);
    134135  NTOHUL(m[OLD].idh.stem_method);
    135136
     
    147148  NTOHUL(m[NEW].idh.static_num_of_docs);
    148149  NTOHUL(m[NEW].idh.num_of_words);
     150  NTOHUL(m[NEW].idh.stemmer_num);
    149151  NTOHUL(m[NEW].idh.stem_method);
    150152
     
    668670  m[MERGE].idh.num_of_docs = m[MERGE].nDocs;
    669671  m[MERGE].idh.num_of_words = m[OLD].idh.num_of_words + m[NEW].idh.num_of_words;
     672  m[MERGE].idh.stemmer_num = m[OLD].idh.stemmer_num;
    670673  m[MERGE].idh.stem_method = m[OLD].idh.stem_method;
    671674
     
    685688  HTONUL(m[MERGE].idh.static_num_of_docs);
    686689  HTONUL(m[MERGE].idh.num_of_words);
     690  HTONUL(m[MERGE].idh.stemmer_num);
    687691  HTONUL(m[MERGE].idh.stem_method);
    688692
  • trunk/gsdl/packages/mg-1.3d/src/text/mg_invf_rebuild.c

    r13 r34  
    4949/*
    5050   $Log$
     51   Revision 1.2  1998/11/25 07:55:46  rjmcnab
     52
     53   Modified mg to that you can specify the stemmer you want
     54   to use via a command line option. You specify it to
     55   mg_passes during the build process. The number of the
     56   stemmer that you used is stored within the inverted
     57   dictionary header and the stemmed dictionary header so
     58   the correct stemmer is used in later stages of building
     59   and querying.
     60
    5161   Revision 1.1  1998/11/17 09:35:10  rjmcnab
    5262   *** empty log message ***
     
    96106
    97107
    98 void
    99 main (int argc, char **argv)
     108int main (int argc, char **argv)
    100109{
    101110  ProgTime start;
     
    174183  Message ("**** rebuild them with mg_stem_idx.                                     ****\n");
    175184
    176   exit (0);
     185  return 0;
    177186}
    178187
     
    268277  NTOHUL(idh.static_num_of_docs);
    269278  NTOHUL(idh.num_of_words);
     279  NTOHUL(idh.stemmer_num);
    270280  NTOHUL(idh.stem_method);
    271281
  • trunk/gsdl/packages/mg-1.3d/src/text/mg_passes.1

    r24 r34  
    5050[
    5151.BI \-c " numchunks"
     52]
     53[
     54.BI \-a " stemmer"
    5255]
    5356[
     
    202205.I stemmethod
    203206does both case folding and stemming.
     207.TP
     208.BI \-a " stemmer"
     209This specifies the stemmer to use when stemming words.  This
     210is a description of the language the stemmer is intended for
     211or a description of the stemmer.  Valid options include:
     212english, lovin, french, and simplefrench.
    204213.TP
    205214.BI \-b " bufsize"
  • trunk/gsdl/packages/mg-1.3d/src/text/mg_passes.c

    r13 r34  
    4141/*
    4242   $Log$
     43   Revision 1.2  1998/11/25 07:55:47  rjmcnab
     44
     45   Modified mg to that you can specify the stemmer you want
     46   to use via a command line option. You specify it to
     47   mg_passes during the build process. The number of the
     48   stemmer that you used is stored within the inverted
     49   dictionary header and the stemmed dictionary header so
     50   the correct stemmer is used in later stages of building
     51   and querying.
     52
    4353   Revision 1.1  1998/11/17 09:35:13  rjmcnab
    4454   *** empty log message ***
     
    7686double bytes_processed = 0; /* [RJM 07/97: 4G limit] */
    7787double bytes_received = 0; /* [RJM 07/97: 4G limit] */
     88int stemmer_num = 0; /* default to the lovin stemmer */
    7889int stem_method = 0;
    7990
     
    123134"  %*s [-N2] [-W] [-S] [-b buffer-size] [-d dictionary-directory]\n"
    124135"  %*s [-t trace-point Mb] [-m invf-memory] [-c chunk-limit]\n"
    125 "  %*s [-n trace-name] [-C comp-stat-size] [-s stem_method] -f doc-collection-name\n";
     136"  %*s [-n trace-name] [-C comp-stat-size] [-s stem_method]\n"
     137"  %*s [-a stemmer] -f doc-collection-name\n";
    126138
    127139
     
    452464
    453465
    454 void
    455 main (int argc, char **argv)
     466int main (int argc, char **argv)
    456467{
    457468  int ch, in_fd;
     
    462473
    463474  opterr = 0;
    464   while ((ch = getopt (argc, argv, "hC:WGSD123f:d:b:T:I:t:m:N:c:n:s:")) != -1)
     475  while ((ch = getopt (argc, argv, "hC:WGSD123f:d:b:T:I:t:m:N:c:n:s:a:")) != -1)
    465476    {
    466477      switch (ch)
     
    495506    case 'd':
    496507      set_basepath (optarg);
     508      break;
     509    case 'a':
     510      stemmer_num = stemmernumber (optarg);
    497511      break;
    498512    case 's':
     
    601615    fclose (Comp_Stats);
    602616
    603   exit (0);
     617  return 0;
    604618}
  • trunk/gsdl/packages/mg-1.3d/src/text/mg_perf_hash_build.c

    r13 r34  
    3838/*
    3939   $Log$
     40   Revision 1.2  1998/11/25 07:55:47  rjmcnab
     41
     42   Modified mg to that you can specify the stemmer you want
     43   to use via a command line option. You specify it to
     44   mg_passes during the build process. The number of the
     45   stemmer that you used is stored within the inverted
     46   dictionary header and the stemmed dictionary header so
     47   the correct stemmer is used in later stages of building
     48   and querying.
     49
    4050   Revision 1.1  1998/11/17 09:35:15  rjmcnab
    4151   *** empty log message ***
     
    6070int r = -1;
    6171
    62 void
    63 main (int argc, char **argv)
     72int main (int argc, char **argv)
    6473{
    6574  ProgTime start;
     
    9099  process_files (file_name);
    91100  Message ("%s\n", ElapsedTime (&start, NULL));
    92   exit (0);
     101  return 0;
    93102}
    94103
     
    124133  NTOHUL(idh.static_num_of_docs);
    125134  NTOHUL(idh.num_of_words);
     135  NTOHUL(idh.stemmer_num);
    126136  NTOHUL(idh.stem_method);
    127137
  • trunk/gsdl/packages/mg-1.3d/src/text/mg_stem_idx.c

    r13 r34  
    238238  NTOHUL(sd->sdh.static_num_of_docs);
    239239  NTOHUL(sd->sdh.num_of_words);
     240  NTOHUL(sd->sdh.stemmer_num);
    240241  NTOHUL(sd->sdh.stem_method);
    241242  NTOHUL(sd->sdh.indexed);
     
    409410      /* Stem word */
    410411      bcopy ((char *) prev, (char *) word, *prev + 1);
    411       stemmer (stem_method, word);
     412      stemmer (stem_method, sd->sdh.stemmer_num, word);
    412413
    413414      /* Check if word follows straight on from previous word */
     
    828829
    829830/* Main */
    830 void main (int argc, char **argv)
     831int main (int argc, char **argv)
    831832{
    832833  File *idb;    /* File to .invf.dict.blocked */
     
    887888  UpdateStemDict (filename, stem_method);
    888889
    889   exit (0);
    890 }
    891 
     890  return 0;
     891}
     892
  • trunk/gsdl/packages/mg-1.3d/src/text/mg_text_estimate.c

    r13 r34  
    4949/*
    5050   $Log$
     51   Revision 1.2  1998/11/25 07:55:48  rjmcnab
     52
     53   Modified mg to that you can specify the stemmer you want
     54   to use via a command line option. You specify it to
     55   mg_passes during the build process. The number of the
     56   stemmer that you used is stored within the inverted
     57   dictionary header and the stemmed dictionary header so
     58   the correct stemmer is used in later stages of building
     59   and querying.
     60
    5161   Revision 1.1  1998/11/17 09:35:19  rjmcnab
    5262   *** empty log message ***
     
    8595
    8696
    87 void
    88 main (int argc, char **argv)
     97int main (int argc, char **argv)
    8998{
    9099  char *stats_dict = NULL, *comp_dict = NULL;
     
    170179       escape_count * 100.0 / huff_count);
    171180  Message ("%s", ElapsedTime (&StartTime, NULL));
     181  return 0;
    172182}
    173183
  • trunk/gsdl/packages/mg-1.3d/src/text/mg_text_merge.c

    r13 r34  
    209209 * main()
    210210 *=======================================================================*/
    211 void
    212 main (int argc, char *argv[])
     211int main (int argc, char *argv[])
    213212{
    214213  char *progname;
     
    247246  done_merge_text ();
    248247  Message ("%s\n", ElapsedTime (&start, NULL));
    249   exit (0);
    250 }
     248  return 0;
     249}
  • trunk/gsdl/packages/mg-1.3d/src/text/mg_weights_build.c

    r13 r34  
    4242/*
    4343   $Log$
     44   Revision 1.2  1998/11/25 07:55:49  rjmcnab
     45
     46   Modified mg to that you can specify the stemmer you want
     47   to use via a command line option. You specify it to
     48   mg_passes during the build process. The number of the
     49   stemmer that you used is stored within the inverted
     50   dictionary header and the stemmed dictionary header so
     51   the correct stemmer is used in later stages of building
     52   and querying.
     53
    4454   Revision 1.1  1998/11/17 09:35:22  rjmcnab
    4555   *** empty log message ***
     
    7282
    7383
    74 void
    75 main (int argc, char **argv)
     84int main (int argc, char **argv)
    7685{
    7786  ProgTime StartTime;
     
    116125
    117126  Message ("%s", ElapsedTime (&StartTime, NULL));
    118   exit (0);
     127
     128  return 0;
    119129}
    120130
     
    202212  NTOHUL(idh.static_num_of_docs);
    203213  NTOHUL(idh.num_of_words);
     214  NTOHUL(idh.stemmer_num);
    204215  NTOHUL(idh.stem_method);
    205216
  • trunk/gsdl/packages/mg-1.3d/src/text/mgdictlist.c

    r13 r34  
    3737/*
    3838   $Log$
     39   Revision 1.2  1998/11/25 07:55:49  rjmcnab
     40
     41   Modified mg to that you can specify the stemmer you want
     42   to use via a command line option. You specify it to
     43   mg_passes during the build process. The number of the
     44   stemmer that you used is stored within the inverted
     45   dictionary header and the stemmed dictionary header so
     46   the correct stemmer is used in later stages of building
     47   and querying.
     48
    3949   Revision 1.1  1998/11/17 09:35:24  rjmcnab
    4050   *** empty log message ***
     
    8292  NTOHUL(idh.static_num_of_docs);
    8393  NTOHUL(idh.num_of_words);
     94  NTOHUL(idh.stemmer_num);
    8495  NTOHUL(idh.stem_method);
    8596
     
    345356
    346357
    347 void
    348 main (int argc, char **argv)
     358int main (int argc, char **argv)
    349359{
    350360  FILE *fp;
     
    389399    }
    390400  fclose (fp);
    391   exit (0);
    392 }
     401  return 0;
     402}
  • trunk/gsdl/packages/mg-1.3d/src/text/mgquery.c

    r13 r34  
    2424/*
    2525   $Log$
     26   Revision 1.2  1998/11/25 07:55:49  rjmcnab
     27
     28   Modified mg so that you can specify the stemmer you want
     29   to use via a command line option. You specify it to
     30   mg_passes during the build process. The number of the
     31   stemmer that you used is stored within the inverted
     32   dictionary header and the stemmed dictionary header so
     33   the correct stemmer is used in later stages of building
     34   and querying.
     35
    2636   Revision 1.1  1998/11/17 09:35:29  rjmcnab
    2737   *** empty log message ***
     
    968978      return;
    969979    }
    970       sprintf (pager, "%s --style=%s --pager=%s --stem_method=%ld %s",
     980      sprintf (pager, "%s --style=%s --pager=%s --stem_method=%ld --stemmer=%ld %s",
    971981           HILITE_PAGER,
    972982           GetEnv ("hilite_style"),
    973983           GetEnv ("pager"),
    974984           qd->sd->sdh.stem_method,
     985           qd->sd->sdh.stemmer_num,
    975986           terms_str);
    976987
     
    13691380/* Initialises global variables based on command line switches, and opens */
    13701381/* files.  Then calls  query ()  to perform the querying. */
    1371 void
    1372 main (int argc, char **argv)
     1382int main (int argc, char **argv)
    13731383{
    13741384  ProgTime StartTime;
     
    14761486
    14771487  UninitEnv ();
    1478   exit (0);
    1479 }
     1488  return 0;
     1489}
  • trunk/gsdl/packages/mg-1.3d/src/text/mgstat.c

    r13 r34  
    3636/*
    3737   $Log$
     38   Revision 1.2  1998/11/25 07:55:50  rjmcnab
     39
     40   Modified mg so that you can specify the stemmer you want
     41   to use via a command line option. You specify it to
     42   mg_passes during the build process. The number of the
     43   stemmer that you used is stored within the inverted
     44   dictionary header and the stemmed dictionary header so
     45   the correct stemmer is used in later stages of building
     46   and querying.
     47
    3848   Revision 1.1  1998/11/17 09:35:31  rjmcnab
    3949   *** empty log message ***
     
    5969
    6070
    61 void
    62 main (int argc, char **argv)
     71int main (int argc, char **argv)
    6372{
    6473  unsigned long sub_total;
     
    152161  printf ("\n");
    153162  process_file (NULL, "TOTAL", exact);
    154   exit (0);
     163
     164  return 0;
    155165}
    156166
  • trunk/gsdl/packages/mg-1.3d/src/text/query.ranked.c

    r13 r34  
    4949/*
    5050   $Log$
     51   Revision 1.2  1998/11/25 07:55:50  rjmcnab
     52
     53   Modified mg so that you can specify the stemmer you want
     54   to use via a command line option. You specify it to
     55   mg_passes during the build process. The number of the
     56   stemmer that you used is stored within the inverted
     57   dictionary header and the stemmed dictionary header so
     58   the correct stemmer is used in later stages of building
     59   and querying.
     60
    5161   Revision 1.1  1998/11/17 09:35:34  rjmcnab
    5262   *** empty log message ***
     
    319329
    320330      bcopy ((char *) Word, (char *) sWord, *Word + 1);
    321       stemmer (stem_to_apply, sWord);
     331      stemmer (stem_to_apply, sd->sdh.stemmer_num, sWord);
    322332
    323333      if (!indexed || stem_to_apply == 0)
  • trunk/gsdl/packages/mg-1.3d/src/text/stemmer.c

    r13 r34  
    2525#include "stemmer.h"
    2626
     27#include "lovinstem.h"
     28#include "simplefrenchstem.h"
     29
    2730/*
    2831   $Log$
     32   Revision 1.2  1998/11/25 07:55:51  rjmcnab
     33
     34   Modified mg so that you can specify the stemmer you want
     35   to use via a command line option. You specify it to
     36   mg_passes during the build process. The number of the
     37   stemmer that you used is stored within the inverted
     38   dictionary header and the stemmed dictionary header so
     39   the correct stemmer is used in later stages of building
     40   and querying.
     41
    2942   Revision 1.1  1998/11/17 09:35:42  rjmcnab
    3043   *** empty log message ***
     
    4154static char *RCSID = "$Id$";
    4255
     56
     57#define LOVINSTEMMER        0
     58#define SIMPLEFRENCHSTEMMER 1
     59
     60
    4361/* [RJM 03/98: Extended ascii] */
    4462u_char exttolower (u_char c) {
     
    5169}
    5270
     71
     72int stemmernumber (u_char *stemmerdescription) {
     73  u_char descript[MAX_STEM_DESCRIPTION_LEN];
     74  int i;
     75
     76  /* copy and case-fold the description */
     77  for (i=0; (stemmerdescription[i] != '\0') &&
     78     (i < MAX_STEM_DESCRIPTION_LEN-1); i++)
     79    descript[i] = exttolower (stemmerdescription[i]);
     80  descript[i] = '\0';
     81
     82  /* map the description to its number */
     83
     84  if ((strcmp (descript, "0") == 0) ||
     85      (strcmp (descript, "english") == 0) ||
     86      (strcmp (descript, "lovin") == 0))
     87    return LOVINSTEMMER;
     88
     89  if ((strcmp (descript, "1") == 1) ||
     90      (strcmp (descript, "french") == 0) ||
     91      (strcmp (descript, "simplefrench") == 0))
     92    return SIMPLEFRENCHSTEMMER;
     93
     94  return -1;
     95}
     96
     97
     98
    5399/*
    54100 * Method 0 - Do not stem or case fold.
     
    56102 * Method 2 - Stem.
    57103 * Method 3 - Case fold and stem.
     104 *
     105 * The stemmer number should be obtained using
     106 * the stemmernumber function above.
    58107 */
    59108void
    60 stemmer (int method, u_char * word)
     109stemmer (int method, int stemmer, u_char * word)
    61110{
    62111  if (method & 1)
     
    68117    *w = exttolower (*w);
    69118    }
     119
    70120  if (method & 2) {
    71 #ifdef FRENCHSTEMMER
    72     frenchstem (word);
    73 #else
    74     stem (word);
    75 #endif
    76 
     121    switch (stemmer) {
     122    case LOVINSTEMMER: lovinstem (word);
     123      break;
     124    case SIMPLEFRENCHSTEMMER: simplefrenchstem (word);
     125      break;
     126    }
    77127  }
    78128}
  • trunk/gsdl/packages/mg-1.3d/src/text/stemmer.h

    r29 r34  
    2727#include "sysfuncs.h"
    2828
    29 #ifdef FRENCHSTEMMER
    30   #include "frenchstem.h"
    31 #else
    32   #include "stem.h"
    33 #endif
     29#define STEMMER_MASK 3
     30#define MAX_STEM_DESCRIPTION_LEN 16
    3431
    35 #define STEMMER_MASK 3
     32/* stemmernumber will return the stemmer for
     33 * a description of the stemmer. Stemmer descriptions
     34 * are not case sensitive. Valid descriptions are:
     35 *
     36 *   'English'
     37 *   'Lovin'
     38 *   'French'
     39 *   'SimpleFrench'
     40 *
     41 * More than one description might result in the same
     42 * stemmer number (for example, for stemming 'English'
     43 * we currently use the 'Lovin' stemmer).
     44 *
     45 * stemmerdescription is a normal C, null-terminated,
     46 * string.
     47 *
     48 * stemmernumber will return -1 if it doesn't know the
     49 * stemmer description.
     50 */
     51int stemmernumber (u_char *stemmerdescription);
    3652
    3753/*
     
    4056 * Method 2 - Stem.
    4157 * Method 3 - Case fold and stem.
     58 *
     59 * The stemmer number should be obtained using function
     60 * stemmernumber above.
    4261 */
    43 void stemmer (int method, u_char * word);
     62void stemmer (int method, int stemmer, u_char * word);
    4463
    4564#endif
  • trunk/gsdl/packages/mg-1.3d/src/text/term_lists.c

    r13 r34  
    2424/*
    2525   $Log$
     26   Revision 1.2  1998/11/25 07:55:52  rjmcnab
     27
     28   Modified mg so that you can specify the stemmer you want
     29   to use via a command line option. You specify it to
     30   mg_passes during the build process. The number of the
     31   stemmer that you used is stored within the inverted
     32   dictionary header and the stemmed dictionary header so
     33   the correct stemmer is used in later stages of building
     34   and querying.
     35
    2636   Revision 1.1  1998/11/17 09:35:43  rjmcnab
    2737   *** empty log message ***
     
    184194int
    185195AddTerm (TermList ** query_term_list, u_char * Word, int Count, int word_num,
    186      u_long count, u_long doc_count, u_long invf_ptr, u_long invf_len)  /* [RPAP - Feb 97: Term Frequency] */
     196     u_long count, u_long doc_count, u_long invf_ptr, u_long invf_len, int stemmer_num)  /* [RPAP - Feb 97: Term Frequency] */
    187197{
    188198  int j;
     
    221231    if (!te.Stem)
    222232      FatalError (1, "Could NOT create memory to add term");
    223     stemmer (2, te.Stem);
     233    stemmer (2, stemmer_num, te.Stem);
    224234
    225235    te.require_match = 0;  /* [RJM - 07/97: Ranked Required Terms] */
  • trunk/gsdl/packages/mg-1.3d/src/text/term_lists.h

    r13 r34  
    2424/*
    2525   $Log$
     26   Revision 1.2  1998/11/25 07:55:52  rjmcnab
     27
     28   Modified mg so that you can specify the stemmer you want
     29   to use via a command line option. You specify it to
     30   mg_passes during the build process. The number of the
     31   stemmer that you used is stored within the inverted
     32   dictionary header and the stemmed dictionary header so
     33   the correct stemmer is used in later stages of building
     34   and querying.
     35
    2636   Revision 1.1  1998/11/17 09:35:44  rjmcnab
    2737   *** empty log message ***
     
    7787int AddTermEntry (TermList ** query_term_list, TermEntry * te);
    7888int AddTerm (TermList ** query_term_list, u_char * Word, int Count, int word_num,
    79          u_long count, u_long doc_count, u_long invf_ptr, u_long invf_len);  /* [RPAP - Feb 97: Term Frequency] */
     89         u_long count, u_long doc_count, u_long invf_ptr, u_long invf_len, /* [RPAP - Feb 97: Term Frequency] */
     90         int stemmer_num); 
    8091void ResetTermList (TermList ** tl);
    8192void FreeTermList (TermList ** the_tl);
Note: See TracChangeset for help on using the changeset viewer.