/************************************************************************** * * mgpp_passes.cpp -- Driver for the various passes * Copyright (C) 1994 Neil Sharman * Copyright (C) 1999 Rodger McNab * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * **************************************************************************/ #define _XOPEN_SOURCE 1 #define _XOPEN_SOURCE_EXTENDED 1 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) #pragma warning(disable:4786) #endif // need this to avoid bizarre compiler problems under VC++ 6.0 #if defined (__WIN32__) && !defined (GSDL_USE_IOS_H) # include #endif #include "sysfuncs.h" #ifdef HAVE_MALLINFO # include #endif #if defined __WIN32__ # include # include "getopt_old.h" # define close _close # define open _open #elif defined __CYGWIN__ #include "getopt_old.h" #else # include #endif #include "memlib.h" #include "messages.h" #include "longlong.h" #include "mg_files.h" #include "mg.h" #include "build.h" #include "text.h" #include "stemmer.h" #include "FileBuf.h" #include "TextEl.h" #include "TagInfo.h" #include "words.h" #define MAX_PASSES 5 #define SPECIAL 1 #define TEXT_PASS_1 2 #define TEXT_PASS_2 4 #define IVF_PASS_1 8 #define IVF_PASS_2 16 #define MIN_BUF 8192 unsigned long invf_buffer_size = 5 * 1024 * 1024; /* 5Mb */ static char Passes = 0; static char **files = NULL; static int num_files = 0; struct pass_data { char *name; int (*init) (const TagInfo &tagInfo, char *); int (*process) (const TagInfo &tagInfo, const TextElArray &doc); int (*done) (const TagInfo &tagInfo, char *); }; static pass_data PassData[MAX_PASSES] = { {"special", init_special, process_special, done_special}, {"text.pass1", init_text_1, process_text_1, done_text_1}, {"text.pass2", init_text_2, process_text_2, done_text_2}, {"ivf.pass1", init_ivf_1, process_ivf_1, done_ivf_1}, {"ivf.pass2", init_ivf_2, process_ivf_2, done_ivf_2}, }; static char *usage_str = "\nUSAGE:\n" " %s [-J doc-tag] [-K level-tag] [-L index-level]\n" " %*s [-m invf-memory] [-T1] [-T2] [-I1] [-I2] [-S]\n" " %*s [-C] [-h] [-d directory] [-M maxnumeric] -f name\n\n"; static void Usage (char *err) { if (err) Message (err); fprintf (stderr, usage_str, msg_prefix, strlen (msg_prefix), "", strlen (msg_prefix), ""); exit (1); } int OpenNextFile (int in_fd) { if (in_fd > 0) close (in_fd); if (num_files == 0) return (-1); if ((in_fd = open (files[0], O_RDONLY)) == -1) FatalError (1, "Cannot open %s", files[0]); ++files; --num_files; return (in_fd); } static void Driver (int in_fd, char *file_name, const TagInfo &tagInfo, bool compatMode) { // cout << tagInfo; int pass; unsigned long numBytes = 0; unsigned long numDocs = 0; // initialise all the passes for (pass = 0; pass < MAX_PASSES; ++pass) { if (Passes & (1 << pass)) { if (PassData[pass].init (tagInfo, file_name) == COMPERROR) FatalError (1, "Error during init of \"%s\"", PassData[pass].name); } } // set up various variables FileBuf buf; TextElArray doc; unsigned long docLen = 0; // read and process each file (start with an open file) do { // read and process each document in this file buf.SetFD (in_fd); while (ReadDoc (buf, tagInfo.docTag, doc, docLen, compatMode)) { // give this document to each pass for (pass = 0; pass < MAX_PASSES; ++pass) { if (Passes & (1 << pass)) { if (PassData[pass].process (tagInfo, doc) == COMPERROR) FatalError(1,"Error during processing of \"%s\"",PassData[pass].name); } } // another document has been processed numBytes += docLen; ++numDocs; } } while ((in_fd = OpenNextFile (in_fd)) > 0); // do done for each pass for (pass = 0; pass < MAX_PASSES; ++pass) { if (Passes & (1 << pass)) { if (PassData[pass].done (tagInfo, file_name) == COMPERROR) FatalError (1, "Error during done of \"%s\"", PassData[pass].name); } } } int main (int argc, char **argv) { int ch, in_fd, maxnum; char *filename = NULL; bool compatMode = false; TagInfo tagInfo; tagInfo.SetDocTag ("Document"); msg_prefix = argv[0]; opterr = 0; while ((ch=getopt(argc, argv, "J:K:L:M:f:d:m:I:T:SCh"))!=-1){ switch (ch) { case 'J': tagInfo.SetDocTag (optarg); break; case 'K': tagInfo.AddLevelTag (optarg); break; case 'L': tagInfo.SetIndexLevel (optarg); break; case 'M': maxnum = atoi(optarg); if (4 < maxnum < 512) { MAXNUMERIC = maxnum; } break; case 'f': filename = optarg; break; case 'd': set_basepath (optarg); break; case 'm': invf_buffer_size = (int) (atof (optarg) * 1024 * 1024); break; case 'I': if (*optarg == '1') Passes |= IVF_PASS_1; else if (*optarg == '2') Passes |= IVF_PASS_2; else Usage ("Invalid pass number"); break; case 'T': if (*optarg == '1') Passes |= TEXT_PASS_1; else if (*optarg == '2') Passes |= TEXT_PASS_2; else Usage ("Invalid pass number"); break; case 'S': Passes |= SPECIAL; break; case 'C': compatMode = true; break; case 'h': case '?': Usage (NULL); } } if (!filename || *filename == '\0') FatalError (1, "A document collection name must be specified."); if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2)) FatalError (1, "I1 and I2 cannot be done simultaneously."); if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2)) FatalError (1, "T1 and T2 cannot be done simultaneously."); if (!Passes) FatalError (1, "S, T1, T2, I1 or I2 must be specified."); if (optind < argc) { if ((in_fd = open (argv[optind], O_RDONLY)) == -1) FatalError (1, "Cannot open %s", argv[optind]); files = &argv[optind + 1]; num_files = argc - (optind + 1); } else in_fd = 0; // stdin if (compatMode) tagInfo.SetDocTag ("Document"); // a document tag is also a level tag tagInfo.levelTags.insert (tagInfo.docTag); Driver (in_fd, filename, tagInfo, compatMode); return (0); }