/************************************************************************** * * mgpp_passes.cpp -- Driver for the various passes * Copyright (C) 1994 Neil Sharman * Copyright (C) 1999 Rodger McNab * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * **************************************************************************/ #define _XOPEN_SOURCE 1 #define _XOPEN_SOURCE_EXTENDED 1 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) #pragma warning(disable:4786) #endif // need this to avoid bizarre compiler problems under VC++ 6.0 #if defined (__WIN32__) && !defined (GSDL_USE_IOS_H) # include #endif #include "sysfuncs.h" #ifdef HAVE_MALLINFO # include #endif #if defined __WIN32__ # include # include "getopt_old.h" # define close _close # define open _open #elif defined __CYGWIN__ #include "getopt_old.h" #else # include #endif #include "memlib.h" #include "messages.h" #include "longlong.h" #include "mg_files.h" #include "mg.h" #include "build.h" #include "text.h" #include "stemmer.h" #include "FileBuf.h" #include "TextEl.h" #include "TagInfo.h" #include "mgpp_passes_4jni.h" #define MAX_PASSES 5 #define SPECIAL 1 #define TEXT_PASS_1 2 #define TEXT_PASS_2 4 #define IVF_PASS_1 8 #define IVF_PASS_2 16 #define MIN_BUF 8192 unsigned long invf_buffer_size = 5 * 1024 * 1024; /* 5Mb */ static char Passes = 0; static char **files = NULL; static int num_files = 0; TagInfo tagInfo; char *filename = NULL; unsigned long numBytes = 0; unsigned long numDocs = 0; int mgpp_passes_exit_value = 0; struct pass_data { char *name; int (*init) (const TagInfo &tagInfo, char *); int (*process) (const TagInfo &tagInfo, const TextElArray &doc); int (*done) (const TagInfo &tagInfo, char *); }; static pass_data PassData[MAX_PASSES] = { {"special", init_special, process_special, done_special}, {"text.pass1", init_text_1, process_text_1, done_text_1}, {"text.pass2", init_text_2, process_text_2, done_text_2}, {"ivf.pass1", init_ivf_1, process_ivf_1, done_ivf_1}, {"ivf.pass2", init_ivf_2, process_ivf_2, done_ivf_2}, }; /* clear all the settings from one mgpp_passes run to the next */ void clear_variables() { tagInfo.Clear(); tagInfo.SetDocTag ("Document"); Passes = 0; invf_buffer_size = 5 * 1024 * 1024; numBytes = 0; numDocs = 0; mgpp_passes_exit_value = 0; } /* ################################################## */ /* the following are methods to set all the variables that used to be set by command line args */ /* -S, -T1, -T2, -I1, -I2, args to mg_passes */ void add_pass (char pass_type, char pass_num) { switch(pass_type) { case 'S': Passes |= SPECIAL; break; case 'I': if (pass_num == '1') Passes |= IVF_PASS_1; else if (pass_num == '2') Passes |= IVF_PASS_2; else fprintf(stderr, "Invalid pass number %c for pass type %c\n", pass_num, pass_type); break; case 'T': if (pass_num == '1') Passes |= TEXT_PASS_1; else if (pass_num == '2') Passes |= TEXT_PASS_2; else fprintf(stderr, "Invalid pass number %c for pass type %c\n", pass_num, pass_type); break; } } /* -m arg to mgpp_passes */ void set_inversion_limit(int limit) { invf_buffer_size = limit * 1024 * 1024; } /* -J arg to mgpp_passes */ void set_document_tag(const char *tag_name) { tagInfo.SetDocTag (tag_name); // a doc tag is also a level tag tagInfo.AddLevelTag(tag_name); } /* -K arg to mgpp_passes */ void add_level_tag(const char *tag_nam) { tagInfo.AddLevelTag(tag_nam); } /* -L arg to mgpp_passes */ void set_index_level(const char *tag_name) { tagInfo.SetIndexLevel(tag_name); } // is this enough??? /* -f arg to mgpp_passes */ void set_filename(const char * filen) { if (filename) { Xfree (filename); filename = NULL; } filename = Xstrdup (filen); } /* ############################################### */ /* The old driver method has been split into 3: init_driver, process_document (called numdocs times), finalise_driver. The above set vars methods should all be called before init_driver. */ void init_driver () { if (!filename || *filename == '\0') { mgpp_passes_exit_value = 1; FatalError (1, "A document collection name must be specified."); } if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2)) { mgpp_passes_exit_value = 1; FatalError (1, "I1 and I2 cannot be done simultaneously."); } if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2)) { mgpp_passes_exit_value = 1; FatalError (1, "T1 and T2 cannot be done simultaneously."); } if (!Passes) { mgpp_passes_exit_value = 1; FatalError (1, "S, T1, T2, I1 or I2 must be specified."); } // initialise all the passes for (int pass = 0; pass < MAX_PASSES; pass++) { if (Passes & (1 << pass)) { if (PassData[pass].init (tagInfo, filename) == COMPERROR) { mgpp_passes_exit_value = 1; FatalError (1, "Error during init of \"%s\"", PassData[pass].name); } } } } void process_document(u_char *buffer, int len) { TextElArray doc; unsigned long doc_len = 0; while(ReadDoc(&buffer, tagInfo.docTag, doc, doc_len, false)) { // give this document to each pass for (int pass = 0; pass < MAX_PASSES; pass++) { if (Passes & (1 << pass)) { if (PassData[pass].process (tagInfo, doc) == COMPERROR) { mgpp_passes_exit_value = 1; FatalError(1,"Error during processing of \"%s\"",PassData[pass].name); } } } // another document has been processed numBytes += doc_len; numDocs++; cout << "doc_len = "< 0); // do done for each pass for (pass = 0; pass < MAX_PASSES; pass++) { if (Passes & (1 << pass)) { if (PassData[pass].done (tagInfo, file_name) == COMPERROR) FatalError (1, "Error during done of \"%s\"", PassData[pass].name); } } } */