Changeset 2746


Ignore:
Timestamp:
2001-09-22T00:46:42+12:00 (23 years ago)
Author:
kjm18
Message:

Updated mg to be in line with mg_1.3f. It now uses long long for some variables
to enable indexing of very large collections.

Location:
trunk/gsdl/packages/mg/src/text
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/packages/mg/src/text/build.h

    r439 r2746  
    2626#define H_BUILD
    2727
     28#include "longlong.h"
    2829
    2930#define TERMPARAGRAPH   '\003'
     
    146147 */
    147148
    148 extern double bytes_processed; /* [RJM 07/97: 4G limit] */
     149extern mg_ullong bytes_processed;
    149150/*
    150151 * The number of bytes processed. NOTE: This excludes document separators.
    151152 */
    152153
    153 extern double bytes_received; /* [RJM 07/97: 4G limit] */
     154extern mg_ullong bytes_received;
    154155/*
    155156 * The number of bytes received. NOTE: This includes document separators.
  • trunk/gsdl/packages/mg/src/text/ivf.pass1.c

    r439 r2746  
    4141#include "hash.h"
    4242
     43#include "longlong.h"
    4344
    4445/*
    4546   $Log$
     47   Revision 1.2  2001/09/21 12:46:42  kjm18
     48   updated mg to be in line with mg_1.3f. Now uses long long for some variables
     49   to enable indexing of very large collections.
     50
    4651   Revision 1.1  1999/08/10 21:17:54  sjboddie
    4752   renamed mg-1.3d directory mg
     
    8590 */
    8691
     92/*
     93 *  Modified:
     94 *   - long long bit counts for inverted file
     95 *     (1999-08-05 Tim Bell <[email protected]>)
     96 */
    8797static char *RCSID = "$Id$";
    8898
     
    119129static unsigned long words_read = 0, words_diff = 0, bytes_diff = 0;
    120130static unsigned long outputbytes = 0;
    121 static double inputbytes = 0; /* [RJM 07/97: 4G limit] */
     131static unsigned long inputbytes = 0;
    122132static unsigned long MaxMemInUse = 0;
    123133static unsigned long MemInUse = 0;
     
    140150static int max_first_occr;
    141151
    142 static unsigned long L1_bits = 0, L1_ohead = 0;
    143 static unsigned long L2_bits = 0, L2_ohead = 0;
    144 static unsigned long L3_bits = 0, L3_ohead = 0;
     152static mg_ullong L1_bits = 0;
     153static mg_ullong L2_bits = 0;
     154static mg_ullong L3_bits = 0;
     155static unsigned long L1_ohead = 0;
     156static unsigned long L2_ohead = 0;
     157static unsigned long L3_ohead = 0;
    145158static unsigned long callnum = 0, lcallnum = 0, wordnum = 0, lwordnum = 0;
    146159static unsigned long ptrcnt = 0;
     
    554567{
    555568  int i;
     569  mg_ullong oldL12_bits = 0;
    556570  for (i = 0; i < HashUsed; i++)
    557571    {
     
    567581          (1.6 + log2 (1.0 * words_read / (wrd->wcnt + callnum))));
    568582      L3_ohead += 0;
     583
     584      /* check for overflow */
     585      if (L1_bits + L2_bits < oldL12_bits) {
     586    fprintf(stderr, "ERROR: Inverted file size will probably overflow %d byte unsigned integer\n", sizeof (mg_ullong));
     587    fprintf(stderr, "       counter in pass 2.\n");
     588    if (sizeof (mg_ullong) < 8) {
     589      fprintf(stderr, "       Try compiling with GCC to enable use of 8 bytes for this counter.\n");
     590    }
     591    fprintf(stderr, "       Build aborted.\n");
     592    exit(1);
     593      }
     594      oldL12_bits = L1_bits + L2_bits;
    569595    }
    570596  L3_bits = (L3_bits + L2_bits + L1_bits + 7) / 8;
  • trunk/gsdl/packages/mg/src/text/ivf.pass2.c

    r439 r2746  
    2424/*
    2525   $Log$
     26   Revision 1.2  2001/09/21 12:46:42  kjm18
     27   updated mg to be in line with mg_1.3f. Now uses long long for some variables
     28   to enable indexing of very large collections.
     29
     30 * Revision 1.2  1997/08/02  05:01:57  wew
     31 * changed literal values of 32 for the bit size of magic numbers of
     32 * files to sizeof (unsigned long) * 8, increased the gap at the start
     33 * of the invf during processing to 200 bytes
     34
    2635   Revision 1.1  1999/08/10 21:17:54  sjboddie
    2736   renamed mg-1.3d directory mg
     
    5261   * For version 1.1
    5362   *
     63 */
     64
     65/*
     66 *  Modified:
     67 *   - long long disk pointers and bit counts for inverted file
     68 *     (1999-08-03 Tim Bell <[email protected]>)
     69 *     Code provided by Owen de Kretser <[email protected]>
    5470 */
    5571
     
    7793#include "hash.h"
    7894
     95#include "longlong.h"
     96
     97#ifdef USE_LONG_LONG
     98#define BIO_Random_Seek_X BIO_Random_Seek_LL
     99#define BIO_Random_Tell_X BIO_Random_Tell_LL
     100#else
     101#define BIO_Random_Seek_X BIO_Random_Seek
     102#define BIO_Random_Tell_X BIO_Random_Tell
     103#endif
     104
    79105/* [RPAP - Feb 97: WIN32 Port] */
    80106#ifdef __WIN32__
     
    104130typedef struct invf_state_rec
    105131  {
    106     unsigned long Disk_Ptr;
    107     unsigned long Disk_Last;
     132    mg_ullong Disk_Ptr;
     133    mg_ullong Disk_Last;
    108134    unsigned long Disk_B;
    109135  }
     
    268294
    269295#define ISR_CACHE 1024
    270 #define ISR_ENTRY_SIZE (sizeof(unsigned long)*2 + sizeof(unsigned long))
     296#define ISR_ENTRY_SIZE (sizeof(mg_ullong)*2 + sizeof(unsigned long))
    271297
    272298invf_state_rec *
     
    317343      return 0;
    318344    }
    319       BIO_Random_Seek (32, &rbs);
     345      BIO_Random_Seek_X (sizeof (unsigned long) * 8, &rbs);
    320346      pos = 0;
    321347    }
     
    345371  u_char prev[MAXSTEMLEN + 1];
    346372  int i;
    347   unsigned long totalIbits;
     373  mg_ullong totalIbits;
     374  mg_ullong lasttotalIbits;
    348375  double logN = 0.0;
    349376
     377 
    350378  if (open_files (file_name) == COMPERROR)
    351379    return COMPERROR;
     
    419447  totalDbytes += sizeof (chunk) * (ChunkLimit + 2);
    420448
    421   totalIbits = 32;      /* The magic number */
    422   totalIbits += 8 * 100;    /* A 100 byte gap */
     449  totalIbits = sizeof (unsigned long) * 8;      /* The magic number */
     450  totalIbits += 8 * 200;    /* A 200 byte gap */
    423451
    424452  if (MakeWeights)
     
    440468      unsigned long fcnt, wcnt;
    441469
     470      lasttotalIbits = totalIbits;
     471     
    442472      copy = fgetc (dict);
    443473      suff = fgetc (dict);
     
    473503    totalIbits += BIO_Gamma_Bound (wcnt, fcnt);
    474504
     505#ifdef USE_LONG_LONG
     506      totalIbits = (totalIbits + 7ull) & 0xfffffffffffffff8ull;
     507#else
    475508      totalIbits = (totalIbits + 7ul) & 0xfffffff8ul;
    476 
     509#endif
     510
     511      if (totalIbits < lasttotalIbits) {
     512    fprintf(stderr, "ERROR: The totalIbits counter (%d byte unsigned integer) has overflowed.\n", sizeof (mg_ullong));
     513    if (sizeof (mg_ullong) < 8) {
     514      fprintf(stderr, "       Try compiling with GCC to enable use of 8 bytes for this counter.\n");
     515    }
     516    fprintf(stderr, "       Build aborted.\n");
     517    exit(1);
     518      }
    477519    }
    478520
    479521
    480522  /* now convert to bytes, and actually get the space */
     523#ifdef USE_LONG_LONG
     524  totalIbytes = (totalIbits + 7ull) >> 3ull;
     525#else
    481526  totalIbytes = (totalIbits + 7ul) >> 3ul;
    482 
     527#endif
    483528
    484529  return (COMPALLOK);
     
    648693      register int B;
    649694
    650       BIO_Random_Seek (isr->Disk_Ptr, &rbs);    /* Position in invf file */
     695      BIO_Random_Seek_X (isr->Disk_Ptr, &rbs);  /* Position in invf file */
    651696
    652697      B = isr->Disk_B;
     
    711756    }
    712757
    713       isr->Disk_Ptr = BIO_Random_Tell (&rbs);
     758      isr->Disk_Ptr = BIO_Random_Tell_X (&rbs);
    714759
    715760    }
     
    948993{
    949994  long i;
    950   unsigned long totalIbits;
     995  mg_ullong totalIbits;
    951996  unsigned long invf_len;
    952997  unsigned long bytes_output;
     
    9811026  bytes_output = ftell (invf_out);
    9821027
    983   totalIbits = 32;      /* The magic number */
    984   totalIbits += 8 * 100;    /* A 100 byte gap */
     1028  totalIbits = sizeof (unsigned long) * 8;      /* The magic number */
     1029  totalIbits += 8 * 200;    /* A 200 byte gap */
    9851030
    9861031  /* find the right place in the file to start reading p values */
     
    10341079      if (InvfLevel >= 2)
    10351080    totalIbits += BIO_Gamma_Bound (wcnt, fcnt);
     1081#ifdef USE_LONG_LONG
     1082      totalIbits = (totalIbits + 7ull) & 0xfffffffffffffff8ull;
     1083#else
    10361084      totalIbits = (totalIbits + 7ul) & 0xfffffff8ul;
     1085#endif
    10371086
    10381087    }
  • trunk/gsdl/packages/mg/src/text/mg_passes.c

    r2538 r2746  
    3232#include "timing.h"
    3333
     34#include "longlong.h"
     35
    3436#include "mg_files.h"
    3537#include "mg.h"
     
    4143/*
    4244   $Log$
     45   Revision 1.3  2001/09/21 12:46:42  kjm18
     46   updated mg to be in line with mg_1.3f. Now uses long long for some variables
     47   to enable indexing of very large collections.
     48
    4349   Revision 1.2  2001/06/12 23:23:42  jrm21
    4450   fixed a bug where mg_passes segfaults when trying to print the usage message.
     
    95101FILE *Comp_Stats = NULL;
    96102int comp_stat_point = 0;
    97 double bytes_processed = 0; /* [RJM 07/97: 4G limit] */
    98 double bytes_received = 0; /* [RJM 07/97: 4G limit] */
     103mg_ullong bytes_processed = 0;
     104mg_ullong bytes_received = 0;
    99105int stemmer_num = 0; /* default to the lovin stemmer */
    100106int stem_method = 0;
     
    349355          }
    350356        if (Trace)
    351           fprintf (Trace, "%10.0f bytes |%7lu docs | %s\n",
     357          fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs | %s\n",
    352358               bytes_processed, num_docs,
    353359               ElapsedTime (&StartTime, NULL));
     
    374380          mi = mallinfo ();
    375381          block_bytes -= trace;
    376           fprintf (Trace, "%10.0f bytes |%7lu docs |%7.3f Mb | %s\n",
     382          fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs |%7.3f Mb | %s\n",
    377383               bytes_processed, num_docs, mi.arena / 1024.0 / 1024.0,
    378384               ElapsedTime (&StartTime, NULL));
    379385#else
    380386          block_bytes -= trace;
    381           fprintf (Trace, "%10.0f bytes |%7lu docs | %s\n",
     387          fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs | %s\n",
    382388               bytes_processed, num_docs,
    383389               ElapsedTime (&StartTime, NULL));
     
    430436      struct mallinfo mi;
    431437      mi = mallinfo ();
    432       fprintf (Trace, "%10.0f bytes |%7lu docs |%7.3f Mb | %s\n",
     438      fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs |%7.3f Mb | %s\n",
    433439           bytes_processed, num_docs, mi.arena / 1024.0 / 1024.0,
    434440           ElapsedTime (&StartTime, NULL));
    435441#else
    436       fprintf (Trace, "%10.0f bytes |%7lu docs | %s\n",
     442      fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs | %s\n",
    437443           bytes_processed, num_docs,
    438444           ElapsedTime (&StartTime, NULL));
     
    466472  Message ("Total time      : %s", ElapsedTime (&StartTime, &DoneTime));
    467473  Message ("Documents       : %u", num_docs);
    468   Message ("Bytes received  : %.0f", bytes_received);
    469   Message ("Bytes processed : %.0f", bytes_processed);
     474  Message ("Bytes received  : %" ULL_FS, bytes_received);
     475  Message ("Bytes processed : %" ULL_FS, bytes_processed);
    470476  Message ("Process Rate    : %.1f kB per cpu second",
    471477   (double) bytes_processed / (ProcTime.CPUTime - InitTime.CPUTime) / 1024);
  • trunk/gsdl/packages/mg/src/text/mgstat.c

    r439 r2746  
    3232#include "invf.h"
    3333#include "text.h"
    34 
     34#include "longlong.h"
    3535
    3636/*
    3737   $Log$
     38   Revision 1.2  2001/09/21 12:46:42  kjm18
     39   updated mg to be in line with mg_1.3f. Now uses long long for some variables
     40   to enable indexing of very large collections.
     41
    3842   Revision 1.1  1999/08/10 21:18:19  sjboddie
    3943   renamed mg-1.3d directory mg
     
    6569long ProcessStemBlk (char *name);  /* [RPAP - Jan 97: Stem Index Change] */
    6670
    67 static double inputbytes = 0.0; /* [RJM 07/97: 4G limit] */
    68 static unsigned long total = 0;
     71static mg_ullong inputbytes = 0;
     72static mg_ullong total = 0;
    6973
    7074
     
    7478int main (int argc, char **argv)
    7579{
    76   unsigned long sub_total;
     80  mg_ullong sub_total;
    7781  int fast;
    7882  char *file_name = "";
     
    164168  printf ("\n");
    165169  process_file (NULL, "TOTAL", exact);
    166 
    167170  return 0;
     171
    168172}
    169173
     
    251255    {
    252256      inputbytes = cth.num_of_bytes;
    253       printf ("Input bytes                        : %10.0f, %8.2f Mbyte\n",
    254           cth.num_of_bytes, cth.num_of_bytes / 1024.0 / 1024.0); /* [RJM 07/97: 4G limit] */
     257      printf ("Input bytes                        : %10" ULL_FS ", %8.2f Mbyte\n",
     258          cth.num_of_bytes, (double) cth.num_of_bytes / 1024 / 1024);
    255259      printf ("Documents                          : %10lu\n", cth.num_of_docs);
    256260      printf ("Words in collection [dict]         : %10lu\n", cth.num_of_words);
     
    378382    {
    379383      if (exact)
    380         printf ("%-34s : %10ld bytes   %7.3f%%\n", ext,
     384        printf ("%-34s : %10" ULL_FS " bytes   %7.3f%%\n", ext,
    381385            total,
    382386            100.0 * total / inputbytes);
     
    389393    {
    390394      if (exact)
    391         printf ("%-34s : %10ld bytes\n", ext, total);
     395        printf ("%-34s : %10" ULL_FS " bytes\n", ext, total);
    392396      else
    393397        printf ("%-34s : %8.2f %s\n", ext,
Note: See TracChangeset for help on using the changeset viewer.