[3745] | 1 | /**************************************************************************
|
---|
| 2 | *
|
---|
| 3 | * mg_weights_build.c -- Program to build the document weights file
|
---|
| 4 | * Copyright (C) 1994 Neil Sharman
|
---|
| 5 | *
|
---|
| 6 | * This program is free software; you can redistribute it and/or modify
|
---|
| 7 | * it under the terms of the GNU General Public License as published by
|
---|
| 8 | * the Free Software Foundation; either version 2 of the License, or
|
---|
| 9 | * (at your option) any later version.
|
---|
| 10 | *
|
---|
| 11 | * This program is distributed in the hope that it will be useful,
|
---|
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 14 | * GNU General Public License for more details.
|
---|
| 15 | *
|
---|
| 16 | * You should have received a copy of the GNU General Public License
|
---|
| 17 | * along with this program; if not, write to the Free Software
|
---|
| 18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 19 | *
|
---|
| 20 | * $Id: mg_weights_build.c 7400 2004-05-24 21:12:18Z kjdon $
|
---|
| 21 | *
|
---|
| 22 | **************************************************************************/
|
---|
| 23 |
|
---|
| 24 | #include "sysfuncs.h"
|
---|
| 25 | #include "memlib.h"
|
---|
| 26 | #include "messages.h"
|
---|
| 27 | #include "local_strings.h"
|
---|
| 28 | #include "bitio_gen.h"
|
---|
| 29 | #include "bitio_m.h"
|
---|
| 30 | #include "bitio_m_stdio.h"
|
---|
| 31 | #include "timing.h"
|
---|
| 32 | #include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 33 |
|
---|
| 34 | #include "mg_files.h"
|
---|
| 35 | #include "locallib.h"
|
---|
| 36 | #include "invf.h"
|
---|
| 37 | #include "text.h"
|
---|
| 38 | #include "words.h"
|
---|
| 39 |
|
---|
| 40 | #define MAXBITS (sizeof(unsigned long) * 8)
|
---|
| 41 |
|
---|
| 42 | /*
|
---|
| 43 | $Log$
|
---|
[7400] | 44 | Revision 1.2 2004/05/24 21:12:18 kjdon
|
---|
| 45 | changed a message
|
---|
| 46 |
|
---|
[3745] | 47 | Revision 1.1 2003/02/20 21:18:24 mdewsnip
|
---|
| 48 | Addition of MG package for search and retrieval
|
---|
| 49 |
|
---|
| 50 | Revision 1.1 1999/08/10 21:18:16 sjboddie
|
---|
| 51 | renamed mg-1.3d directory mg
|
---|
| 52 |
|
---|
| 53 | Revision 1.2 1998/11/25 07:55:49 rjmcnab
|
---|
| 54 |
|
---|
| 55 | Modified mg so that you can specify the stemmer you want
|
---|
| 56 | to use via a command line option. You specify it to
|
---|
| 57 | mg_passes during the build process. The number of the
|
---|
| 58 | stemmer that you used is stored within the inverted
|
---|
| 59 | dictionary header and the stemmed dictionary header so
|
---|
| 60 | the correct stemmer is used in later stages of building
|
---|
| 61 | and querying.
|
---|
| 62 |
|
---|
| 63 | Revision 1.1 1998/11/17 09:35:22 rjmcnab
|
---|
| 64 | *** empty log message ***
|
---|
| 65 |
|
---|
| 66 | * Revision 1.4 1994/11/29 00:32:05 tes
|
---|
| 67 | * Committing the new merged files and changes.
|
---|
| 68 | *
|
---|
| 69 | * Revision 1.3 1994/10/20 03:57:00 tes
|
---|
| 70 | * I have rewritten the boolean query optimiser and abstracted out the
|
---|
| 71 | * components of the boolean query.
|
---|
| 72 | *
|
---|
| 73 | * Revision 1.2 1994/09/20 04:41:55 tes
|
---|
| 74 | * For version 1.1
|
---|
| 75 | *
|
---|
| 76 | */
|
---|
| 77 |
|
---|
/* Entry points defined in this file. */
void GenerateWeights (void);
void Make_weight_approx (void);
void Make_text_idx_wgt (void);
unsigned long get_NumPara (void);
unsigned long get_StaticNumOfDocs (void);

/* RCS identification string embedded in the binary. */
static char *RCSID = "$Id: mg_weights_build.c 7400 2004-05-24 21:12:18Z kjdon $";

/* Bits used for each approximate weight; default 8, overridden by -b. */
unsigned char bits = 8;

/* Base name of the inverted-file collection (-f). */
static char *file_name = "";

/* Base name of the text files (-t, or the -f value if -t was not seen first). */
static char *text_file_name = "";

/* Cached header values; zero means "not loaded yet" (see the get_* functions).
 * Objects with static storage duration start out as zero. */
static unsigned long NumPara;
static unsigned long StaticNumOfDocs;
|
---|
| 91 |
|
---|
| 92 |
|
---|
| 93 | int main (int argc, char **argv)
|
---|
| 94 | {
|
---|
| 95 | ProgTime StartTime;
|
---|
| 96 | int ch;
|
---|
| 97 | opterr = 0;
|
---|
| 98 | msg_prefix = argv[0];
|
---|
| 99 | while ((ch = getopt (argc, argv, "f:t:d:b:sh")) != -1) /* [RJM 10/98 - Text Filename] */
|
---|
| 100 | switch (ch)
|
---|
| 101 | {
|
---|
| 102 | case 'f': /* input file */
|
---|
| 103 | file_name = optarg;
|
---|
| 104 | if (strlen(text_file_name) == 0) text_file_name = optarg;
|
---|
| 105 | break;
|
---|
| 106 | /* [RJM 10/98 - Text Filename] */
|
---|
| 107 | case 't': /* text input file */
|
---|
| 108 | text_file_name = optarg;
|
---|
| 109 | break;
|
---|
| 110 | case 'd':
|
---|
| 111 | set_basepath (optarg);
|
---|
| 112 | break;
|
---|
| 113 | case 'b':
|
---|
| 114 | bits = atoi (optarg);
|
---|
| 115 | if (bits > 32)
|
---|
| 116 | {
|
---|
| 117 | fprintf (stderr, "b may only take values 0-32\n");
|
---|
| 118 | exit (1);
|
---|
| 119 | }
|
---|
| 120 | break;
|
---|
| 121 | case 'h':
|
---|
| 122 | case '?':
|
---|
| 123 | fprintf (stderr, "usage: %s [-f input_file]"
|
---|
| 124 | "[-d data directory] [-b bits] [-s] [-h]\n", argv[0]);
|
---|
| 125 | exit (1);
|
---|
| 126 | }
|
---|
| 127 | GetTime (&StartTime);
|
---|
| 128 |
|
---|
| 129 | GenerateWeights ();
|
---|
| 130 |
|
---|
| 131 | Make_weight_approx ();
|
---|
| 132 |
|
---|
| 133 | Make_text_idx_wgt ();
|
---|
| 134 |
|
---|
| 135 | Message ("%s", ElapsedTime (&StartTime, NULL));
|
---|
| 136 |
|
---|
| 137 | return 0;
|
---|
| 138 | }
|
---|
| 139 |
|
---|
| 140 |
|
---|
| 141 |
|
---|
| 142 |
|
---|
| 143 | unsigned long
|
---|
| 144 | get_NumPara (void)
|
---|
| 145 | {
|
---|
| 146 | struct invf_dict_header idh;
|
---|
| 147 | FILE *invf_dict;
|
---|
| 148 | if (NumPara)
|
---|
| 149 | return (NumPara);
|
---|
| 150 | invf_dict = open_file (file_name, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD,
|
---|
| 151 | MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
|
---|
| 152 | fread ((char *) &idh, sizeof (idh), 1, invf_dict);
|
---|
| 153 | fclose (invf_dict);
|
---|
| 154 | NTOHUL2(idh.num_of_docs, NumPara); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 155 | return NumPara;
|
---|
| 156 | }
|
---|
| 157 |
|
---|
| 158 |
|
---|
| 159 |
|
---|
| 160 | unsigned long
|
---|
| 161 | get_StaticNumOfDocs (void)
|
---|
| 162 | /* the static number of documents is the N parameter used to
|
---|
| 163 | * decode document gaps in the inverted file encoded using
|
---|
| 164 | * the Bblock method.
|
---|
| 165 | */
|
---|
| 166 | {
|
---|
| 167 | struct invf_dict_header idh;
|
---|
| 168 | FILE *invf_dict;
|
---|
| 169 | if (StaticNumOfDocs)
|
---|
| 170 | return (StaticNumOfDocs);
|
---|
| 171 | invf_dict = open_file (file_name, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD,
|
---|
| 172 | MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
|
---|
| 173 | fread ((char *) &idh, sizeof (idh), 1, invf_dict);
|
---|
| 174 | fclose (invf_dict);
|
---|
| 175 | NTOHUL2(idh.static_num_of_docs, StaticNumOfDocs); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 176 | return StaticNumOfDocs;
|
---|
| 177 | }
|
---|
| 178 |
|
---|
| 179 |
|
---|
| 180 |
|
---|
| 181 | void GenerateWeights (void) {
|
---|
| 182 | FILE *dict, *invf, *f, *idx;
|
---|
| 183 | struct invf_dict_header idh;
|
---|
| 184 | struct invf_file_header ifh;
|
---|
| 185 | int i;
|
---|
| 186 | double logN;
|
---|
| 187 | float *DocWeights;
|
---|
| 188 |
|
---|
| 189 | /* make sure the globals NumPara and StaticNumOfDocs are loaded */
|
---|
| 190 | get_NumPara ();
|
---|
| 191 | get_StaticNumOfDocs ();
|
---|
| 192 |
|
---|
| 193 | /* check to see if the weights file has already been built */
|
---|
| 194 | if ((f = open_file (file_name, WEIGHTS_SUFFIX, "rb", MAGIC_WGHT,
|
---|
| 195 | MG_CONTINUE)) != NULL) {
|
---|
| 196 | fclose (f);
|
---|
| 197 | return;
|
---|
| 198 | }
|
---|
| 199 | Message ("The file \"%s.weight\" does not exist.", file_name);
|
---|
| 200 | Message ("Building the weight data from the file \"%s.invf\".", file_name);
|
---|
| 201 |
|
---|
| 202 | logN = log ((double) NumPara);
|
---|
| 203 |
|
---|
| 204 | /* allocate memory for the weights */
|
---|
| 205 | if (!(DocWeights = Xmalloc (sizeof (float) * (NumPara + 1))))
|
---|
| 206 | FatalError (1, "No memory for doc weights");
|
---|
| 207 | bzero ((char *) DocWeights, sizeof (float) * (NumPara + 1));
|
---|
| 208 |
|
---|
| 209 | /* open the .invf.dict file and read in its header */
|
---|
| 210 | dict = open_file (file_name, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD,
|
---|
| 211 | MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
|
---|
| 212 | fread ((char *) &idh, sizeof (idh), 1, dict);
|
---|
| 213 |
|
---|
| 214 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 215 | NTOHUL(idh.lookback);
|
---|
| 216 | NTOHUL(idh.dict_size);
|
---|
| 217 | NTOHUL(idh.total_bytes);
|
---|
| 218 | NTOHUL(idh.index_string_bytes);
|
---|
| 219 | NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
|
---|
| 220 | NTOHUL(idh.num_of_docs);
|
---|
| 221 | NTOHUL(idh.static_num_of_docs);
|
---|
| 222 | NTOHUL(idh.num_of_words);
|
---|
| 223 | NTOHUL(idh.stemmer_num);
|
---|
| 224 | NTOHUL(idh.stem_method);
|
---|
| 225 |
|
---|
| 226 | /* open .invf.idx */
|
---|
| 227 | idx = open_file (file_name, INVF_IDX_SUFFIX, "rb", MAGIC_INVI,
|
---|
| 228 | MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
|
---|
| 229 |
|
---|
| 230 | /* open .invf and read in its header */
|
---|
| 231 | invf = open_file (file_name, INVF_SUFFIX, "rb", MAGIC_INVF,
|
---|
| 232 | MG_ABORT);
|
---|
| 233 | fread ((char *) &ifh, sizeof (ifh), 1, invf);
|
---|
| 234 |
|
---|
| 235 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 236 | NTOHUL(ifh.no_of_words);
|
---|
| 237 | NTOHUL(ifh.no_of_ptrs);
|
---|
| 238 | NTOHUL(ifh.skip_mode);
|
---|
| 239 | for (i = 0; i <= 15; i++)
|
---|
| 240 | NTOHUL(ifh.params[i]);
|
---|
| 241 | NTOHUL(ifh.InvfLevel);
|
---|
| 242 |
|
---|
| 243 | /* make sure the inverted file does not contain skips and is not level 1 */
|
---|
| 244 | if (ifh.skip_mode != 0)
|
---|
| 245 | FatalError (0, "Can\'t make weights file from a skipped inverted file.");
|
---|
| 246 | if (ifh.InvfLevel == 1)
|
---|
| 247 | FatalError (0, "Can\'t make weights file from level 1 inverted file.");
|
---|
| 248 |
|
---|
| 249 | DECODE_START (invf)
|
---|
| 250 |
|
---|
| 251 | /* process each word adding its contributions to the document weights */
|
---|
| 252 | for (i = 0; i < ifh.no_of_words; i++)
|
---|
| 253 | {
|
---|
| 254 | u_char dummy1, dummy2[MAXSTEMLEN + 1];
|
---|
| 255 | unsigned long fcnt, wcnt, blk, CurrDoc, p, j;
|
---|
| 256 | float idf;
|
---|
| 257 |
|
---|
| 258 | /* give a little feedback every 4096 words */
|
---|
| 259 | if ((i & 0xfff) == 0)
|
---|
| 260 | fprintf (stderr, ".");
|
---|
| 261 |
|
---|
| 262 | /* read an entry for a word, just to get p value */
|
---|
| 263 | dummy1 = fgetc (dict);
|
---|
| 264 | dummy1 = fgetc (dict);
|
---|
| 265 | fread (dummy2, sizeof (u_char), dummy1, dict);
|
---|
| 266 | fread ((char *) &fcnt, sizeof (fcnt), 1, dict);
|
---|
| 267 | fread ((char *) &wcnt, sizeof (wcnt), 1, dict);
|
---|
| 268 |
|
---|
| 269 | dummy2[dummy1] = '\0';
|
---|
| 270 |
|
---|
| 271 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 272 | NTOHUL(fcnt);
|
---|
| 273 | NTOHUL(wcnt);
|
---|
| 274 |
|
---|
| 275 | p = fcnt;
|
---|
| 276 |
|
---|
| 277 | idf = logN - log ((double) fcnt);
|
---|
| 278 | blk = BIO_Bblock_Init (StaticNumOfDocs, p);
|
---|
| 279 | CurrDoc = 0;
|
---|
| 280 |
|
---|
| 281 | /* check the inverted file index entry for this word */
|
---|
| 282 | {
|
---|
| 283 | unsigned long loc;
|
---|
| 284 | fread ((char *) &loc, sizeof (loc), 1, idx);
|
---|
| 285 | NTOHUL(loc); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 286 | if (ftell (invf) != loc)
|
---|
| 287 | {
|
---|
| 288 | FatalError (1, "Word %d %d != %d", i, ftell (invf), loc);
|
---|
| 289 | }
|
---|
| 290 | }
|
---|
| 291 |
|
---|
| 292 | for (j = 0; j < p; j++)
|
---|
| 293 | {
|
---|
| 294 | unsigned long x, tf;
|
---|
| 295 | BBLOCK_DECODE (x, blk);
|
---|
| 296 | CurrDoc += x;
|
---|
| 297 |
|
---|
| 298 | if (CurrDoc > idh.num_of_docs) {
|
---|
| 299 | FatalError (1, "CurrDoc = %d, number of documents = %d",
|
---|
| 300 | CurrDoc, idh.num_of_docs);
|
---|
| 301 | }
|
---|
| 302 |
|
---|
| 303 | if (ifh.InvfLevel >= 2)
|
---|
| 304 | {
|
---|
| 305 | double weight;
|
---|
| 306 | GAMMA_DECODE (tf);
|
---|
| 307 | weight = tf * idf;
|
---|
| 308 | DocWeights[CurrDoc - 1] += weight * weight;
|
---|
| 309 | }
|
---|
| 310 | }
|
---|
| 311 |
|
---|
| 312 | while (__btg)
|
---|
| 313 | DECODE_BIT;
|
---|
| 314 | }
|
---|
| 315 |
|
---|
| 316 | DECODE_DONE
|
---|
| 317 |
|
---|
| 318 | fclose (dict);
|
---|
| 319 | fclose (invf);
|
---|
| 320 | fprintf (stderr, "\n");
|
---|
| 321 |
|
---|
| 322 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 323 | for (i = 0; i < NumPara; i++)
|
---|
| 324 | HTONF(DocWeights[i]);
|
---|
| 325 |
|
---|
| 326 | f = create_file (file_name, WEIGHTS_SUFFIX, "wb", MAGIC_WGHT,
|
---|
| 327 | MG_ABORT);
|
---|
| 328 |
|
---|
| 329 | fwrite ((char *) DocWeights, sizeof (float), NumPara, f);
|
---|
| 330 | fclose (f);
|
---|
| 331 | Xfree (DocWeights);
|
---|
| 332 | }
|
---|
| 333 |
|
---|
| 334 |
|
---|
| 335 |
|
---|
| 336 |
|
---|
| 337 |
|
---|
| 338 |
|
---|
| 339 |
|
---|
| 340 |
|
---|
| 341 |
|
---|
| 342 |
|
---|
| 343 |
|
---|
| 344 | void
|
---|
| 345 | Make_weight_approx (void)
|
---|
| 346 | {
|
---|
| 347 | int i, pos, max;
|
---|
| 348 | unsigned long buf;
|
---|
| 349 | double U, L, B;
|
---|
| 350 | FILE *approx, *exact;
|
---|
| 351 |
|
---|
| 352 | exact = open_file (file_name, WEIGHTS_SUFFIX, "rb", MAGIC_WGHT,
|
---|
| 353 | MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
|
---|
| 354 |
|
---|
| 355 | /* calculate U and L */
|
---|
| 356 | L = 1e300;
|
---|
| 357 | U = 0;
|
---|
| 358 | for (i = 0; i < NumPara; i++)
|
---|
| 359 | {
|
---|
| 360 | float wgt;
|
---|
| 361 | fread ((char *) &wgt, sizeof (wgt), 1, exact);
|
---|
| 362 | NTOHF(wgt); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 363 | wgt = sqrt (wgt);
|
---|
| 364 | if (wgt > U)
|
---|
| 365 | U = wgt;
|
---|
| 366 | if (wgt > 0 && wgt < L)
|
---|
| 367 | L = wgt;
|
---|
| 368 |
|
---|
| 369 | }
|
---|
| 370 | fseek (exact, sizeof (u_long), SEEK_SET);
|
---|
| 371 |
|
---|
| 372 | B = pow (U / L, pow (2.0, -(double) bits));
|
---|
| 373 |
|
---|
| 374 | fprintf (stderr, "L = %f\n", L);
|
---|
| 375 | fprintf (stderr, "U = %f\n", U);
|
---|
| 376 | fprintf (stderr, "B = %f\n", B);
|
---|
| 377 |
|
---|
| 378 |
|
---|
| 379 |
|
---|
| 380 | approx = create_file (file_name, APPROX_WEIGHTS_SUFFIX, "wb",
|
---|
| 381 | MAGIC_WGHT_APPROX, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
|
---|
| 382 |
|
---|
| 383 | fwrite ((char *) &bits, sizeof (bits), 1, approx);
|
---|
| 384 | HTOND(L); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 385 | HTOND(B); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 386 | fwrite ((char *) &L, sizeof (L), 1, approx);
|
---|
| 387 | fwrite ((char *) &B, sizeof (B), 1, approx);
|
---|
| 388 | NTOHD(L); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 389 | NTOHD(B); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 390 |
|
---|
| 391 | max = bits == 32 ? 0xffffffff : (1 << bits) - 1;
|
---|
| 392 | for (buf = pos = i = 0; i < NumPara; i++)
|
---|
| 393 | {
|
---|
| 394 | unsigned long fx;
|
---|
| 395 | float wgt;
|
---|
| 396 | fread ((char *) &wgt, sizeof (wgt), 1, exact);
|
---|
| 397 | NTOHF(wgt); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 398 | wgt = sqrt (wgt);
|
---|
| 399 | if (wgt == 0)
|
---|
| 400 | {
|
---|
| 401 | wgt = L;
|
---|
| 402 | #ifndef QUIET
|
---|
| 403 | Message ("Warning: Document %d had a weight of 0.", i);
|
---|
| 404 | #endif
|
---|
| 405 | }
|
---|
| 406 | fx = (int) floor (log (wgt / L) / log (B));
|
---|
| 407 |
|
---|
| 408 | if (fx > max)
|
---|
| 409 | fx = max;
|
---|
| 410 |
|
---|
| 411 | buf |= (fx << pos);
|
---|
| 412 | pos += bits;
|
---|
| 413 |
|
---|
| 414 | if (pos >= MAXBITS)
|
---|
| 415 | {
|
---|
| 416 | HTONUL(buf);
|
---|
| 417 | fwrite ((char *) &buf, sizeof (buf), 1, approx);
|
---|
| 418 | buf = fx >> (bits - (pos - MAXBITS));
|
---|
| 419 | pos = pos - MAXBITS;
|
---|
| 420 | }
|
---|
| 421 | }
|
---|
| 422 | if (pos > 0)
|
---|
| 423 | {
|
---|
| 424 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 425 | HTONUL(buf);
|
---|
| 426 | fwrite ((char *) &buf, sizeof (buf), 1, approx);
|
---|
| 427 | }
|
---|
| 428 |
|
---|
| 429 | fclose (approx);
|
---|
| 430 | fclose (exact);
|
---|
| 431 | }
|
---|
| 432 |
|
---|
| 433 |
|
---|
| 434 |
|
---|
| 435 |
|
---|
| 436 |
|
---|
| 437 | void
|
---|
| 438 | Make_text_idx_wgt (void)
|
---|
| 439 | {
|
---|
| 440 | compressed_text_header cth;
|
---|
| 441 | int i;
|
---|
| 442 | FILE *idx_wgt, *idx, *para, *exact;
|
---|
| 443 |
|
---|
| 444 | idx_wgt = create_file (file_name, TEXT_IDX_WGT_SUFFIX, "wb", MAGIC_TEXI_WGT,
|
---|
| 445 | MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
|
---|
| 446 |
|
---|
| 447 | /* [RJM 10/98 - Text Filename] */
|
---|
| 448 | idx = open_file (text_file_name, TEXT_IDX_SUFFIX, "rb", MAGIC_TEXI,
|
---|
| 449 | MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
|
---|
| 450 | if (fread (&cth, sizeof (cth), 1, idx) != 1)
|
---|
| 451 | FatalError (1, "Unable to read header of index file");
|
---|
| 452 |
|
---|
| 453 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 454 | NTOHUL(cth.num_of_docs);
|
---|
| 455 | NTOHD(cth.num_of_bytes); /* [RJM 07/97: 4G limit] */
|
---|
| 456 | NTOHUL(cth.num_of_words);
|
---|
| 457 | NTOHUL(cth.length_of_longest_doc);
|
---|
| 458 | NTOHD(cth.ratio);
|
---|
| 459 |
|
---|
| 460 | exact = open_file (file_name, WEIGHTS_SUFFIX, "rb", MAGIC_WGHT,
|
---|
| 461 | MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
|
---|
| 462 |
|
---|
| 463 | get_NumPara ();
|
---|
| 464 | if (cth.num_of_docs != NumPara)
|
---|
| 465 | {
|
---|
[7400] | 466 | Message ("The number of documents %d does not equal "
|
---|
| 467 | "the number of paragraphs %d.", cth.num_of_docs, NumPara);
|
---|
[3745] | 468 | Message ("Using the \"%s.invf.paragraph\" file\n", file_name);
|
---|
| 469 | para = open_file (file_name, INVF_PARAGRAPH_SUFFIX, "rb", MAGIC_PARAGRAPH,
|
---|
| 470 | MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
|
---|
| 471 | }
|
---|
| 472 | else
|
---|
| 473 | para = NULL;
|
---|
| 474 |
|
---|
| 475 | {
|
---|
| 476 | struct
|
---|
| 477 | {
|
---|
| 478 | unsigned long Start;
|
---|
| 479 | float Weight;
|
---|
| 480 | }
|
---|
| 481 | data;
|
---|
| 482 | for (i = 0; i < cth.num_of_docs; i++)
|
---|
| 483 | {
|
---|
| 484 | int count;
|
---|
| 485 | fread ((char *) &data.Start, sizeof (unsigned long), 1, idx);
|
---|
| 486 | if (para && i < cth.num_of_docs)
|
---|
| 487 | {
|
---|
| 488 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 489 | fread ((char *) &count, sizeof (count), 1, para);
|
---|
| 490 | NTOHSI(count);
|
---|
| 491 | }
|
---|
| 492 | else
|
---|
| 493 | count = 1;
|
---|
| 494 | while (count--)
|
---|
| 495 | {
|
---|
| 496 | fread ((char *) &data.Weight, sizeof (float), 1, exact);
|
---|
| 497 | NTOHF(data.Weight); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 498 | data.Weight = sqrt (data.Weight);
|
---|
| 499 | HTONF(data.Weight); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
| 500 | fwrite ((char *) &data, sizeof (data), 1, idx_wgt);
|
---|
| 501 | }
|
---|
| 502 | }
|
---|
| 503 | /* Write out the extra entry for the idx file */
|
---|
| 504 | fread ((char *) &data.Start, sizeof (unsigned long), 1, idx);
|
---|
| 505 | data.Weight = 0;
|
---|
| 506 | fwrite((char*)&data, sizeof(data), 1, idx_wgt);
|
---|
| 507 | }
|
---|
| 508 |
|
---|
| 509 | fclose (idx_wgt);
|
---|
| 510 | fclose (idx);
|
---|
| 511 | fclose (exact);
|
---|
| 512 | if (para)
|
---|
| 513 | fclose (para);
|
---|
| 514 | }
|
---|