Changeset 1232


Ignore:
Timestamp:
2000-06-23T14:25:48+12:00 (24 years ago)
Author:
gwp
Message:

Description of problem in hashing algorithm added to comments,
along with a description of two likely causes. No changes made
to code.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/hashfile/hashfile.cpp

    r915 r1232  
    143143
    144144
     145
     146// create a hash string from the contents of a file.
     147//
     148// The file is treated as a large base 256 number (each char is a digit),
     149// and the hash value is the remainder when this number is divided by a
     150// very large prime.
     151//
     152// PROBLEM: This is a flawed hash function because the rightmost (highest)
     153// value in remainder is more likely to be a "1" than it is to be any other
     154// number. 
     155//
     156// EVIDENCE: About 50% of the files in any GSDL directory have a hash code
     157// that starts with 01* which implies (based on the my_convert_num function)
     158// that the rightmost "digit" of the remainder = z such that (z % 16 = 0)
     159// and (z / 16 = 1) which means (z = 1).
     160//
     161// POSSIBLE CAUSE: Characters in the file are read one at a time; after each
     162// one is read it is prepended to the remainder, then the remainder is
     163// recalculated on the string thus far seen.  I think the math here
     164// may be just plain wrong.
     165//
     166// ANOTHER POSSIBLE CAUSE (this appears to be a genuine bug): Each character
     167// from the file is read into  remainder at the most significant end, and
     168// when that character is a zero you get a number like "01" which would
     169// be considered larger than a number like "8" because it is longer (two
     170// digits instead of 1). 
     171//
     172// These comments added by Gordon Paynter ([email protected]) on
     173// June 23, 2000.  I didn't write this code, however.
     174
    145175char *hashfile (char *filename) {
    146176  FILE *infile = (FILE *)NULL;
    147177  int i;
    148178
    149   // calculate the 8 multiples of the prime number to use
    150   // in the long division
    151179  number primepow[8];
    152180  number pow;
     
    163191  pow.len = 12;
    164192
     193  // calculate 8 multiples of the prime number.
     194  // These are used to find the remainder using only subtraction operations
    165195  for (i=0; i<8; i++) {
    166196    primepow[i] = pow;
     
    168198  }
    169199
     200  // The "remainder" after division by the prime.  Our result.
     201  remainder.len = 0;
     202
     203  // open the file
    170204  infile = fopen (filename, "rb");
    171205  if (infile == NULL) {
    172206    return (char *)NULL;
    173207  }
     208  c = (unsigned char)fgetc(infile);
    174209 
    175   remainder.len = 0;
    176   c = (unsigned char)fgetc(infile);
    177210  while (!feof (infile)) {
    178     // remainder = remainder * 256 + c
     211
     212    // make sure the remainder has not grown too large
    179213    if (remainder.len == MAXNUMLEN-1) {
    180214      fprintf (stderr, "ERROR - number overflow\n");
    181215      return (char *)NULL;
    182216    }
     217
     218    // remainder = remainder * 256 + c
    183219    for (i=remainder.len; i>0; i--) {
    184220      remainder.num[i] = remainder.num[i-1];
    185221    }
    186222    remainder.num[0] = c;
    187     if (remainder.len > 0 || c != 0) remainder.len = remainder.len+1;
     223    if (remainder.len > 0 || c != 0) remainder.len = remainder.len+1;
    188224   
     225    // remainder = (remainder % large-prime-number)
    189226    for (i=7; i>=0; i--) {
    190227      my_ifpos_dec (remainder, primepow[i]);
    191228    }
    192229
     230    // read a new character from the file
    193231    c = (unsigned char)fgetc(infile);
    194232  }
Note: See TracChangeset for help on using the changeset viewer.