/********************************************************************** * * hashfile.cpp -- * Copyright (C) 1999 The New Zealand Digital Library Project * * A component of the Greenstone digital library software * from the New Zealand Digital Library Project at the * University of Waikato, New Zealand. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * *********************************************************************/ #include #define MAXNUMLEN 100 // bytes // The number class is used to store very long unsigned integers. // The least significant of each number is stored first. Numbers can // be no longer than MAXNUMLEN bytes long. A number with length=0 // is considered to be equal to 0. There should be no extra zeros // at the front of the number. struct number { unsigned int len; unsigned char num[MAXNUMLEN]; number () {len = 0;} }; // evaluates a = a + b // a and b can be the same number void my_inc (number &a, number &b) { unsigned int carry = 0; unsigned int num = 0; unsigned int i = 0; while ((carry || (i < a.len) || (i < b.len)) && (i < MAXNUMLEN)) { num = carry; if (i < a.len) num += a.num[i]; if (i < b.len) num += b.num[i]; if (num >= 256) { num -= 256; // assumes that each number in the list < 256 carry = 1; } else { carry = 0; } a.num[i] = (unsigned char)num; ++i; } a.len = i; } // evaluates a = a - b if a >= b // returns true if the subtraction was possible int my_ifpos_dec (number &a, number &b) { int carry = 0; int num = 0; int i = 0; unsigned int len = 0; // the last non-zero digit // make sure a >= b if (b.len > a.len) return 0; if (b.len == a.len) { i = a.len - 1; while (i >= 0) { if (a.num[i] > b.num[i]) break; if (a.num[i] < b.num[i]) return 0; --i; } } unsigned int j = 0; len = 0; while ((j < a.len) || (j < b.len)) { num = -carry; if (j < a.len) num += a.num[j]; if (j < b.len) num -= b.num[j]; if (num < 0) { num += 256; // assumes each number in the list < 256 carry = 1; } else { carry = 0; } a.num[j] = (unsigned char) num; ++j; if (num != 0) len = j; } a.len = len; if (carry) { // panic, this should NOT happen fprintf (stderr, "panic: error in my_ifpos_dec\n"); } return 1; } char *my_convert_num (number &a) { char *result = new char[(a.len*2)+1]; int i,len; char convert[16] = {'0','1','2','3','4','5','6','7', '8','9','a','b','c','d','e','f'}; // zero might be a special case if (a.len == 0) { result[0] = '0'; result[1] = '\0'; return result; } for (i=a.len-1, len=0; i >= 0; --i) { result[len++] = convert[a.num[i]/16]; result[len++] = convert[a.num[i]%16]; } result[len] = '\0'; return result; } // create a hash string from the contents of a file. // // The file is treated as a large base 256 number (each char is a digit), // and the hash value is the remainder when this number is divided by a // very large prime. // // PROBLEM: This is a flawed hash function because the rightmost (highest) // value in remainder is more likely to be a "1" than it is to be any other // number. // // EVIDENCE: About 50% of the files in any GSDL directory have a hash code // that starts with 01* which implies (based on the my_convert_num function) // that the rightmost "digit" of the remainder = z such that (z % 16 = 0) // and (z / 16 = 1) which means (z = 1). // // MAJOR REASON: suppose our prime number was 19. Then, if we have a // reasonably random distribution of numbers for which we are going to // calculate (N % 19), we expect to get a roughly uniform distribution // of remainders where the possible values are 0, 1, 2, 3... 18. The // problem is that 9 out of the 19 possible values (10,11... 18) start // with the digit 1. Thus our hascode will start with "01". // // ANOTHER PROBLEM: Characters in the file are read one at a time; after each // one is read it is prepended to the remainder, then the remainder is // recalculated on the string thus far seen. I am sure the math here // is wrong - if I try calculating (111 mod 7) by the same algorithm, it // simply does nor work. // // ANOTHER POSSIBLE PROLEM: Each character from the file is read into // remainder at the most significant end, and when that character // is a zero you get a number like "01" which would be considered // larger than a number like "8" because it is longer (two digits // instead of 1). // // These comments added by Gordon Paynter (gwp@cs.waikato.ac.nz) in // June 2000. I didn't write any code, however. char *hashfile (char *filename) { FILE *infile = (FILE *)NULL; int i; number primepow[8]; number pow; number remainder; unsigned char c; // our prime number (618970019642690137449562111) pow.num[0] = 255; pow.num[1] = 255; pow.num[2] = 255; pow.num[3] = 255; pow.num[4] = 255; pow.num[5] = 255; pow.num[6] = 255; pow.num[7] = 255; pow.num[8] = 255; pow.num[9] = 255; pow.num[10] = 255; pow.num[11] = 1; pow.len = 12; // calculate 8 multiples of the prime number. // These are used to find the remainder using only subtraction operations for (i=0; i<8; ++i) { primepow[i] = pow; my_inc (pow, pow); } // The "remainder" after division by the prime. Our result. remainder.len = 0; // open the file infile = fopen (filename, "rb"); if (infile == NULL) { return (char *)NULL; } c = (unsigned char)fgetc(infile); while (!feof (infile)) { // make sure the remainder has not grown too large if (remainder.len == MAXNUMLEN-1) { fprintf (stderr, "ERROR - number overflow\n"); return (char *)NULL; } // remainder = remainder * 256 + c for (i=remainder.len; i>0; --i) { remainder.num[i] = remainder.num[i-1]; } remainder.num[0] = c; if (remainder.len > 0 || c != 0) remainder.len = remainder.len+1; // remainder = (remainder % large-prime-number) for (i=7; i>=0; --i) { my_ifpos_dec (remainder, primepow[i]); } // read a new character from the file c = (unsigned char)fgetc(infile); } fclose (infile); return my_convert_num (remainder); } int main (int argc, char *argv[]) { int i; char *result; for (i = 1; i < argc; ++i) { result = hashfile (argv[i]); if (result == NULL) { printf ("ERROR - could not process %s\n", argv[i]); } else { printf ("%s: %s\n", argv[i], result); delete [] result; } } return 0; }