[535] | 1 | /**********************************************************************
|
---|
| 2 | *
|
---|
| 3 | * hashfile.cpp --
|
---|
| 4 | * Copyright (C) 1999 The New Zealand Digital Library Project
|
---|
| 5 | *
|
---|
| 6 | * A component of the Greenstone digital library software
|
---|
| 7 | * from the New Zealand Digital Library Project at the
|
---|
| 8 | * University of Waikato, New Zealand.
|
---|
| 9 | *
|
---|
| 10 | * This program is free software; you can redistribute it and/or modify
|
---|
| 11 | * it under the terms of the GNU General Public License as published by
|
---|
| 12 | * the Free Software Foundation; either version 2 of the License, or
|
---|
| 13 | * (at your option) any later version.
|
---|
| 14 | *
|
---|
| 15 | * This program is distributed in the hope that it will be useful,
|
---|
| 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 18 | * GNU General Public License for more details.
|
---|
| 19 | *
|
---|
| 20 | * You should have received a copy of the GNU General Public License
|
---|
| 21 | * along with this program; if not, write to the Free Software
|
---|
| 22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 23 | *
|
---|
| 24 | *********************************************************************/
|
---|
| 25 |
|
---|
[4] | 26 | #include <stdio.h>
|
---|
| 27 |
|
---|
| 28 | #define MAXNUMLEN 100 // bytes
|
---|
| 29 |
|
---|
| 30 |
|
---|
| 31 | // The number class is used to store very long unsigned integers.
|
---|
| 32 | // The least significant of each number is stored first. Numbers can
|
---|
| 33 | // be no longer than MAXNUMLEN bytes long. A number with length=0
|
---|
| 34 | // is considered to be equal to 0. There should be no extra zeros
|
---|
| 35 | // at the front of the number.
|
---|
| 36 |
|
---|
| 37 | struct number {
|
---|
| 38 | unsigned int len;
|
---|
| 39 | unsigned char num[MAXNUMLEN];
|
---|
| 40 |
|
---|
| 41 | number () {len = 0;}
|
---|
| 42 | };
|
---|
| 43 |
|
---|
| 44 |
|
---|
| 45 | // evaluates a = a + b
|
---|
| 46 | // a and b can be the same number
|
---|
| 47 | void my_inc (number &a, number &b) {
|
---|
| 48 | unsigned int carry = 0;
|
---|
| 49 | unsigned int num = 0;
|
---|
| 50 | unsigned int i = 0;
|
---|
| 51 |
|
---|
| 52 | while ((carry || (i < a.len) || (i < b.len)) && (i < MAXNUMLEN)) {
|
---|
| 53 | num = carry;
|
---|
| 54 | if (i < a.len) num += a.num[i];
|
---|
| 55 | if (i < b.len) num += b.num[i];
|
---|
| 56 |
|
---|
| 57 | if (num >= 256) {
|
---|
| 58 | num -= 256; // assumes that each number in the list < 256
|
---|
| 59 | carry = 1;
|
---|
| 60 | } else {
|
---|
| 61 | carry = 0;
|
---|
| 62 | }
|
---|
| 63 | a.num[i] = (unsigned char)num;
|
---|
| 64 |
|
---|
[9596] | 65 | ++i;
|
---|
[4] | 66 | }
|
---|
| 67 |
|
---|
| 68 | a.len = i;
|
---|
| 69 | }
|
---|
| 70 |
|
---|
| 71 |
|
---|
| 72 | // evaluates a = a - b if a >= b
|
---|
| 73 | // returns true if the subtraction was possible
|
---|
| 74 | int my_ifpos_dec (number &a, number &b) {
|
---|
| 75 | int carry = 0;
|
---|
| 76 | int num = 0;
|
---|
[598] | 77 | int i = 0;
|
---|
[4] | 78 | unsigned int len = 0; // the last non-zero digit
|
---|
| 79 |
|
---|
| 80 | // make sure a >= b
|
---|
| 81 | if (b.len > a.len) return 0;
|
---|
| 82 | if (b.len == a.len) {
|
---|
| 83 | i = a.len - 1;
|
---|
| 84 | while (i >= 0) {
|
---|
| 85 | if (a.num[i] > b.num[i]) break;
|
---|
| 86 | if (a.num[i] < b.num[i]) return 0;
|
---|
[9596] | 87 | --i;
|
---|
[4] | 88 | }
|
---|
| 89 | }
|
---|
| 90 |
|
---|
[598] | 91 | unsigned int j = 0;
|
---|
[4] | 92 | len = 0;
|
---|
[598] | 93 | while ((j < a.len) || (j < b.len)) {
|
---|
[4] | 94 | num = -carry;
|
---|
[598] | 95 | if (j < a.len) num += a.num[j];
|
---|
| 96 | if (j < b.len) num -= b.num[j];
|
---|
[4] | 97 |
|
---|
| 98 | if (num < 0) {
|
---|
| 99 | num += 256; // assumes each number in the list < 256
|
---|
| 100 | carry = 1;
|
---|
| 101 | } else {
|
---|
| 102 | carry = 0;
|
---|
| 103 | }
|
---|
| 104 |
|
---|
[598] | 105 | a.num[j] = (unsigned char) num;
|
---|
[4] | 106 |
|
---|
[9596] | 107 | ++j;
|
---|
[598] | 108 | if (num != 0) len = j;
|
---|
[4] | 109 | }
|
---|
| 110 |
|
---|
| 111 | a.len = len;
|
---|
| 112 |
|
---|
| 113 | if (carry) {
|
---|
| 114 | // panic, this should NOT happen
|
---|
| 115 | fprintf (stderr, "panic: error in my_ifpos_dec\n");
|
---|
| 116 | }
|
---|
| 117 |
|
---|
| 118 | return 1;
|
---|
| 119 | }
|
---|
| 120 |
|
---|
| 121 |
|
---|
| 122 | char *my_convert_num (number &a) {
|
---|
[915] | 123 | char *result = new char[(a.len*2)+1];
|
---|
[4] | 124 | int i,len;
|
---|
| 125 | char convert[16] = {'0','1','2','3','4','5','6','7',
|
---|
| 126 | '8','9','a','b','c','d','e','f'};
|
---|
| 127 |
|
---|
| 128 | // zero might be a special case
|
---|
| 129 | if (a.len == 0) {
|
---|
| 130 | result[0] = '0';
|
---|
| 131 | result[1] = '\0';
|
---|
| 132 | return result;
|
---|
| 133 | }
|
---|
| 134 |
|
---|
[9596] | 135 | for (i=a.len-1, len=0; i >= 0; --i) {
|
---|
[4] | 136 | result[len++] = convert[a.num[i]/16];
|
---|
| 137 | result[len++] = convert[a.num[i]%16];
|
---|
| 138 | }
|
---|
| 139 | result[len] = '\0';
|
---|
| 140 |
|
---|
| 141 | return result;
|
---|
| 142 | }
|
---|
| 143 |
|
---|
| 144 |
|
---|
[1232] | 145 |
|
---|
| 146 | // create a hash string from the contents of a file.
|
---|
| 147 | //
|
---|
| 148 | // The file is treated as a large base 256 number (each char is a digit),
|
---|
| 149 | // and the hash value is the remainder when this number is divided by a
|
---|
| 150 | // very large prime.
|
---|
| 151 | //
|
---|
| 152 | // PROBLEM: This is a flawed hash function because the rightmost (highest)
|
---|
| 153 | // value in remainder is more likely to be a "1" than it is to be any other
|
---|
| 154 | // number.
|
---|
| 155 | //
|
---|
| 156 | // EVIDENCE: About 50% of the files in any GSDL directory have a hash code
|
---|
| 157 | // that starts with 01* which implies (based on the my_convert_num function)
|
---|
| 158 | // that the rightmost "digit" of the remainder = z such that (z % 16 = 0)
|
---|
| 159 | // and (z / 16 = 1) which means (z = 1).
|
---|
| 160 | //
|
---|
[1238] | 161 | // MAJOR REASON: suppose our prime number was 19. Then, if we have a
|
---|
| 162 | // reasonably random distribution of numbers for which we are going to
|
---|
| 163 | // calculate (N % 19), we expect to get a roughly uniform distribution
|
---|
| 164 | // of remainders where the possible values are 0, 1, 2, 3... 18. The
|
---|
| 165 | // problem is that 9 out of the 19 possible values (10,11... 18) start
|
---|
| 166 | // with the digit 1. Thus our hascode will start with "01".
|
---|
| 167 | //
|
---|
| 168 | // ANOTHER PROBLEM: Characters in the file are read one at a time; after each
|
---|
[1232] | 169 | // one is read it is prepended to the remainder, then the remainder is
|
---|
[1238] | 170 | // recalculated on the string thus far seen. I am sure the math here
|
---|
| 171 | // is wrong - if I try calculating (111 mod 7) by the same algorithm, it
|
---|
| 172 | // simply does nor work.
|
---|
[1232] | 173 | //
|
---|
[1238] | 174 | // ANOTHER POSSIBLE PROLEM: Each character from the file is read into
|
---|
| 175 | // remainder at the most significant end, and when that character
|
---|
| 176 | // is a zero you get a number like "01" which would be considered
|
---|
| 177 | // larger than a number like "8" because it is longer (two digits
|
---|
| 178 | // instead of 1).
|
---|
[1232] | 179 | //
|
---|
[1238] | 180 | // These comments added by Gordon Paynter ([email protected]) in
|
---|
| 181 | // June 2000. I didn't write any code, however.
|
---|
[1232] | 182 |
|
---|
[4] | 183 | char *hashfile (char *filename) {
|
---|
[40] | 184 | FILE *infile = (FILE *)NULL;
|
---|
[4] | 185 | int i;
|
---|
| 186 |
|
---|
| 187 | number primepow[8];
|
---|
| 188 | number pow;
|
---|
| 189 | number remainder;
|
---|
| 190 | unsigned char c;
|
---|
| 191 |
|
---|
| 192 | // our prime number (618970019642690137449562111)
|
---|
| 193 | pow.num[0] = 255; pow.num[1] = 255;
|
---|
| 194 | pow.num[2] = 255; pow.num[3] = 255;
|
---|
| 195 | pow.num[4] = 255; pow.num[5] = 255;
|
---|
| 196 | pow.num[6] = 255; pow.num[7] = 255;
|
---|
| 197 | pow.num[8] = 255; pow.num[9] = 255;
|
---|
| 198 | pow.num[10] = 255; pow.num[11] = 1;
|
---|
| 199 | pow.len = 12;
|
---|
| 200 |
|
---|
[1232] | 201 | // calculate 8 multiples of the prime number.
|
---|
| 202 | // These are used to find the remainder using only subtraction operations
|
---|
[9596] | 203 | for (i=0; i<8; ++i) {
|
---|
[4] | 204 | primepow[i] = pow;
|
---|
| 205 | my_inc (pow, pow);
|
---|
| 206 | }
|
---|
| 207 |
|
---|
[1232] | 208 | // The "remainder" after division by the prime. Our result.
|
---|
| 209 | remainder.len = 0;
|
---|
| 210 |
|
---|
| 211 | // open the file
|
---|
[4] | 212 | infile = fopen (filename, "rb");
|
---|
| 213 | if (infile == NULL) {
|
---|
[40] | 214 | return (char *)NULL;
|
---|
[4] | 215 | }
|
---|
[1232] | 216 | c = (unsigned char)fgetc(infile);
|
---|
[4] | 217 |
|
---|
| 218 | while (!feof (infile)) {
|
---|
[1232] | 219 |
|
---|
| 220 | // make sure the remainder has not grown too large
|
---|
[4] | 221 | if (remainder.len == MAXNUMLEN-1) {
|
---|
| 222 | fprintf (stderr, "ERROR - number overflow\n");
|
---|
[40] | 223 | return (char *)NULL;
|
---|
[4] | 224 | }
|
---|
[1232] | 225 |
|
---|
| 226 | // remainder = remainder * 256 + c
|
---|
[9596] | 227 | for (i=remainder.len; i>0; --i) {
|
---|
[4] | 228 | remainder.num[i] = remainder.num[i-1];
|
---|
| 229 | }
|
---|
[1232] | 230 | remainder.num[0] = c;
|
---|
| 231 | if (remainder.len > 0 || c != 0) remainder.len = remainder.len+1;
|
---|
[4] | 232 |
|
---|
[1232] | 233 | // remainder = (remainder % large-prime-number)
|
---|
[9596] | 234 | for (i=7; i>=0; --i) {
|
---|
[4] | 235 | my_ifpos_dec (remainder, primepow[i]);
|
---|
| 236 | }
|
---|
| 237 |
|
---|
[1232] | 238 | // read a new character from the file
|
---|
[4] | 239 | c = (unsigned char)fgetc(infile);
|
---|
| 240 | }
|
---|
| 241 |
|
---|
| 242 | fclose (infile);
|
---|
| 243 |
|
---|
| 244 | return my_convert_num (remainder);
|
---|
| 245 | }
|
---|
| 246 |
|
---|
| 247 |
|
---|
| 248 | int main (int argc, char *argv[]) {
|
---|
| 249 | int i;
|
---|
| 250 | char *result;
|
---|
| 251 |
|
---|
[9596] | 252 | for (i = 1; i < argc; ++i) {
|
---|
[4] | 253 | result = hashfile (argv[i]);
|
---|
| 254 | if (result == NULL) {
|
---|
| 255 | printf ("ERROR - could not process %s\n", argv[i]);
|
---|
| 256 | } else {
|
---|
| 257 | printf ("%s: %s\n", argv[i], result);
|
---|
| 258 | delete [] result;
|
---|
| 259 | }
|
---|
| 260 | }
|
---|
| 261 |
|
---|
| 262 | return 0;
|
---|
| 263 | }
|
---|