1 | /**********************************************************************
|
---|
2 | *
|
---|
3 | * hashfile.cpp --
|
---|
4 | * Copyright (C) 1999 The New Zealand Digital Library Project
|
---|
5 | *
|
---|
6 | * A component of the Greenstone digital library software
|
---|
7 | * from the New Zealand Digital Library Project at the
|
---|
8 | * University of Waikato, New Zealand.
|
---|
9 | *
|
---|
10 | * This program is free software; you can redistribute it and/or modify
|
---|
11 | * it under the terms of the GNU General Public License as published by
|
---|
12 | * the Free Software Foundation; either version 2 of the License, or
|
---|
13 | * (at your option) any later version.
|
---|
14 | *
|
---|
15 | * This program is distributed in the hope that it will be useful,
|
---|
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
18 | * GNU General Public License for more details.
|
---|
19 | *
|
---|
20 | * You should have received a copy of the GNU General Public License
|
---|
21 | * along with this program; if not, write to the Free Software
|
---|
22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
23 | *
|
---|
24 | *********************************************************************/
|
---|
25 |
|
---|
26 | #include <stdio.h>
|
---|
27 |
|
---|
28 | #define MAXNUMLEN 100 // bytes
|
---|
29 |
|
---|
30 |
|
---|
31 | // The number class is used to store very long unsigned integers.
|
---|
32 | // The least significant of each number is stored first. Numbers can
|
---|
33 | // be no longer than MAXNUMLEN bytes long. A number with length=0
|
---|
34 | // is considered to be equal to 0. There should be no extra zeros
|
---|
35 | // at the front of the number.
|
---|
36 |
|
---|
37 | struct number {
|
---|
38 | unsigned int len;
|
---|
39 | unsigned char num[MAXNUMLEN];
|
---|
40 |
|
---|
41 | number () {len = 0;}
|
---|
42 | };
|
---|
43 |
|
---|
44 |
|
---|
45 | // evaluates a = a + b
|
---|
46 | // a and b can be the same number
|
---|
47 | void my_inc (number &a, number &b) {
|
---|
48 | unsigned int carry = 0;
|
---|
49 | unsigned int num = 0;
|
---|
50 | unsigned int i = 0;
|
---|
51 |
|
---|
52 | while ((carry || (i < a.len) || (i < b.len)) && (i < MAXNUMLEN)) {
|
---|
53 | num = carry;
|
---|
54 | if (i < a.len) num += a.num[i];
|
---|
55 | if (i < b.len) num += b.num[i];
|
---|
56 |
|
---|
57 | if (num >= 256) {
|
---|
58 | num -= 256; // assumes that each number in the list < 256
|
---|
59 | carry = 1;
|
---|
60 | } else {
|
---|
61 | carry = 0;
|
---|
62 | }
|
---|
63 | a.num[i] = (unsigned char)num;
|
---|
64 |
|
---|
65 | i++;
|
---|
66 | }
|
---|
67 |
|
---|
68 | a.len = i;
|
---|
69 | }
|
---|
70 |
|
---|
71 |
|
---|
72 | // evaluates a = a - b if a >= b
|
---|
73 | // returns true if the subtraction was possible
|
---|
74 | int my_ifpos_dec (number &a, number &b) {
|
---|
75 | int carry = 0;
|
---|
76 | int num = 0;
|
---|
77 | int i = 0;
|
---|
78 | unsigned int len = 0; // the last non-zero digit
|
---|
79 |
|
---|
80 | // make sure a >= b
|
---|
81 | if (b.len > a.len) return 0;
|
---|
82 | if (b.len == a.len) {
|
---|
83 | i = a.len - 1;
|
---|
84 | while (i >= 0) {
|
---|
85 | if (a.num[i] > b.num[i]) break;
|
---|
86 | if (a.num[i] < b.num[i]) return 0;
|
---|
87 | i--;
|
---|
88 | }
|
---|
89 | }
|
---|
90 |
|
---|
91 | unsigned int j = 0;
|
---|
92 | len = 0;
|
---|
93 | while ((j < a.len) || (j < b.len)) {
|
---|
94 | num = -carry;
|
---|
95 | if (j < a.len) num += a.num[j];
|
---|
96 | if (j < b.len) num -= b.num[j];
|
---|
97 |
|
---|
98 | if (num < 0) {
|
---|
99 | num += 256; // assumes each number in the list < 256
|
---|
100 | carry = 1;
|
---|
101 | } else {
|
---|
102 | carry = 0;
|
---|
103 | }
|
---|
104 |
|
---|
105 | a.num[j] = (unsigned char) num;
|
---|
106 |
|
---|
107 | j++;
|
---|
108 | if (num != 0) len = j;
|
---|
109 | }
|
---|
110 |
|
---|
111 | a.len = len;
|
---|
112 |
|
---|
113 | if (carry) {
|
---|
114 | // panic, this should NOT happen
|
---|
115 | fprintf (stderr, "panic: error in my_ifpos_dec\n");
|
---|
116 | }
|
---|
117 |
|
---|
118 | return 1;
|
---|
119 | }
|
---|
120 |
|
---|
121 |
|
---|
122 | char *my_convert_num (number &a) {
|
---|
123 | char *result = new char[(a.len*2)+1];
|
---|
124 | int i,len;
|
---|
125 | char convert[16] = {'0','1','2','3','4','5','6','7',
|
---|
126 | '8','9','a','b','c','d','e','f'};
|
---|
127 |
|
---|
128 | // zero might be a special case
|
---|
129 | if (a.len == 0) {
|
---|
130 | result[0] = '0';
|
---|
131 | result[1] = '\0';
|
---|
132 | return result;
|
---|
133 | }
|
---|
134 |
|
---|
135 | for (i=a.len-1, len=0; i >= 0; i--) {
|
---|
136 | result[len++] = convert[a.num[i]/16];
|
---|
137 | result[len++] = convert[a.num[i]%16];
|
---|
138 | }
|
---|
139 | result[len] = '\0';
|
---|
140 |
|
---|
141 | return result;
|
---|
142 | }
|
---|
143 |
|
---|
144 |
|
---|
145 |
|
---|
146 | // create a hash string from the contents of a file.
|
---|
147 | //
|
---|
148 | // The file is treated as a large base 256 number (each char is a digit),
|
---|
149 | // and the hash value is the remainder when this number is divided by a
|
---|
150 | // very large prime.
|
---|
151 | //
|
---|
152 | // PROBLEM: This is a flawed hash function because the rightmost (highest)
|
---|
153 | // value in remainder is more likely to be a "1" than it is to be any other
|
---|
154 | // number.
|
---|
155 | //
|
---|
156 | // EVIDENCE: About 50% of the files in any GSDL directory have a hash code
|
---|
157 | // that starts with 01* which implies (based on the my_convert_num function)
|
---|
158 | // that the rightmost "digit" of the remainder = z such that (z % 16 = 0)
|
---|
159 | // and (z / 16 = 1) which means (z = 1).
|
---|
160 | //
|
---|
161 | // MAJOR REASON: suppose our prime number was 19. Then, if we have a
|
---|
162 | // reasonably random distribution of numbers for which we are going to
|
---|
163 | // calculate (N % 19), we expect to get a roughly uniform distribution
|
---|
164 | // of remainders where the possible values are 0, 1, 2, 3... 18. The
|
---|
165 | // problem is that 9 out of the 19 possible values (10,11... 18) start
|
---|
166 | // with the digit 1. Thus our hascode will start with "01".
|
---|
167 | //
|
---|
168 | // ANOTHER PROBLEM: Characters in the file are read one at a time; after each
|
---|
169 | // one is read it is prepended to the remainder, then the remainder is
|
---|
170 | // recalculated on the string thus far seen. I am sure the math here
|
---|
171 | // is wrong - if I try calculating (111 mod 7) by the same algorithm, it
|
---|
172 | // simply does nor work.
|
---|
173 | //
|
---|
174 | // ANOTHER POSSIBLE PROLEM: Each character from the file is read into
|
---|
175 | // remainder at the most significant end, and when that character
|
---|
176 | // is a zero you get a number like "01" which would be considered
|
---|
177 | // larger than a number like "8" because it is longer (two digits
|
---|
178 | // instead of 1).
|
---|
179 | //
|
---|
180 | // These comments added by Gordon Paynter ([email protected]) in
|
---|
181 | // June 2000. I didn't write any code, however.
|
---|
182 |
|
---|
183 | char *hashfile (char *filename) {
|
---|
184 | FILE *infile = (FILE *)NULL;
|
---|
185 | int i;
|
---|
186 |
|
---|
187 | number primepow[8];
|
---|
188 | number pow;
|
---|
189 | number remainder;
|
---|
190 | unsigned char c;
|
---|
191 |
|
---|
192 | // our prime number (618970019642690137449562111)
|
---|
193 | pow.num[0] = 255; pow.num[1] = 255;
|
---|
194 | pow.num[2] = 255; pow.num[3] = 255;
|
---|
195 | pow.num[4] = 255; pow.num[5] = 255;
|
---|
196 | pow.num[6] = 255; pow.num[7] = 255;
|
---|
197 | pow.num[8] = 255; pow.num[9] = 255;
|
---|
198 | pow.num[10] = 255; pow.num[11] = 1;
|
---|
199 | pow.len = 12;
|
---|
200 |
|
---|
201 | // calculate 8 multiples of the prime number.
|
---|
202 | // These are used to find the remainder using only subtraction operations
|
---|
203 | for (i=0; i<8; i++) {
|
---|
204 | primepow[i] = pow;
|
---|
205 | my_inc (pow, pow);
|
---|
206 | }
|
---|
207 |
|
---|
208 | // The "remainder" after division by the prime. Our result.
|
---|
209 | remainder.len = 0;
|
---|
210 |
|
---|
211 | // open the file
|
---|
212 | infile = fopen (filename, "rb");
|
---|
213 | if (infile == NULL) {
|
---|
214 | return (char *)NULL;
|
---|
215 | }
|
---|
216 | c = (unsigned char)fgetc(infile);
|
---|
217 |
|
---|
218 | while (!feof (infile)) {
|
---|
219 |
|
---|
220 | // make sure the remainder has not grown too large
|
---|
221 | if (remainder.len == MAXNUMLEN-1) {
|
---|
222 | fprintf (stderr, "ERROR - number overflow\n");
|
---|
223 | return (char *)NULL;
|
---|
224 | }
|
---|
225 |
|
---|
226 | // remainder = remainder * 256 + c
|
---|
227 | for (i=remainder.len; i>0; i--) {
|
---|
228 | remainder.num[i] = remainder.num[i-1];
|
---|
229 | }
|
---|
230 | remainder.num[0] = c;
|
---|
231 | if (remainder.len > 0 || c != 0) remainder.len = remainder.len+1;
|
---|
232 |
|
---|
233 | // remainder = (remainder % large-prime-number)
|
---|
234 | for (i=7; i>=0; i--) {
|
---|
235 | my_ifpos_dec (remainder, primepow[i]);
|
---|
236 | }
|
---|
237 |
|
---|
238 | // read a new character from the file
|
---|
239 | c = (unsigned char)fgetc(infile);
|
---|
240 | }
|
---|
241 |
|
---|
242 | fclose (infile);
|
---|
243 |
|
---|
244 | return my_convert_num (remainder);
|
---|
245 | }
|
---|
246 |
|
---|
247 |
|
---|
248 | int main (int argc, char *argv[]) {
|
---|
249 | int i;
|
---|
250 | char *result;
|
---|
251 |
|
---|
252 | for (i = 1; i < argc; i++) {
|
---|
253 | result = hashfile (argv[i]);
|
---|
254 | if (result == NULL) {
|
---|
255 | printf ("ERROR - could not process %s\n", argv[i]);
|
---|
256 | } else {
|
---|
257 | printf ("%s: %s\n", argv[i], result);
|
---|
258 | delete [] result;
|
---|
259 | }
|
---|
260 | }
|
---|
261 |
|
---|
262 | return 0;
|
---|
263 | }
|
---|