Context Navigation

source: main/tags/2.30c/gsdl/src/hashfile/hashfile.cpp@ 32020

Last change on this file since 32020 was 1238, checked in by gwp, 24 years ago
Additional explanation of hash problems in documentation.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 7.1 KB

Line
1	/**********************************************************************
2	*
3	* hashfile.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26	#include <stdio.h>
27
28	#define MAXNUMLEN 100 // bytes
29
30
31	// The number class is used to store very long unsigned integers.
32	// The least significant of each number is stored first. Numbers can
33	// be no longer than MAXNUMLEN bytes long. A number with length=0
34	// is considered to be equal to 0. There should be no extra zeros
35	// at the front of the number.
36
37	struct number {
38	unsigned int len;
39	unsigned char num[MAXNUMLEN];
40
41	number () {len = 0;}
42	};
43
44
45	// evaluates a = a + b
46	// a and b can be the same number
47	void my_inc (number &a, number &b) {
48	unsigned int carry = 0;
49	unsigned int num = 0;
50	unsigned int i = 0;
51
52	while ((carry \|\| (i < a.len) \|\| (i < b.len)) && (i < MAXNUMLEN)) {
53	num = carry;
54	if (i < a.len) num += a.num[i];
55	if (i < b.len) num += b.num[i];
56
57	if (num >= 256) {
58	num -= 256; // assumes that each number in the list < 256
59	carry = 1;
60	} else {
61	carry = 0;
62	}
63	a.num[i] = (unsigned char)num;
64
65	i++;
66	}
67
68	a.len = i;
69	}
70
71
72	// evaluates a = a - b if a >= b
73	// returns true if the subtraction was possible
74	int my_ifpos_dec (number &a, number &b) {
75	int carry = 0;
76	int num = 0;
77	int i = 0;
78	unsigned int len = 0; // the last non-zero digit
79
80	// make sure a >= b
81	if (b.len > a.len) return 0;
82	if (b.len == a.len) {
83	i = a.len - 1;
84	while (i >= 0) {
85	if (a.num[i] > b.num[i]) break;
86	if (a.num[i] < b.num[i]) return 0;
87	i--;
88	}
89	}
90
91	unsigned int j = 0;
92	len = 0;
93	while ((j < a.len) \|\| (j < b.len)) {
94	num = -carry;
95	if (j < a.len) num += a.num[j];
96	if (j < b.len) num -= b.num[j];
97
98	if (num < 0) {
99	num += 256; // assumes each number in the list < 256
100	carry = 1;
101	} else {
102	carry = 0;
103	}
104
105	a.num[j] = (unsigned char) num;
106
107	j++;
108	if (num != 0) len = j;
109	}
110
111	a.len = len;
112
113	if (carry) {
114	// panic, this should NOT happen
115	fprintf (stderr, "panic: error in my_ifpos_dec\n");
116	}
117
118	return 1;
119	}
120
121
122	char *my_convert_num (number &a) {
123	char result = new char[(a.len2)+1];
124	int i,len;
125	char convert[16] = {'0','1','2','3','4','5','6','7',
126	'8','9','a','b','c','d','e','f'};
127
128	// zero might be a special case
129	if (a.len == 0) {
130	result[0] = '0';
131	result[1] = '\0';
132	return result;
133	}
134
135	for (i=a.len-1, len=0; i >= 0; i--) {
136	result[len++] = convert[a.num[i]/16];
137	result[len++] = convert[a.num[i]%16];
138	}
139	result[len] = '\0';
140
141	return result;
142	}
143
144
145
146	// create a hash string from the contents of a file.
147	//
148	// The file is treated as a large base 256 number (each char is a digit),
149	// and the hash value is the remainder when this number is divided by a
150	// very large prime.
151	//
152	// PROBLEM: This is a flawed hash function because the rightmost (highest)
153	// value in remainder is more likely to be a "1" than it is to be any other
154	// number.
155	//
156	// EVIDENCE: About 50% of the files in any GSDL directory have a hash code
157	// that starts with 01* which implies (based on the my_convert_num function)
158	// that the rightmost "digit" of the remainder = z such that (z % 16 = 0)
159	// and (z / 16 = 1) which means (z = 1).
160	//
161	// MAJOR REASON: suppose our prime number was 19. Then, if we have a
162	// reasonably random distribution of numbers for which we are going to
163	// calculate (N % 19), we expect to get a roughly uniform distribution
164	// of remainders where the possible values are 0, 1, 2, 3... 18. The
165	// problem is that 9 out of the 19 possible values (10,11... 18) start
166	// with the digit 1. Thus our hascode will start with "01".
167	//
168	// ANOTHER PROBLEM: Characters in the file are read one at a time; after each
169	// one is read it is prepended to the remainder, then the remainder is
170	// recalculated on the string thus far seen. I am sure the math here
171	// is wrong - if I try calculating (111 mod 7) by the same algorithm, it
172	// simply does nor work.
173	//
174	// ANOTHER POSSIBLE PROLEM: Each character from the file is read into
175	// remainder at the most significant end, and when that character
176	// is a zero you get a number like "01" which would be considered
177	// larger than a number like "8" because it is longer (two digits
178	// instead of 1).
179	//
180	// These comments added by Gordon Paynter ([email protected]) in
181	// June 2000. I didn't write any code, however.
182
183	char hashfile (char filename) {
184	FILE infile = (FILE )NULL;
185	int i;
186
187	number primepow[8];
188	number pow;
189	number remainder;
190	unsigned char c;
191
192	// our prime number (618970019642690137449562111)
193	pow.num[0] = 255; pow.num[1] = 255;
194	pow.num[2] = 255; pow.num[3] = 255;
195	pow.num[4] = 255; pow.num[5] = 255;
196	pow.num[6] = 255; pow.num[7] = 255;
197	pow.num[8] = 255; pow.num[9] = 255;
198	pow.num[10] = 255; pow.num[11] = 1;
199	pow.len = 12;
200
201	// calculate 8 multiples of the prime number.
202	// These are used to find the remainder using only subtraction operations
203	for (i=0; i<8; i++) {
204	primepow[i] = pow;
205	my_inc (pow, pow);
206	}
207
208	// The "remainder" after division by the prime. Our result.
209	remainder.len = 0;
210
211	// open the file
212	infile = fopen (filename, "rb");
213	if (infile == NULL) {
214	return (char *)NULL;
215	}
216	c = (unsigned char)fgetc(infile);
217
218	while (!feof (infile)) {
219
220	// make sure the remainder has not grown too large
221	if (remainder.len == MAXNUMLEN-1) {
222	fprintf (stderr, "ERROR - number overflow\n");
223	return (char *)NULL;
224	}
225
226	// remainder = remainder * 256 + c
227	for (i=remainder.len; i>0; i--) {
228	remainder.num[i] = remainder.num[i-1];
229	}
230	remainder.num[0] = c;
231	if (remainder.len > 0 \|\| c != 0) remainder.len = remainder.len+1;
232
233	// remainder = (remainder % large-prime-number)
234	for (i=7; i>=0; i--) {
235	my_ifpos_dec (remainder, primepow[i]);
236	}
237
238	// read a new character from the file
239	c = (unsigned char)fgetc(infile);
240	}
241
242	fclose (infile);
243
244	return my_convert_num (remainder);
245	}
246
247
248	int main (int argc, char *argv[]) {
249	int i;
250	char *result;
251
252	for (i = 1; i < argc; i++) {
253	result = hashfile (argv[i]);
254	if (result == NULL) {
255	printf ("ERROR - could not process %s\n", argv[i]);
256	} else {
257	printf ("%s: %s\n", argv[i], result);
258	delete [] result;
259	}
260	}
261
262	return 0;
263	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: