source: branches/New_Config_Format-branch/gsdl/src/hashfile/hashfile.cpp@ 1279

Last change on this file since 1279 was 1279, checked in by sjboddie, 24 years ago

merged changes to trunk into New_Config_Format branch

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.1 KB
Line 
1/**********************************************************************
2 *
3 * hashfile.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include <stdio.h>
27
28#define MAXNUMLEN 100 // bytes
29
30
31// The number class is used to store very long unsigned integers.
32// The least significant of each number is stored first. Numbers can
33// be no longer than MAXNUMLEN bytes long. A number with length=0
34// is considered to be equal to 0. There should be no extra zeros
35// at the front of the number.
36
37struct number {
38 unsigned int len;
39 unsigned char num[MAXNUMLEN];
40
41 number () {len = 0;}
42};
43
44
45// evaluates a = a + b
46// a and b can be the same number
47void my_inc (number &a, number &b) {
48 unsigned int carry = 0;
49 unsigned int num = 0;
50 unsigned int i = 0;
51
52 while ((carry || (i < a.len) || (i < b.len)) && (i < MAXNUMLEN)) {
53 num = carry;
54 if (i < a.len) num += a.num[i];
55 if (i < b.len) num += b.num[i];
56
57 if (num >= 256) {
58 num -= 256; // assumes that each number in the list < 256
59 carry = 1;
60 } else {
61 carry = 0;
62 }
63 a.num[i] = (unsigned char)num;
64
65 i++;
66 }
67
68 a.len = i;
69}
70
71
72// evaluates a = a - b if a >= b
73// returns true if the subtraction was possible
74int my_ifpos_dec (number &a, number &b) {
75 int carry = 0;
76 int num = 0;
77 int i = 0;
78 unsigned int len = 0; // the last non-zero digit
79
80 // make sure a >= b
81 if (b.len > a.len) return 0;
82 if (b.len == a.len) {
83 i = a.len - 1;
84 while (i >= 0) {
85 if (a.num[i] > b.num[i]) break;
86 if (a.num[i] < b.num[i]) return 0;
87 i--;
88 }
89 }
90
91 unsigned int j = 0;
92 len = 0;
93 while ((j < a.len) || (j < b.len)) {
94 num = -carry;
95 if (j < a.len) num += a.num[j];
96 if (j < b.len) num -= b.num[j];
97
98 if (num < 0) {
99 num += 256; // assumes each number in the list < 256
100 carry = 1;
101 } else {
102 carry = 0;
103 }
104
105 a.num[j] = (unsigned char) num;
106
107 j++;
108 if (num != 0) len = j;
109 }
110
111 a.len = len;
112
113 if (carry) {
114 // panic, this should NOT happen
115 fprintf (stderr, "panic: error in my_ifpos_dec\n");
116 }
117
118 return 1;
119}
120
121
122char *my_convert_num (number &a) {
123 char *result = new char[(a.len*2)+1];
124 int i,len;
125 char convert[16] = {'0','1','2','3','4','5','6','7',
126 '8','9','a','b','c','d','e','f'};
127
128 // zero might be a special case
129 if (a.len == 0) {
130 result[0] = '0';
131 result[1] = '\0';
132 return result;
133 }
134
135 for (i=a.len-1, len=0; i >= 0; i--) {
136 result[len++] = convert[a.num[i]/16];
137 result[len++] = convert[a.num[i]%16];
138 }
139 result[len] = '\0';
140
141 return result;
142}
143
144
145
146// create a hash string from the contents of a file.
147//
148// The file is treated as a large base 256 number (each char is a digit),
149// and the hash value is the remainder when this number is divided by a
150// very large prime.
151//
152// PROBLEM: This is a flawed hash function because the rightmost (highest)
153// value in remainder is more likely to be a "1" than it is to be any other
154// number.
155//
156// EVIDENCE: About 50% of the files in any GSDL directory have a hash code
157// that starts with 01* which implies (based on the my_convert_num function)
158// that the rightmost "digit" of the remainder = z such that (z % 16 = 0)
159// and (z / 16 = 1) which means (z = 1).
160//
161// MAJOR REASON: suppose our prime number was 19. Then, if we have a
162// reasonably random distribution of numbers for which we are going to
163// calculate (N % 19), we expect to get a roughly uniform distribution
164// of remainders where the possible values are 0, 1, 2, 3... 18. The
165// problem is that 9 out of the 19 possible values (10,11... 18) start
166// with the digit 1. Thus our hascode will start with "01".
167//
168// ANOTHER PROBLEM: Characters in the file are read one at a time; after each
169// one is read it is prepended to the remainder, then the remainder is
170// recalculated on the string thus far seen. I am sure the math here
171// is wrong - if I try calculating (111 mod 7) by the same algorithm, it
172// simply does nor work.
173//
174// ANOTHER POSSIBLE PROLEM: Each character from the file is read into
175// remainder at the most significant end, and when that character
176// is a zero you get a number like "01" which would be considered
177// larger than a number like "8" because it is longer (two digits
178// instead of 1).
179//
180// These comments added by Gordon Paynter ([email protected]) in
181// June 2000. I didn't write any code, however.
182
183char *hashfile (char *filename) {
184 FILE *infile = (FILE *)NULL;
185 int i;
186
187 number primepow[8];
188 number pow;
189 number remainder;
190 unsigned char c;
191
192 // our prime number (618970019642690137449562111)
193 pow.num[0] = 255; pow.num[1] = 255;
194 pow.num[2] = 255; pow.num[3] = 255;
195 pow.num[4] = 255; pow.num[5] = 255;
196 pow.num[6] = 255; pow.num[7] = 255;
197 pow.num[8] = 255; pow.num[9] = 255;
198 pow.num[10] = 255; pow.num[11] = 1;
199 pow.len = 12;
200
201 // calculate 8 multiples of the prime number.
202 // These are used to find the remainder using only subtraction operations
203 for (i=0; i<8; i++) {
204 primepow[i] = pow;
205 my_inc (pow, pow);
206 }
207
208 // The "remainder" after division by the prime. Our result.
209 remainder.len = 0;
210
211 // open the file
212 infile = fopen (filename, "rb");
213 if (infile == NULL) {
214 return (char *)NULL;
215 }
216 c = (unsigned char)fgetc(infile);
217
218 while (!feof (infile)) {
219
220 // make sure the remainder has not grown too large
221 if (remainder.len == MAXNUMLEN-1) {
222 fprintf (stderr, "ERROR - number overflow\n");
223 return (char *)NULL;
224 }
225
226 // remainder = remainder * 256 + c
227 for (i=remainder.len; i>0; i--) {
228 remainder.num[i] = remainder.num[i-1];
229 }
230 remainder.num[0] = c;
231 if (remainder.len > 0 || c != 0) remainder.len = remainder.len+1;
232
233 // remainder = (remainder % large-prime-number)
234 for (i=7; i>=0; i--) {
235 my_ifpos_dec (remainder, primepow[i]);
236 }
237
238 // read a new character from the file
239 c = (unsigned char)fgetc(infile);
240 }
241
242 fclose (infile);
243
244 return my_convert_num (remainder);
245}
246
247
248int main (int argc, char *argv[]) {
249 int i;
250 char *result;
251
252 for (i = 1; i < argc; i++) {
253 result = hashfile (argv[i]);
254 if (result == NULL) {
255 printf ("ERROR - could not process %s\n", argv[i]);
256 } else {
257 printf ("%s: %s\n", argv[i], result);
258 delete [] result;
259 }
260 }
261
262 return 0;
263}
Note: See TracBrowser for help on using the repository browser.