1  /**********************************************************************


2  *


3  * hashfile.cpp 


4  * Copyright (C) 1999 The New Zealand Digital Library Project


5  *


6  * A component of the Greenstone digital library software


7  * from the New Zealand Digital Library Project at the


8  * University of Waikato, New Zealand.


9  *


10  * This program is free software; you can redistribute it and/or modify


11  * it under the terms of the GNU General Public License as published by


12  * the Free Software Foundation; either version 2 of the License, or


13  * (at your option) any later version.


14  *


15  * This program is distributed in the hope that it will be useful,


16  * but WITHOUT ANY WARRANTY; without even the implied warranty of


17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the


18  * GNU General Public License for more details.


19  *


20  * You should have received a copy of the GNU General Public License


21  * along with this program; if not, write to the Free Software


22  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.


23  *


24  *********************************************************************/


25 


26  #include <stdio.h>


27 


28  #define MAXNUMLEN 100 // bytes


29 


30 


31  // The number class is used to store very long unsigned integers.


32  // The least significant of each number is stored first. Numbers can


33  // be no longer than MAXNUMLEN bytes long. A number with length=0


34  // is considered to be equal to 0. There should be no extra zeros


35  // at the front of the number.


36 


37  struct number {


38  unsigned int len;


39  unsigned char num[MAXNUMLEN];


40 


41  number () {len = 0;}


42  };


43 


44 


45  // evaluates a = a + b


46  // a and b can be the same number


47  void my_inc (number &a, number &b) {


48  unsigned int carry = 0;


49  unsigned int num = 0;


50  unsigned int i = 0;


51 


52  while ((carry  (i < a.len)  (i < b.len)) && (i < MAXNUMLEN)) {


53  num = carry;


54  if (i < a.len) num += a.num[i];


55  if (i < b.len) num += b.num[i];


56 


57  if (num >= 256) {


58  num = 256; // assumes that each number in the list < 256


59  carry = 1;


60  } else {


61  carry = 0;


62  }


63  a.num[i] = (unsigned char)num;


64 


65  ++i;


66  }


67 


68  a.len = i;


69  }


70 


71 


72  // evaluates a = a  b if a >= b


73  // returns true if the subtraction was possible


74  int my_ifpos_dec (number &a, number &b) {


75  int carry = 0;


76  int num = 0;


77  int i = 0;


78  unsigned int len = 0; // the last nonzero digit


79 


80  // make sure a >= b


81  if (b.len > a.len) return 0;


82  if (b.len == a.len) {


83  i = a.len  1;


84  while (i >= 0) {


85  if (a.num[i] > b.num[i]) break;


86  if (a.num[i] < b.num[i]) return 0;


87  i;


88  }


89  }


90 


91  unsigned int j = 0;


92  len = 0;


93  while ((j < a.len)  (j < b.len)) {


94  num = carry;


95  if (j < a.len) num += a.num[j];


96  if (j < b.len) num = b.num[j];


97 


98  if (num < 0) {


99  num += 256; // assumes each number in the list < 256


100  carry = 1;


101  } else {


102  carry = 0;


103  }


104 


105  a.num[j] = (unsigned char) num;


106 


107  ++j;


108  if (num != 0) len = j;


109  }


110 


111  a.len = len;


112 


113  if (carry) {


114  // panic, this should NOT happen


115  fprintf (stderr, "panic: error in my_ifpos_dec\n");


116  }


117 


118  return 1;


119  }


120 


121 


122  char *my_convert_num (number &a) {


123  char *result = new char[(a.len*2)+1];


124  int i,len;


125  char convert[16] = {'0','1','2','3','4','5','6','7',


126  '8','9','a','b','c','d','e','f'};


127 


128  // zero might be a special case


129  if (a.len == 0) {


130  result[0] = '0';


131  result[1] = '\0';


132  return result;


133  }


134 


135  for (i=a.len1, len=0; i >= 0; i) {


136  result[len++] = convert[a.num[i]/16];


137  result[len++] = convert[a.num[i]%16];


138  }


139  result[len] = '\0';


140 


141  return result;


142  }


143 


144 


145 


146  // create a hash string from the contents of a file.


147  //


148  // The file is treated as a large base 256 number (each char is a digit),


149  // and the hash value is the remainder when this number is divided by a


150  // very large prime.


151  //


152  // PROBLEM: This is a flawed hash function because the rightmost (highest)


153  // value in remainder is more likely to be a "1" than it is to be any other


154  // number.


155  //


156  // EVIDENCE: About 50% of the files in any GSDL directory have a hash code


157  // that starts with 01* which implies (based on the my_convert_num function)


158  // that the rightmost "digit" of the remainder = z such that (z % 16 = 0)


159  // and (z / 16 = 1) which means (z = 1).


160  //


161  // MAJOR REASON: suppose our prime number was 19. Then, if we have a


162  // reasonably random distribution of numbers for which we are going to


163  // calculate (N % 19), we expect to get a roughly uniform distribution


164  // of remainders where the possible values are 0, 1, 2, 3... 18. The


165  // problem is that 9 out of the 19 possible values (10,11... 18) start


166  // with the digit 1. Thus our hascode will start with "01".


167  //


168  // ANOTHER PROBLEM: Characters in the file are read one at a time; after each


169  // one is read it is prepended to the remainder, then the remainder is


170  // recalculated on the string thus far seen. I am sure the math here


171  // is wrong  if I try calculating (111 mod 7) by the same algorithm, it


172  // simply does nor work.


173  //


174  // ANOTHER POSSIBLE PROLEM: Each character from the file is read into


175  // remainder at the most significant end, and when that character


176  // is a zero you get a number like "01" which would be considered


177  // larger than a number like "8" because it is longer (two digits


178  // instead of 1).


179  //


180  // These comments added by Gordon Paynter ([email protected]) in


181  // June 2000. I didn't write any code, however.


182 


183  char *hashfile (char *filename) {


184  FILE *infile = (FILE *)NULL;


185  int i;


186 


187  number primepow[8];


188  number pow;


189  number remainder;


190  unsigned char c;


191 


192  // our prime number (618970019642690137449562111)


193  pow.num[0] = 255; pow.num[1] = 255;


194  pow.num[2] = 255; pow.num[3] = 255;


195  pow.num[4] = 255; pow.num[5] = 255;


196  pow.num[6] = 255; pow.num[7] = 255;


197  pow.num[8] = 255; pow.num[9] = 255;


198  pow.num[10] = 255; pow.num[11] = 1;


199  pow.len = 12;


200 


201  // calculate 8 multiples of the prime number.


202  // These are used to find the remainder using only subtraction operations


203  for (i=0; i<8; ++i) {


204  primepow[i] = pow;


205  my_inc (pow, pow);


206  }


207 


208  // The "remainder" after division by the prime. Our result.


209  remainder.len = 0;


210 


211  // open the file


212  infile = fopen (filename, "rb");


213  if (infile == NULL) {


214  return (char *)NULL;


215  }


216  c = (unsigned char)fgetc(infile);


217 


218  while (!feof (infile)) {


219 


220  // make sure the remainder has not grown too large


221  if (remainder.len == MAXNUMLEN1) {


222  fprintf (stderr, "ERROR  number overflow\n");


223  return (char *)NULL;


224  }


225 


226  // remainder = remainder * 256 + c


227  for (i=remainder.len; i>0; i) {


228  remainder.num[i] = remainder.num[i1];


229  }


230  remainder.num[0] = c;


231  if (remainder.len > 0  c != 0) remainder.len = remainder.len+1;


232 


233  // remainder = (remainder % largeprimenumber)


234  for (i=7; i>=0; i) {


235  my_ifpos_dec (remainder, primepow[i]);


236  }


237 


238  // read a new character from the file


239  c = (unsigned char)fgetc(infile);


240  }


241 


242  fclose (infile);


243 


244  return my_convert_num (remainder);


245  }


246 


247 


248  int main (int argc, char *argv[]) {


249  int i;


250  char *result;


251 


252  for (i = 1; i < argc; ++i) {


253  result = hashfile (argv[i]);


254  if (result == NULL) {


255  printf ("ERROR  could not process %s\n", argv[i]);


256  } else {


257  printf ("%s: %s\n", argv[i], result);


258  delete [] result;


259  }


260  }


261 


262  return 0;


263  }

