Show
Ignore:
Timestamp:
28.11.2012 11:59:17 (7 years ago)
Author:
davidb
Message:

Introduction of two new OIDtype values (hash_on_full_filename and full_filename) designed to help provide more stable document IDs for collections that are rebuilt over time, including when they are rebuilt after the Greenstone install has been upgraded

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/doc.pm

    r26221 r26536  
    204204    my ($type, $metadata) = @_; 
    205205 
    206     if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|filename|dirname|assigned)$/) { 
     206    if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) { 
    207207    $self->{'OIDtype'} = $type; 
    208208    } else { 
     
    404404        my $filename = $self->get_source_filename(); 
    405405        $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/); 
     406        $OID = &util::tidy_up_oid($OID); 
     407    } elsif ($self->{'OIDtype'} eq "full_filename") { 
     408        my $source_filename = $self->get_source_filename(); 
     409        my $dirsep = &util::get_os_dirsep(); 
     410 
     411        $source_filename =~ s/^import$dirsep//; 
     412        $source_filename =~ s/$dirsep/-/g; 
     413        $source_filename =~ s/\./_/g; 
     414 
     415        $OID = $source_filename; 
    406416        $OID = &util::tidy_up_oid($OID); 
    407417    } elsif ($self->{'OIDtype'} eq "dirname") { 
     
    439449    if ($use_hash_oid) { 
    440450        my $hash_on_file = 1;  
     451        my $hash_on_ga_xml = 0; 
     452 
    441453        if ($self->{'OIDtype'} eq "hash_on_ga_xml") { 
    442454        $hash_on_file = 0; 
     455        $hash_on_ga_xml = 1; 
    443456        } 
     457 
     458        if ($self->{'OIDtype'} eq "hash_on_full_filename") { 
     459        $hash_on_file = 0; 
     460        $hash_on_ga_xml = 0; 
     461 
     462        my $source_filename = $self->get_source_filename(); 
     463        my $dirsep = &util::get_os_dirsep(); 
     464 
     465        $source_filename =~ s/^import$dirsep//; 
     466        $source_filename =~ s/$dirsep/-/g; 
     467        $source_filename =~ s/\./_/g; 
     468         
     469        # If the filename is very short then (handled naively) 
     470        # this can cause congestion in the hash values 
     471        # computed, leading to documents sharing the same leading 
     472        # hex values in the computed hash. 
     473        # 
     474        # The solution taken here is to repeat the file name 
     475        # a sufficient number of times (up to 
     476        # the character limit defined in 'rep_limit') and 
     477        # make that the content that is hashed on 
     478 
     479        # *** Think twice before changing the following value 
     480        # as it will break backward compatibility of computed 
     481        # document HASH values 
     482 
     483        my $rep_limit  = 256;  
     484        my $hash_content = undef; 
     485 
     486        if (length($source_filename)<$rep_limit) { 
     487            my $rep_string = "$source_filename|"; 
     488            my $rs_len = length($rep_string); 
     489 
     490            my $clone_times = int(($rep_limit-1)/$rs_len) +1; 
     491             
     492            $hash_content = substr($rep_string x $clone_times, 0, $rep_limit); 
     493        } 
     494        else { 
     495            $hash_content = $source_filename; 
     496        } 
     497 
     498        my $filename = &util::get_tmp_filename(); 
     499        if (!open (OUTFILE, ">:utf8", $filename)) { 
     500            print STDERR "doc::set_OID could not write to $filename\n"; 
     501        } else { 
     502            print OUTFILE $hash_content; 
     503            close (OUTFILE); 
     504        } 
     505        $OID = $self->_calc_OID ($filename); 
     506 
     507        print STDERR "****!!! the computed hash for: '", $source_filename, "' is: ", $OID,"\n\n"; 
     508 
     509        &util::rm ($filename); 
     510        } 
     511 
    444512        if ($hash_on_file) { 
    445513        # "hash" OID - feed file to hashfile.exe 
     
    450518            $OID = $self->_calc_OID ($filename); 
    451519        } else { 
    452             $hash_on_file = 0; 
     520            $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead 
    453521        } 
    454522        } 
    455         if (!$hash_on_file) { 
     523 
     524        if ($hash_on_ga_xml) { 
     525        # In addition to being asked to explicitly calculate the hash based on the GA file, 
     526        # we can also end up in this block when doing 'hash_on_file' but the file 
     527        # itself is of zero bytes (as could be the case with a 'doc.nul' file) 
     528 
    456529        my $filename = &util::get_tmp_filename(); 
    457530        if (!open (OUTFILE, ">:utf8", $filename)) {