Ignore:
Timestamp:
2012-11-28T11:59:17+13:00 (11 years ago)
Author:
davidb
Message:

Introduction of two new OIDtype values (hash_on_full_filename and full_filename) designed to help provide more stable document IDs for collections that are rebuilt over time, including when they are rebuilt after the Greenstone install has been upgraded

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/doc.pm

    r26221 r26536  
    204204    my ($type, $metadata) = @_;
    205205
    206     if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|filename|dirname|assigned)$/) {
     206    if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
    207207    $self->{'OIDtype'} = $type;
    208208    } else {
     
    404404        my $filename = $self->get_source_filename();
    405405        $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
     406        $OID = &util::tidy_up_oid($OID);
     407    } elsif ($self->{'OIDtype'} eq "full_filename") {
     408        my $source_filename = $self->get_source_filename();
     409        my $dirsep = &util::get_os_dirsep();
     410
     411        $source_filename =~ s/^import$dirsep//;
     412        $source_filename =~ s/$dirsep/-/g;
     413        $source_filename =~ s/\./_/g;
     414
     415        $OID = $source_filename;
    406416        $OID = &util::tidy_up_oid($OID);
    407417    } elsif ($self->{'OIDtype'} eq "dirname") {
     
    439449    if ($use_hash_oid) {
    440450        my $hash_on_file = 1;
     451        my $hash_on_ga_xml = 0;
     452
    441453        if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
    442454        $hash_on_file = 0;
     455        $hash_on_ga_xml = 1;
    443456        }
     457
     458        if ($self->{'OIDtype'} eq "hash_on_full_filename") {
     459        $hash_on_file = 0;
     460        $hash_on_ga_xml = 0;
     461
     462        my $source_filename = $self->get_source_filename();
     463        my $dirsep = &util::get_os_dirsep();
     464
     465        $source_filename =~ s/^import$dirsep//;
     466        $source_filename =~ s/$dirsep/-/g;
     467        $source_filename =~ s/\./_/g;
     468       
     469        # If the filename is very short then (handled naively)
     470        # this can cause congestion in the hash values
     471        # computed, leading to documents sharing the same leading
     472        # Hex values in the computed hash.
     473        #
     474        # The solution taken here is to repeat the file name
     475        # (followed by a '|' separator) a sufficient number of
     476        # times to reach the character limit defined in
     477        # 'rep_limit', and make that the content that is hashed on
     478
     479        # *** Think twice before changing the following value
     480        # as it will break backward compatibility of computed
     481        # document HASH values
     482
     483        my $rep_limit  = 256;
     484        my $hash_content = undef;
     485
     486        if (length($source_filename)<$rep_limit) {
     487            my $rep_string = "$source_filename|";
     488            my $rs_len = length($rep_string);
     489
     490            my $clone_times = int(($rep_limit-1)/$rs_len) +1;
     491           
     492            $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
     493        }
     494        else {
     495            $hash_content = $source_filename;
     496        }
     497
     498        my $filename = &util::get_tmp_filename();
     499        if (!open (OUTFILE, ">:utf8", $filename)) {
     500            print STDERR "doc::set_OID could not write to $filename\n";
     501        } else {
     502            print OUTFILE $hash_content;
     503            close (OUTFILE);
     504        }
     505        $OID = $self->_calc_OID ($filename);
     506
     507        print STDERR "****!!! the computed hash for: '", $source_filename, "' is: ", $OID,"\n\n";
     508
     509        &util::rm ($filename);
     510        }
     511
    444512        if ($hash_on_file) {
    445513        # "hash" OID - feed file to hashfile.exe
     
    450518            $OID = $self->_calc_OID ($filename);
    451519        } else {
    452             $hash_on_file = 0;
     520            $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
    453521        }
    454522        }
    455         if (!$hash_on_file) {
     523
     524        if ($hash_on_ga_xml) {
     525        # In addition to being asked explicitly to calculate the hash based on the GA file,
     526        # we can also end up in this block when doing 'hash_on_file' but the file
     527        # itself is of zero bytes (as could be the case with a 'doc.nul' file)
     528
    456529        my $filename = &util::get_tmp_filename();
    457530        if (!open (OUTFILE, ">:utf8", $filename)) {
Note: See TracChangeset for help on using the changeset viewer.