Changeset 26536

Show
Ignore:
Timestamp:
28.11.2012 11:59:17 (7 years ago)
Author:
davidb
Message:

Introduction of two new OIDtype values (hash_on_full_filename and full_filename) designed to help provide more stable document IDs for collections that are rebuilt over time, including rebuilt after the Greenstone install has been upgraded

Location:
main/trunk/greenstone2
Files:
7 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/bin/script/import.pl

    r23372 r26536  
    7373    [ { 'name' => "hash", 
    7474        'desc' => "{import.OIDtype.hash}" }, 
     75      { 'name' => "hash_on_full_filename", 
     76        'desc' => "{import.OIDtype.hash_on_full_filename}" }, 
    7577      { 'name' => "assigned", 
    7678        'desc' => "{import.OIDtype.assigned}" }, 
     
    7880        'desc' => "{import.OIDtype.incremental}" }, 
    7981      { 'name' => "dirname", 
    80         'desc' => "{import.OIDtype.dirname}" } ]; 
     82        'desc' => "{import.OIDtype.dirname}" }, 
     83      { 'name' => "full_filename", 
     84        'desc' => "{import.OIDtype.full_filename}" } ]; 
    8185 
    8286 
  • main/trunk/greenstone2/perllib/doc.pm

    r26221 r26536  
    204204    my ($type, $metadata) = @_; 
    205205 
    206     if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|filename|dirname|assigned)$/) { 
     206    if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) { 
    207207    $self->{'OIDtype'} = $type; 
    208208    } else { 
     
    404404        my $filename = $self->get_source_filename(); 
    405405        $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/); 
     406        $OID = &util::tidy_up_oid($OID); 
     407    } elsif ($self->{'OIDtype'} eq "full_filename") { 
     408        my $source_filename = $self->get_source_filename(); 
     409        my $dirsep = &util::get_os_dirsep(); 
     410 
     411        $source_filename =~ s/^import$dirsep//; 
     412        $source_filename =~ s/$dirsep/-/g; 
     413        $source_filename =~ s/\./_/g; 
     414 
     415        $OID = $source_filename; 
    406416        $OID = &util::tidy_up_oid($OID); 
    407417    } elsif ($self->{'OIDtype'} eq "dirname") { 
     
    439449    if ($use_hash_oid) { 
    440450        my $hash_on_file = 1;  
     451        my $hash_on_ga_xml = 0; 
     452 
    441453        if ($self->{'OIDtype'} eq "hash_on_ga_xml") { 
    442454        $hash_on_file = 0; 
     455        $hash_on_ga_xml = 1; 
    443456        } 
     457 
     458        if ($self->{'OIDtype'} eq "hash_on_full_filename") { 
     459        $hash_on_file = 0; 
     460        $hash_on_ga_xml = 0; 
     461 
     462        my $source_filename = $self->get_source_filename(); 
     463        my $dirsep = &util::get_os_dirsep(); 
     464 
     465        $source_filename =~ s/^import$dirsep//; 
     466        $source_filename =~ s/$dirsep/-/g; 
     467        $source_filename =~ s/\./_/g; 
     468         
     469        # If the filename is very short then (handled naively) 
     470        # this can cause congestion in the hash-values 
     471        # computed, leading to documents sharing the same leading 
     472        # Hex values in the computed hash. 
     473        # 
     474        # The solution taken here is to repeat the name of 
     475        # the file a sufficient number of times (up to 
     476        # the character limit defined in 'rep_limit') and 
     477        # make that the content that is hashed on 
     478 
     479        # *** Think twice before changing the following value 
     480        # as it will break backward compatibility of computed 
     481        # document HASH values 
     482 
     483        my $rep_limit  = 256;  
     484        my $hash_content = undef; 
     485 
     486        if (length($source_filename)<$rep_limit) { 
     487            my $rep_string = "$source_filename|"; 
     488            my $rs_len = length($rep_string); 
     489 
     490            my $clone_times = int(($rep_limit-1)/$rs_len) +1; 
     491             
     492            $hash_content = substr($rep_string x $clone_times, 0, $rep_limit); 
     493        } 
     494        else { 
     495            $hash_content = $source_filename; 
     496        } 
     497 
     498        my $filename = &util::get_tmp_filename(); 
     499        if (!open (OUTFILE, ">:utf8", $filename)) { 
     500            print STDERR "doc::set_OID could not write to $filename\n"; 
     501        } else { 
     502            print OUTFILE $hash_content; 
     503            close (OUTFILE); 
     504        } 
     505        $OID = $self->_calc_OID ($filename); 
     506 
     507        print STDERR "****!!! the computed hash for: '", $source_filename, "' is: ", $OID,"\n\n"; 
     508 
     509        &util::rm ($filename); 
     510        } 
     511 
    444512        if ($hash_on_file) { 
    445513        # "hash" OID - feed file to hashfile.exe 
     
    450518            $OID = $self->_calc_OID ($filename); 
    451519        } else { 
    452             $hash_on_file = 0; 
     520            $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead 
    453521        } 
    454522        } 
    455         if (!$hash_on_file) { 
     523 
     524        if ($hash_on_ga_xml) { 
     525        # In addition to being asked to explicitly calculate the hash based on the GA file, 
     526        # can also end up coming into this block if doing 'hash_on_file' but the file 
     527        # itself is of zero bytes (as could be the case with a 'doc.nul' file) 
     528 
    456529        my $filename = &util::get_tmp_filename(); 
    457530        if (!open (OUTFILE, ">:utf8", $filename)) { 
  • main/trunk/greenstone2/perllib/docproc.pm

    r17747 r26536  
    5454    my ($type, $metadata) = @_; 
    5555 
    56     if ($type =~ /^(hash|incremental|dirname|assigned)$/) { 
     56    if ($type =~ /^(hash|hash_on_full_filename|incremental|dirname|full_filename|assigned)$/) { 
    5757    $self->{'OIDtype'} = $type; 
    5858    } else { 
  • main/trunk/greenstone2/perllib/inexport.pm

    r26451 r26536  
    297297 
    298298    if (!defined $self->{'OIDtype'}  
    299     || ($self->{'OIDtype'} !~ /^(hash|incremental|assigned|dirname)$/ )) { 
     299    || ($self->{'OIDtype'} !~ /^(hash|hash_on_full_filename|incremental|assigned|dirname|full_filename)$/ )) { 
     300    # OIDtype was either not defined on the command-line, or if it was not one of the recognized values 
    300301    if (defined $collectcfg->{'OIDtype'}  
    301         && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) { 
     302        && $collectcfg->{'OIDtype'} =~ /^(hash|hash_on_full_filename|incremental|assigned|dirname|full_filename)$/) { 
    302303        $self->{'OIDtype'} = $collectcfg->{'OIDtype'}; 
    303304    } else { 
     
    548549    $processor->setoutputdir ($archivedir); 
    549550    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta; 
     551 
    550552    $processor->set_OIDtype ($OIDtype, $OIDmetadata); 
    551553     
  • main/trunk/greenstone2/perllib/plugins/BasePlugin.pm

    r26221 r26536  
    9797      { 'name' => "hash_on_ga_xml", 
    9898        'desc' => "{import.OIDtype.hash_on_ga_xml}" }, 
     99      { 'name' => "hash_on_full_filename", 
     100        'desc' => "{import.OIDtype.hash_on_full_filename}" }, 
    99101      { 'name' => "assigned", 
    100102        'desc' => "{import.OIDtype.assigned}" }, 
     
    104106        'desc' => "{import.OIDtype.filename}" }, 
    105107      { 'name' => "dirname", 
    106         'desc' => "{import.OIDtype.dirname}" } ]; 
     108        'desc' => "{import.OIDtype.dirname}" }, 
     109      { 'name' => "full_filename", 
     110        'desc' => "{import.OIDtype.full_filename}" } ]; 
    107111 
    108112my $arguments = 
  • main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm

    r24958 r26536  
    292292    my ($type, $metadata) = @_; 
    293293 
    294     if ($type =~ /^(hash|incremental|dirname|assigned)$/) { 
     294    if ($type =~ /^(hash|hash_on_full_filename|incremental|dirname|full_filename|assigned)$/) { 
    295295    $self->{'OIDtype'} = $type; 
    296296    } else { 
  • main/trunk/greenstone2/perllib/strings.properties

    r26268 r26536  
    279279import.OIDtype.hash:Hash the contents of the file. Document identifiers will be the same every time the collection is imported. 
    280280import.OIDtype.hash_on_ga_xml:Hash the contents of the Greenstone Archive XML file. Document identifiers will be the same every time the collection is imported as long as the metadata does not change. 
     281import.OIDtype.hash_on_full_filename:Hash on the full filename to the document within the 'import' folder (and not its contents).  Helps make document identifiers more stable across upgrades of the software, although it means that duplicate documents contained in the collection are no longer detected automatically. 
    281282 
    282283import.OIDtype.incremental:Use a simple document count. Significantly faster than "hash", but does not necessarily assign the same identifier to the same document content if the collection is reimported. 
     
    285286 
    286287import.OIDtype.dirname:Use the parent directory name (preceded by 'J'). There should only be one document per directory, and directory names should be unique. E.g. import/b13as/h15ef/page.html will get an identifier of Jh15ef. 
     288 
     289import.OIDtype.filename:Use the tail file name.  Requires every filename across all the folders within 'import' to be unique. 
     290 
     291import.OIDtype.full_filename:Use the full file name within the 'import' folder as the identifier for the document (with _ and - substitutions made for symbols such as directory separators and the fullstop in a filename extension) 
    287292 
    288293import.OIDmetadata:Specifies the metadata element that holds the document's unique identifier, for use with -OIDtype=assigned.