Changeset 26536


Ignore:
Timestamp:
2012-11-28T11:59:17+13:00 (11 years ago)
Author:
davidb
Message:

Introduction of two new OIDtype values (hash_on_full_filename and full_filename) designed to help provide more stable document IDs for collections that are rebuilt over time, including when they are rebuilt after the Greenstone install has been upgraded

Location:
main/trunk/greenstone2
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/bin/script/import.pl

    r23372 r26536  
    7373    [ { 'name' => "hash",
    7474        'desc' => "{import.OIDtype.hash}" },
     75      { 'name' => "hash_on_full_filename",
     76        'desc' => "{import.OIDtype.hash_on_full_filename}" },
    7577      { 'name' => "assigned",
    7678        'desc' => "{import.OIDtype.assigned}" },
     
    7880        'desc' => "{import.OIDtype.incremental}" },
    7981      { 'name' => "dirname",
    80         'desc' => "{import.OIDtype.dirname}" } ];
     82        'desc' => "{import.OIDtype.dirname}" },
     83      { 'name' => "full_filename",
     84        'desc' => "{import.OIDtype.full_filename}" } ];
    8185
    8286
  • main/trunk/greenstone2/perllib/doc.pm

    r26221 r26536  
    204204    my ($type, $metadata) = @_;
    205205
    206     if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|filename|dirname|assigned)$/) {
     206    if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
    207207    $self->{'OIDtype'} = $type;
    208208    } else {
     
    404404        my $filename = $self->get_source_filename();
    405405        $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
     406        $OID = &util::tidy_up_oid($OID);
     407    } elsif ($self->{'OIDtype'} eq "full_filename") {
     408        my $source_filename = $self->get_source_filename();
     409        my $dirsep = &util::get_os_dirsep();
     410
     411        $source_filename =~ s/^import$dirsep//;
     412        $source_filename =~ s/$dirsep/-/g;
     413        $source_filename =~ s/\./_/g;
     414
     415        $OID = $source_filename;
    406416        $OID = &util::tidy_up_oid($OID);
    407417    } elsif ($self->{'OIDtype'} eq "dirname") {
     
    439449    if ($use_hash_oid) {
    440450        my $hash_on_file = 1;
     451        my $hash_on_ga_xml = 0;
     452
    441453        if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
    442454        $hash_on_file = 0;
     455        $hash_on_ga_xml = 1;
    443456        }
     457
     458        if ($self->{'OIDtype'} eq "hash_on_full_filename") {
     459        $hash_on_file = 0;
     460        $hash_on_ga_xml = 0;
     461
     462        my $source_filename = $self->get_source_filename();
     463        my $dirsep = &util::get_os_dirsep();
     464
     465        $source_filename =~ s/^import$dirsep//;
     466        $source_filename =~ s/$dirsep/-/g;
     467        $source_filename =~ s/\./_/g;
     468       
     469        # If the filename is very short then (handled naively)
     470        # this can cause congestion in the hash values
     471        # computed, leading to documents sharing the same leading
     472        # hex values in the computed hash.
     473        #
     474        # The solution taken here is to repeat the name of
     475        # the file a sufficient number of times (up to
     476        # the character limit defined in 'rep_limit') and
     477        # make that the content that is hashed on
     478
     479        # *** Think twice before changing the following value
     480        # as it will break backward compatibility of computed
     481        # document HASH values
     482
     483        my $rep_limit  = 256;
     484        my $hash_content = undef;
     485
     486        if (length($source_filename)<$rep_limit) {
     487            my $rep_string = "$source_filename|";
     488            my $rs_len = length($rep_string);
     489
     490            my $clone_times = int(($rep_limit-1)/$rs_len) +1;
     491           
     492            $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
     493        }
     494        else {
     495            $hash_content = $source_filename;
     496        }
     497
     498        my $filename = &util::get_tmp_filename();
     499        if (!open (OUTFILE, ">:utf8", $filename)) {
     500            print STDERR "doc::set_OID could not write to $filename\n";
     501        } else {
     502            print OUTFILE $hash_content;
     503            close (OUTFILE);
     504        }
     505        $OID = $self->_calc_OID ($filename);
     506
     507        print STDERR "****!!! the computed hash for: '", $source_filename, "' is: ", $OID,"\n\n";
     508
     509        &util::rm ($filename);
     510        }
     511
    444512        if ($hash_on_file) {
    445513        # "hash" OID - feed file to hashfile.exe
     
    450518            $OID = $self->_calc_OID ($filename);
    451519        } else {
    452             $hash_on_file = 0;
     520            $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
    453521        }
    454522        }
    455         if (!$hash_on_file) {
     523
     524        if ($hash_on_ga_xml) {
     525        # In addition to being asked explicitly to calculate the hash based on the GA file,
     526        # we can also end up in this block when doing 'hash_on_file' but the file
     527        # itself is of zero bytes (as could be the case with a 'doc.nul' file)
     528
    456529        my $filename = &util::get_tmp_filename();
    457530        if (!open (OUTFILE, ">:utf8", $filename)) {
  • main/trunk/greenstone2/perllib/docproc.pm

    r17747 r26536  
    5454    my ($type, $metadata) = @_;
    5555
    56     if ($type =~ /^(hash|incremental|dirname|assigned)$/) {
     56    if ($type =~ /^(hash|hash_on_full_filename|incremental|dirname|full_filename|assigned)$/) {
    5757    $self->{'OIDtype'} = $type;
    5858    } else {
  • main/trunk/greenstone2/perllib/inexport.pm

    r26451 r26536  
    297297
    298298    if (!defined $self->{'OIDtype'}
    299     || ($self->{'OIDtype'} !~ /^(hash|incremental|assigned|dirname)$/ )) {
     299    || ($self->{'OIDtype'} !~ /^(hash|hash_on_full_filename|incremental|assigned|dirname|full_filename)$/ )) {
     300    # OIDtype was either not defined on the command-line, or it was not one of the recognized values
    300301    if (defined $collectcfg->{'OIDtype'}
    301         && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
     302        && $collectcfg->{'OIDtype'} =~ /^(hash|hash_on_full_filename|incremental|assigned|dirname|full_filename)$/) {
    302303        $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
    303304    } else {
     
    548549    $processor->setoutputdir ($archivedir);
    549550    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
     551
    550552    $processor->set_OIDtype ($OIDtype, $OIDmetadata);
    551553   
  • main/trunk/greenstone2/perllib/plugins/BasePlugin.pm

    r26221 r26536  
    9797      { 'name' => "hash_on_ga_xml",
    9898        'desc' => "{import.OIDtype.hash_on_ga_xml}" },
     99      { 'name' => "hash_on_full_filename",
     100        'desc' => "{import.OIDtype.hash_on_full_filename}" },
    99101      { 'name' => "assigned",
    100102        'desc' => "{import.OIDtype.assigned}" },
     
    104106        'desc' => "{import.OIDtype.filename}" },
    105107      { 'name' => "dirname",
    106         'desc' => "{import.OIDtype.dirname}" } ];
     108        'desc' => "{import.OIDtype.dirname}" },
     109      { 'name' => "full_filename",
     110        'desc' => "{import.OIDtype.full_filename}" } ];
    107111
    108112my $arguments =
  • main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm

    r24958 r26536  
    292292    my ($type, $metadata) = @_;
    293293
    294     if ($type =~ /^(hash|incremental|dirname|assigned)$/) {
     294    if ($type =~ /^(hash|hash_on_full_filename|incremental|dirname|full_filename|assigned)$/) {
    295295    $self->{'OIDtype'} = $type;
    296296    } else {
  • main/trunk/greenstone2/perllib/strings.properties

    r26268 r26536  
    279279import.OIDtype.hash:Hash the contents of the file. Document identifiers will be the same every time the collection is imported.
    280280import.OIDtype.hash_on_ga_xml:Hash the contents of the Greenstone Archive XML file. Document identifiers will be the same every time the collection is imported as long as the metadata does not change.
     281import.OIDtype.hash_on_full_filename:Hash on the full filename to the document within the 'import' folder (and not its contents).  Helps make document identifiers more stable across upgrades of the software, although it means that duplicate documents contained in the collection are no longer detected automatically.
    281282
    282283import.OIDtype.incremental:Use a simple document count. Significantly faster than "hash", but does not necessarily assign the same identifier to the same document content if the collection is reimported.
     
    285286
    286287import.OIDtype.dirname:Use the parent directory name (preceded by 'J'). There should only be one document per directory, and directory names should be unique. E.g. import/b13as/h15ef/page.html will get an identifier of Jh15ef.
     288
     289import.OIDtype.filename:Use the tail file name.  Requires every filename across all the folders within 'import' to be unique.
     290
     291import.OIDtype.full_filename:Use the full file name within the 'import' folder as the identifier for the document (with _ and - substitutions made for symbols such as directory separators and the fullstop in a filename extension)
    287292
    288293import.OIDmetadata:Specifies the metadata element that holds the document's unique identifier, for use with -OIDtype=assigned.
Note: See TracChangeset for help on using the changeset viewer.