Changeset 27513 for main/trunk


Ignore:
Timestamp:
2013-05-30T10:58:59+12:00 (11 years ago)
Author:
jmt12
Message:

Restoring the original logic around working_info (although still not sure what this does), removing limit of 256 on incremental directory generation (as Kathy points out some MARC collections have thousands of documents all with the same HASH), and had a good go at commenting the function especially the new stuff I've added

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm

    r27511 r27513  
    610610}
    611611
    612 # we now create the directory as part of this call, to try and avoid race
    613 # conditions caused by parallel processing [jmt12]
    614 sub get_new_doc_dir{
    615    my $self = shift (@_); 
    616    my($working_info,$working_dir,$OID) = @_;     
    617    
    618    
    619    my $doc_dir = "";
    620    my $doc_dir_rest = $OID;
    621 
    622    # remove any \ and / from the OID
    623    $doc_dir_rest =~ s/[\\\/]//g;
    624 
    625    # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
    626    $doc_dir_rest =~ s/\://g if ($ENV{'GSDLOS'} =~ /^windows$/i);
    627 
    628    my $doc_dir_num = 0;
    629    my $created_directory = 0;
    630    do {
    631        $doc_dir .= "/" if $doc_dir_num > 0;
    632        my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})';
    633        if ($self->{'subdir_hash_prefix'})
    634        {
    635          $pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})';
    636        }
    637        #if ($doc_dir_rest =~ s/^(.{1,$limit})//) {
    638        if ($doc_dir_rest =~ s/$pattern//i)
    639        {
    640        $doc_dir .= $1;
    641        $doc_dir_num++;
    642        }
    643        my $new_dir = &FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir');
    644        $created_directory = &FileUtils::makeAllDirectories($new_dir);
    645        #rint STDERR "[DEBUG] create directory: $new_dir => $created_directory\n";
    646        #rint STDERR "[DEBUG] rest: $doc_dir_rest\n";
    647        #rint STDERR "[DEBUG] working_info->size(): " . $working_info->size() . " [ < 1024 ?]\n";
    648        #rint STDERR "[DEBUG] doc_dir_num: " . $doc_dir_num . "\n";
    649    } while ($doc_dir_rest ne "" && $created_directory == 0 && $doc_dir_num < 2);
    650    # what is working_info, because it's values seems to start at 1?
    651    # ($working_info->size() >= 1024)
    652    my $i = 1;
    653    my $doc_dir_base = $doc_dir;
    654    while ($created_directory == 0 && $i < 256) {
    655        $doc_dir = "$doc_dir_base-$i";
    656        $created_directory = &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir'));
    657        $i++;
    658    }
    659 
    660    if (!$created_directory)
    661    {
    662      die("Error! Failed to create directory for document: " . $doc_dir_base . "\n");
    663    }
    664 
    665    return "$doc_dir.dir";
    666 }
     612## @function get_new_doc_dir()
     613#
     614# Once a doc object is ready to write to disk (and hence has a nice OID),
     615# generate a unique subdirectory to write the information to.
     616# - create the directory as part of this call, to try and avoid race conditions
     617#   found in parallel processing [jmt12]
     618#
     619sub get_new_doc_dir
     620{
     621  my $self = shift (@_);
     622  my($working_info,$working_dir,$OID) = @_;
     623
     624  my $doc_dir = "";
     625  my $doc_dir_rest = $OID;
     626
     627  # remove any \ and / from the OID
     628  $doc_dir_rest =~ s/[\\\/]//g;
     629
     630  # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
     631  if ($ENV{'GSDLOS'} =~ /^windows$/i)
     632  {
     633    $doc_dir_rest =~ s/\://g;
     634  }
     635
     636  # we generally create a unique directory by adding consequtive fragments of
     637  # the document identifier (split by some predefined length - defaulting to
     638  # 8) until we find a directory that doesn't yet exist. Note that directories
     639  # that contain a document have a suffix ".dir" (whereas those that contain
     640  # only subdirectories have no suffix).
     641  my $doc_dir_num = 0; # how many directories deep we are
     642  my $created_directory = 0; # have we successfully created a new directory
     643  do
     644  {
     645    # (does this work on windows? - jmt12)
     646    if ($doc_dir_num > 0)
     647    {
     648      $doc_dir .= '/';
     649    }
     650    # the default matching pattern grabs the next 'subdir_split_length'
     651    # characters of the OID to act as the next subdirectory
     652    my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})';
     653    # Do we count any "HASH" prefix against the split length limit?
     654    if ($self->{'subdir_hash_prefix'} && $doc_dir_num == 0)
     655    {
     656      $pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})';
     657    }
     658    # Note the use of 's' to both capture the next chuck of OID and to remove
     659    # it from OID at the same time
     660    if ($doc_dir_rest =~ s/$pattern//i)
     661    {
     662      $doc_dir .= $1;
     663      $doc_dir_num++;
     664      $created_directory = &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir'));
     665    }
     666    my $new_dir = &FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir');
     667    ###rint STDERR "[DEBUG] BasePlugout::get_new_doc_dir(<working_info>, $working_dir, $oid)\n";
     668    ###rint STDERR " - create directory: $new_dir => $created_directory\n";
     669    ###rint STDERR " - rest: $doc_dir_rest\n";
     670    ###rint STDERR " - working_info->size(): " . $working_info->size() . " [ < 1024 ?]\n";
     671    ###rint STDERR " - doc_dir_num: " . $doc_dir_num . "\n";
     672  }
     673  while ($doc_dir_rest ne '' && ($created_directory == 0 || ($working_info->size() >= 1024 && $doc_dir_num < 2)));
     674
     675  # not unique yet? Add on an incremental suffix until we are unique
     676  my $i = 1;
     677  my $doc_dir_base = $doc_dir;
     678  while ($created_directory == 0)
     679  {
     680    $doc_dir = $doc_dir_base . '-' . $i;
     681    $created_directory = &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir'));
     682    $i++;
     683  }
     684
     685  # in theory this should never happen
     686  if (!$created_directory)
     687  {
     688    die("Error! Failed to create directory for document: " . $doc_dir_base . "\n");
     689  }
     690
     691  return $doc_dir . '.dir';
     692}
     693## get_new_doc_dir()
     694
    667695
    668696sub process_assoc_files {
Note: See TracChangeset for help on using the changeset viewer.