Changeset 27513

Show
Ignore:
Timestamp:
30.05.2013 10:58:59 (6 years ago)
Author:
jmt12
Message:

Restoring the original logic around working_info (although still not sure what this does), removing limit of 256 on incremental directory generation (as Kathy points out some MARC collections have thousands of documents all with the same HASH), and had a good go at commenting the function especially the new stuff I've added

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm

    r27511 r27513  
    610610} 
    611611 
    612 # we now create the directory as part of this call, to try and avoid race 
    613 # conditions caused by parallel processing [jmt12] 
    614 sub get_new_doc_dir{ 
    615    my $self = shift (@_);   
    616    my($working_info,$working_dir,$OID) = @_;       
    617     
    618     
    619    my $doc_dir = ""; 
    620    my $doc_dir_rest = $OID; 
    621  
    622    # remove any \ and / from the OID 
    623    $doc_dir_rest =~ s/[\\\/]//g; 
    624  
    625    # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters 
    626    $doc_dir_rest =~ s/\://g if ($ENV{'GSDLOS'} =~ /^windows$/i); 
    627  
    628    my $doc_dir_num = 0; 
    629    my $created_directory = 0; 
    630    do { 
    631        $doc_dir .= "/" if $doc_dir_num > 0; 
    632        my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})'; 
    633        if ($self->{'subdir_hash_prefix'}) 
    634        { 
    635          $pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})'; 
    636        } 
    637        #if ($doc_dir_rest =~ s/^(.{1,$limit})//) { 
    638        if ($doc_dir_rest =~ s/$pattern//i) 
    639        { 
    640        $doc_dir .= $1; 
    641        $doc_dir_num++; 
    642        } 
    643        my $new_dir = &FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir'); 
    644        $created_directory = &FileUtils::makeAllDirectories($new_dir); 
    645        #rint STDERR "[DEBUG] create directory: $new_dir => $created_directory\n"; 
    646        #rint STDERR "[DEBUG] rest: $doc_dir_rest\n"; 
    647        #rint STDERR "[DEBUG] working_info->size(): " . $working_info->size() . " [ < 1024 ?]\n"; 
    648        #rint STDERR "[DEBUG] doc_dir_num: " . $doc_dir_num . "\n"; 
    649    } while ($doc_dir_rest ne "" && $created_directory == 0 && $doc_dir_num < 2); 
    650    # what is working_info, because it's values seems to start at 1? 
    651    # ($working_info->size() >= 1024) 
    652    my $i = 1; 
    653    my $doc_dir_base = $doc_dir; 
    654    while ($created_directory == 0 && $i < 256) { 
    655        $doc_dir = "$doc_dir_base-$i"; 
    656        $created_directory = &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir')); 
    657        $i++; 
    658    } 
    659  
    660    if (!$created_directory) 
    661    { 
    662      die("Error! Failed to create directory for document: " . $doc_dir_base . "\n"); 
    663    } 
    664  
    665    return "$doc_dir.dir"; 
    666 } 
     612## @function get_new_doc_dir() 
     613# 
     614# Once a doc object is ready to write to disk (and hence has a nice OID), 
     615# generate a unique subdirectory to write the information to. 
     616# - create the directory as part of this call, to try and avoid race conditions 
     617#   found in parallel processing [jmt12] 
     618# 
     619sub get_new_doc_dir 
     620{ 
     621  my $self = shift (@_); 
     622  my($working_info,$working_dir,$OID) = @_; 
     623 
     624  my $doc_dir = ""; 
     625  my $doc_dir_rest = $OID; 
     626 
     627  # remove any \ and / from the OID 
     628  $doc_dir_rest =~ s/[\\\/]//g; 
     629 
     630  # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters 
     631  if ($ENV{'GSDLOS'} =~ /^windows$/i) 
     632  { 
     633    $doc_dir_rest =~ s/\://g; 
     634  } 
     635 
     636  # we generally create a unique directory by adding consequtive fragments of 
     637  # the document identifier (split by some predefined length - defaulting to 
     638  # 8) until we find a directory that doesn't yet exist. Note that directories 
     639  # that contain a document have a suffix ".dir" (whereas those that contain 
     640  # only subdirectories have no suffix). 
     641  my $doc_dir_num = 0; # how many directories deep we are 
     642  my $created_directory = 0; # have we successfully created a new directory 
     643  do 
     644  { 
     645    # (does this work on windows? - jmt12) 
     646    if ($doc_dir_num > 0) 
     647    { 
     648      $doc_dir .= '/'; 
     649    } 
     650    # the default matching pattern grabs the next 'subdir_split_length' 
     651    # characters of the OID to act as the next subdirectory 
     652    my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})'; 
     653    # Do we count any "HASH" prefix against the split length limit? 
     654    if ($self->{'subdir_hash_prefix'} && $doc_dir_num == 0) 
     655    { 
     656      $pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})'; 
     657    } 
     658    # Note the use of 's' to both capture the next chuck of OID and to remove 
     659    # it from OID at the same time 
     660    if ($doc_dir_rest =~ s/$pattern//i) 
     661    { 
     662      $doc_dir .= $1; 
     663      $doc_dir_num++; 
     664      $created_directory = &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir')); 
     665    } 
     666    my $new_dir = &FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir'); 
     667    ###rint STDERR "[DEBUG] BasePlugout::get_new_doc_dir(<working_info>, $working_dir, $oid)\n"; 
     668    ###rint STDERR " - create directory: $new_dir => $created_directory\n"; 
     669    ###rint STDERR " - rest: $doc_dir_rest\n"; 
     670    ###rint STDERR " - working_info->size(): " . $working_info->size() . " [ < 1024 ?]\n"; 
     671    ###rint STDERR " - doc_dir_num: " . $doc_dir_num . "\n"; 
     672  } 
     673  while ($doc_dir_rest ne '' && ($created_directory == 0 || ($working_info->size() >= 1024 && $doc_dir_num < 2))); 
     674 
     675  # not unique yet? Add on an incremental suffix until we are unique 
     676  my $i = 1; 
     677  my $doc_dir_base = $doc_dir; 
     678  while ($created_directory == 0) 
     679  { 
     680    $doc_dir = $doc_dir_base . '-' . $i; 
     681    $created_directory = &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir . '.dir')); 
     682    $i++; 
     683  } 
     684 
     685  # in theory this should never happen 
     686  if (!$created_directory) 
     687  { 
     688    die("Error! Failed to create directory for document: " . $doc_dir_base . "\n"); 
     689  } 
     690 
     691  return $doc_dir . '.dir'; 
     692} 
     693## get_new_doc_dir() 
     694 
    667695 
    668696sub process_assoc_files {