Changeset 17087 for gsdl/trunk/perllib


Ignore:
Timestamp:
2008-08-29T13:10:39+12:00 (16 years ago)
Author:
davidb
Message:

Introduction of new GDBM alternative for archives.inf as step towards full incremental building. Information traditionally stored in archives.inf PLUS additional information that will help with working out what files have changed since last build, and what doc-id they hashed to is stored in two GDBM databases. For now these databases aren't read, but in the future ArchivesInfPlugin will be upgraded to use these to support these.

Location:
gsdl/trunk/perllib
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/ClassifyTreeModel.pm

    r15894 r17087  
    120120    # Unfortunately I have to check that there is text to retrieve before I
    121121    # create a new node.
    122     if(&GDBMUtils::gdbmGet($self->getCollection(), $clid) =~ /\w+/)
     122    if(&GDBMUtils::gdbmCachedCollectionGet($self->getCollection(), $clid) =~ /\w+/)
    123123      {
    124124        # Since the CLID can directly reference the correct entry in the GDBM we
  • gsdl/trunk/perllib/ClassifyTreeNode.pm

    r15890 r17087  
    2626    $self->{'model'} = $model;
    2727    $self->{'clid'} = $clid;
     28
     29    my $collection = $model->getCollection();
     30
    2831    # Check if this node already exists in the database, and if not insert it
    2932    # now
    30     my $text = &GDBMUtils::gdbmGet($model->getCollection(), $clid);
     33    my $text = &GDBMUtils::gdbmCachedCollectionGet($collection, $clid);
    3134    if($text !~ /\w+/ && $force_new)
    3235      {
    33         &GDBMUtils::gdbmSet($model->getCollection(), $clid,
     36        &GDBMUtils::gdbmCachedCollectionSet($collection, $clid,
    3437                            "<doctype>classify\n<hastxt>0\n<childtype>VList\n<Title>\n<numleafdocs>0\n<contains>\n");
    3538      }
     
    108111    # CLID
    109112    my @child_nodes = $self->getChildren();
     113
    110114    # Retrieve the current document
    111115    my $text = $self->toString();
     116
     117    my $collection = $self->{'model'}->getCollection();
     118
    112119    # Create a new document with the correct CLID
    113     &GDBMUtils::gdbmSet($self->{'model'}->getCollection(), $new_clid, $text);
     120    &GDBMUtils::gdbmCachedCollectionSet($collection, $new_clid, $text);
     121
    114122    # Remove the old document
    115     &GDBMUtils::gdbmSet($self->{'model'}->getCollection(), $self->{'clid'});
     123    &GDBMUtils::gdbmCachedCollectionSet($collection, $self->{'clid'});
     124
    116125    # Finally, change the clid stored in this document
    117126    $self->{'clid'} = $new_clid;
     127
    118128    # Now go through this nodes children, and shift them too
    119129    foreach my $child_node (@child_nodes)
     
    221231    push(@clid_parts, $suffix);
    222232    my $next_clid = join(".", @clid_parts);
     233
     234    my $collection = $self->{'model'}->getCollection();
     235
    223236    # Now determine if this node exists.
    224     if(&GDBMUtils::gdbmGet($self->{'model'}->getCollection(), $next_clid) =~ /\w+/)
     237    if(&GDBMUtils::gdbmCachedCollectionGet($collection, $next_clid) =~ /\w+/)
    225238      {
    226239        # And if so, create it.
     
    474487    # Now remove the node from the database. We do this calling set gdbm with
    475488    # no value argument.
    476     &GDBMUtils::gdbmSet($self->{'model'}->getCollection(), $self->{'clid'});
     489    my $collection = $self->{'model'}->getCollection();
     490    &GDBMUtils::gdbmCachedCollectionSet($collection, $self->{'clid'});
     491
    477492    # Return the leaf count (so we can adjust the numleafdocs at the root node
    478493    # of this deletion.
     
    605620    # Replace any occurance of this nodes CLID with "
    606621    $contains =~ s/$self->{'clid'}/\"/g;
     622
     623    my $collection = $self->{'model'}->getCollection();
     624    my $clid = $self->{'clid'};
     625
    607626    # Load the text of this node
    608     my $text = &GDBMUtils::gdbmGet($self->{'model'}->getCollection(), $self->{'clid'});
     627    my $text = &GDBMUtils::gdbmCachedCollectionGet($collection, $clid);
     628
    609629    # Replace the contains
    610630    #rint STDERR "Before: $text\n";
     
    612632    #rint STDERR "After:  $text\n";
    613633    # Store the changed text
    614     &GDBMUtils::gdbmSet($self->{'model'}->getCollection(), $self->{'clid'}, $text);
     634    &GDBMUtils::gdbmCachedCollectionSet($collection, $clid, $text);
    615635  }
    616636# /** setContains() **/
     
    626646    my ($self, $numleafdocs) = @_;
    627647    print STDERR "ClassifyTreeNode::setNumLeafDocs(numleafdocs)\n" unless !$self->{'debug'};
     648
     649    my $collection = $self->{'model'}->getCollection();
     650    my $clid = $self->{'clid'};
     651
    628652    # Load the text of this node
    629     my $text = &GDBMUtils::gdbmGet($self->{'model'}->getCollection(), $self->{'clid'});
     653    my $text = &GDBMUtils::gdbmCachedCollectionGet($collection, $clid);
    630654    # Replace the numleafdocs
    631655    $text =~ s/<numleafdocs>\d*?\n+/<numleafdocs>$numleafdocs\n/;
    632656    # Store the changed text
    633     &GDBMUtils::gdbmSet($self->{'model'}->getCollection(), $self->{'clid'}, $text);
     657    &GDBMUtils::gdbmCachedCollectionSet($collection, $clid, $text);
    634658  }
    635659# /** setNumLeafDocs() **/
     
    647671    my ($self, $title) = @_;
    648672    print STDERR "ClassifyTreeNode::setTitle(\"$title\")\n" unless !$self->{'debug'};
     673
     674    my $collection = $self->{'model'}->getCollection();
     675    my $clid = $self->{'clid'};
     676
    649677    # Load the text of this node
    650     my $text = &GDBMUtils::gdbmGet($self->{'model'}->getCollection(), $self->{'clid'});
     678    my $text = &GDBMUtils::gdbmCachedCollectionGet($collection, $clid);
    651679    # Replace the title
    652680    $text =~ s/<Title>.*?\n+/<Title>$title\n/;
    653681    # Store the changed text
    654     &GDBMUtils::gdbmSet($self->{'model'}->getCollection(), $self->{'clid'}, $text);
     682    &GDBMUtils::gdbmCachedCollectionSet($collection, $clid, $text);
    655683  }
    656684# /** setValue() **/
     
    666694    my ($self) = @_;
    667695    print STDERR "ClassifyTreeNode::toString()\n" unless !$self->{'debug'};
    668     my $text = &GDBMUtils::gdbmGet($self->{'model'}->getCollection(), $self->{'clid'});
     696    my $collection = $self->{'model'}->getCollection();
     697    my $clid = $self->{'clid'};
     698
     699    my $text = &GDBMUtils::gdbmCachedCollectionGet($collection, $clid);
    669700    return $text;
    670701  }
  • gsdl/trunk/perllib/GDBMUtils.pm

    r15890 r17087  
    66my $debug = 0;
    77
    8 # /** Global variable to hold a string containing the last collection a gdbmGet
    9 #  *  was performed on.
     8# /** Global variables to hold a strings containing:
     9#  *    the last collection, oid and value
     10#  *  a gdbmCachedCollectionGet() was performed on.
    1011#  */
    1112my $gdbmget_previous_collection = "";
    12 # /** Global variable to hold a string containing the last oid a gdbmGet was
    13 #  *  performed on.
    14 #  */
    1513my $gdbmget_previous_oid = "";
    16 # /** Global variable to hold a string containing the resulting value of the
    17 #  *  last gdbmGet request.
    18 #  */
    1914my $gdbmget_previous_value = "";
     15
     16
     17
     18sub gdbmDatabaseGet
     19  {
     20    my ($database, $oid) = @_;
     21
     22    # Are we in windows? Do we need .exe?
     23    my $exe = &util::get_os_exe();
     24
     25    # Retrieve the raw document content
     26    print STDERR "#Get document\ncmd: gdbmget$exe \"$database\" \"$oid\"\n" if $debug;
     27    my $value = `gdbmget$exe "$database" "$oid"`;
     28
     29    # Done
     30    return $value;
     31  }
     32
     33sub gdbmDatabaseAppend
     34  {
     35    my ($database, $oid, $value) = @_;
     36
     37    # Are we in windows? Do we need .exe?
     38    my $exe = &util::get_os_exe();
     39
     40    # Escape any speech marks in the value
     41    $value =~ s/\"/\\\"/g;
     42    # Set the document content
     43    print STDERR "#Set document\ncmd: gdbmset$exe \"$database\" \"$oid\" \"$value\" append\n" if $debug;
     44    `gdbmset$exe "$database" "$oid" "$value" append`;
     45}
     46
     47
     48sub gdbmDatabaseSet
     49  {
     50    my ($database, $oid, $value) = @_;
     51
     52    # Are we in windows? Do we need .exe?
     53    my $exe = &util::get_os_exe();
     54
     55    # Escape any speech marks in the value
     56    $value =~ s/\"/\\\"/g;
     57    # Set the document content
     58    print STDERR "#Set document\ncmd: gdbmset$exe \"$database\" \"$oid\" \"$value\"\n" if $debug;
     59    `gdbmset$exe "$database" "$oid" "$value"`;
     60}
     61
     62
     63sub gdbmDatabaseRemove
     64  {
     65    my ($database, $oid) = @_;
     66
     67    # Are we in windows? Do we need .exe?
     68    my $exe = &util::get_os_exe();
     69
     70    # Remove the document from the database
     71    print STDERR "#Set document\ncmd: gdbmset$exe \"$database\" \"$oid\"\n" if $debug;
     72
     73    # Think it would be clearer if this funcctionality was done
     74    # by a separate executable, e.g. gdbmremove
     75    `gdbmset$exe "$database" "$oid"`;
     76}
     77
     78
    2079
    2180# /** This wraps John T's gdbmget executable to get the gdbm database entry for
     
    2988#  *  @author John Thompson, DL Consulting Ltd.
    3089#  */
    31 sub gdbmGet()
     90sub gdbmCachedCollectionGet
    3291  {
    3392    my ($collection, $oid) = @_;
     
    42101        return $gdbmget_previous_value;
    43102      }
     103
    44104    # Where's the database?
    45     my $database = &getDatabasePath($collection);
    46     # Are we in windows? Do we need .exe?
    47     my $exe = "";
    48     $exe = ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i;
    49     # Retrieve the raw document content
    50     print STDERR "#Get document\ncmd: gdbmget$exe \"$database\" \"$oid\"\n" if $debug;
    51     my $value = `gdbmget$exe "$database" "$oid"`;
     105    my $database = _getDatabasePath($collection);
     106
     107    my $value = gdbmDatbaseGet($database,$oid);
     108
    52109    # Tidy up the ever growing number of newlines at the end of the value
    53110    $value =~ s/(\r?\n)+/$1/g;
     111    # Why do we need the above line?  At the very least it would seem
     112    # better that the data going in to the database through 'set' is
     113    # monitored for superfluous \r\n which are then removed before being
     114    # saved in GDBM
     115
    54116    # Cache this result
    55117    $gdbmget_previous_collection = $collection;
    56118    $gdbmget_previous_oid = $oid;
    57119    $gdbmget_previous_value = $value;
     120
    58121    # Done
    59122    return $value;
    60123  }
    61 # /** gdbmGet() **/
     124# /** gdbmCachedCollectionGet **/
    62125
    63126# /** This wraps John T's gdbmset executable to set the gdbm database entry for
     
    70133#  *  @author John Rowe, DL Consulting Ltd.
    71134#  */
    72 sub gdbmSet()
     135sub gdbmCachedCollectionSet
    73136  {
    74137    my ($collection, $oid, $value) = @_;
     138
    75139    # Where's the database?
    76     my $database = &getDatabasePath($collection);
    77     # Are we in windows? Do we need .exe?
    78     my $exe = &util::get_os_exe();
     140    my $database = _getDatabasePath($collection);
     141
     142
    79143    # Check whether value is set
    80144    if (defined($value))
    81145      {
    82         # Escape any speech marks in the value
    83         $value =~ s/\"/\\\"/g;
    84         # Set the document content
    85         print STDERR "#Set document\ncmd: gdbmset$exe \"$database\" \"$oid\" \"$value\"\n" if $debug;
    86         `gdbmset$exe "$database" "$oid" "$value"`;
     146      gdbmDatabaseSet($database,$oid,$value);
    87147      }
    88148    else
    89149      {
    90         # Remove the document from the database
    91         print STDERR "#Set document\ncmd: gdbmset$exe \"$database\" \"$oid\"\n" if $debug;
    92         `gdbmset$exe "$database" "$oid"`;
     150      gdbmDtabaseRemove($database,$oid);
    93151      }
     152
    94153    # Empty any cached values, as they may now be invalid
     154
    95155    # Cache this result
    96156    $gdbmget_previous_collection = "";
     
    98158    $gdbmget_previous_value = 0;
    99159  }
    100 # /** gdbmSet() **/
     160# /** gdbmCollectionSet **/
    101161
    102162# /** This works out the database path and returns it to the calling
     
    107167#  *  @author John Rowe, DL Consulting Ltd.
    108168#  */
    109 sub getDatabasePath()
     169
     170sub _getDatabasePath
    110171  {
    111172    my $collection = shift(@_);
     173
    112174    # Find out the database extension
    113     my $ext = ".bdb";
    114     $ext = ".ldb" if &util::is_little_endian();
     175    my $ext = &util::is_little_endian() ? ".ldb" : ".bdb";
     176
    115177    # Now return the full filename of the database
     178
    116179    return &util::filename_cat($ENV{'GSDLHOME'}, "collect", $collection, "index", "text", $collection.$ext);
    117180  }
    118 # /** getDatabasePath() **/
     181# /** getDatabasePath **/
    119182
    1201831;
  • gsdl/trunk/perllib/IncrementalBuildUtils.pm

    r15890 r17087  
    390390        # Now use the GDBM utils to write a blank string to this oid in the
    391391        # database
    392         &GDBMUtils::gdbmSet($collection, $oid, "");
     392        &GDBMUtils::gdbmCachedCollectionSet($collection, $oid, "");
    393393        # Remove reverse lookup
    394         &GDBMUtils::gdbmSet($collection, $doc_num, "");
     394        &GDBMUtils::gdbmCachedCollectionSet($collection, $doc_num, "");
    395395        # And remove from the database
    396396        &callGS2LuceneDelete($collection, $doc_num);
  • gsdl/trunk/perllib/IncrementalDocument.pm

    r15894 r17087  
    3535  {
    3636    my ($class, $collection, $oid) = @_;
     37
    3738    #rint STDERR "IncrementalDocument::new($collection, $oid)\n";
     39
    3840    # Test the parameters
    3941    die ("Error! Can't create a document that doesn't belong to a collection!") unless $collection;
    4042    die ("Error! Can't create a document that doesn't have a unique id (OID)!") unless $oid;
     43
    4144    # Store the variables
    4245    my $self = {};
     46
    4347    # The collection this document object has been loaded from.
    4448    $self->{'collection'} = $collection;
     49
    4550    # An associative array of information retrieved from the GDBM database
    4651    # which maps a key string to a nested associative array listing values.
    4752    $self->{'data'} = {};
     53
    4854    # The unique identifier of the document loaded
    4955    $self->{'oid'} = $oid;
     56
    5057    # Stores the order in which metadata keys where discovered/added.
    5158    $self->{'order'} = {};
    52     # Bless me father for I have sinned
     59
    5360    bless $self, $class;
    5461    return $self;
     
    6168  {
    6269    my ($self, $key, $value, $internal) = @_;
     70
    6371    # Validate the arguments
    6472    die ("Error! Can't add a metadata value to a document without a valid key!") unless $key =~ /[\w]+/;
    6573    die ("Error! Can't add a metadata key to a document without a valid value!") unless $value =~ /[\w\d]+/;
     74
    6675    # Is this a new key that we haven't encountered before? If so ensure an
    6776    # array exists for its values, and record the order in which we encountered
     
    7584        $self->{'data'}->{$key} = {};
    7685      }
     86
    7787    # Set the value of the associative path to 1.
    7888    $self->{'data'}->{$key}->{$value} = 1;
     
    8494#  */
    8595sub getAllMetadata()
    86   {
     96{
     97    my ($self) = @_;
     98    my @all_metadata;
     99
    87100    print STDERR "IncrementalDocument.getAllMetadata()\n";
    88     my ($self) = @_;
    89     my @all_metadata;
    90101
    91102    my $key_count = scalar(keys %{$self->{'order'}});
     
    137148    #rint STDERR "IncrementalDocument::loadDocument()\n";
    138149    # Load the raw text for the document object from GDBM
    139     my $text = &GDBMUtils::gdbmGet($self->{'collection'}, $self->{'oid'});
     150    my $collection = $self->{'collection'};
     151    my $text = &GDBMUtils::gdbmCachedCollectionGet($collection, $self->{'oid'});
    140152    # For each line in the raw text, extract the key (enclosed in angle
    141153    # brackets) and the value
     
    170182    # Get a textual version of this object
    171183    my $text = $self->toString();
     184
    172185    # Now store the object in the database using the GDBM utilities
    173     &GDBMUtils::gdbmSet($self->{'collection'}, $self->{'oid'}, $text);
     186    my $collection = $self->{'collection'};
     187    &GDBMUtils::gdbmCachedCollectionSet($collection, $self->{'oid'}, $text);
     188
    174189    # There is a little bit of extra complexity when saving an incremental
    175190    # document in that we should ensure that a reverse lookup-from DocNum or
     
    178193    if($doc_num >= 0)
    179194      {
    180         my $text = &GDBMUtils::gdbmGet($self->{'collection'}, $doc_num);
     195        my $text = &GDBMUtils::gdbmCachedCollectionGet($collection, $doc_num);
    181196        # If there is no reverse lookup, then add one now
    182197        if($text !~ /<section>/)
    183198          {
    184             &GDBMUtils::gdbmSet($self->{'collection'}, $doc_num, "<section>" . $self->{'oid'});
     199            &GDBMUtils::gdbmCachedCollectionSet($collection, $doc_num, "<section>" . $self->{'oid'});
    185200          }
    186201      }
  • gsdl/trunk/perllib/dbutil.pm

    r16726 r17087  
    2929
    3030
     31sub open_infodb_write_handle
     32{
     33    my $infodb_type = shift(@_);
     34    my $infodb_file_path = shift(@_);
     35
     36    if ($infodb_type eq "sqlite") {
     37    return &open_infodb_write_handle_sqlite($infodb_file_path);
     38    }
     39    elsif ($infodb_type eq "gdbm") {
     40    return &open_infodb_write_handle_gdbm($infodb_file_path);
     41    }
     42
     43    # Use text (gzipped) version ready for convertion to GDBM
     44    # if the infodb type is empty or not one of the values above
     45    return &open_infodb_write_handle_gdbm_txtgz($infodb_file_path);
     46}
     47
     48
    3149sub close_infodb_write_handle
    3250{
     
    3452    my $infodb_handle = shift(@_);
    3553
    36     if ($infodb_type eq "sqlite")
    37     {
     54    if ($infodb_type eq "sqlite") {
    3855    return &close_infodb_write_handle_sqlite($infodb_handle);
    3956    }
    40 
    41     # Use GDBM if the infodb type is empty or not one of the values above
    42     return &close_infodb_write_handle_gdbm($infodb_handle);
    43 }
     57    elsif ($infodb_type eq "gdbm") {
     58    return &close_infodb_write_handle_gdbm($infodb_handle);
     59    }
     60
     61    # Use text (gzipped) version ready for convertion to GDBM
     62    # if the infodb type is empty or not one of the values above return
     63
     64    &close_infodb_write_handle_gdbm_txtgz($infodb_handle); }
    4465
    4566
    4667sub get_default_infodb_type
    4768{
    48     return "gdbm";
     69    return "gdbm-txtgz";
    4970}
    5071
     
    6081    return &get_infodb_file_path_sqlite($collection_name, $infodb_directory_path);
    6182    }
    62 
    63     # Use GDBM if the infodb type is empty or not one of the values above
    64     return &get_infodb_file_path_gdbm($collection_name, $infodb_directory_path);
    65 }
    66 
    67 
    68 sub open_infodb_write_handle
     83    elsif ($infodb_type eq "gdbm") {
     84    return &get_infodb_file_path_gdbm($collection_name, $infodb_directory_path);
     85    }
     86
     87    # Use text (gzipped) version ready for convertion to GDBM
     88    # if the infodb type is empty or not one of the values above return
     89
     90    return &get_infodb_file_path_gdbm_txtgz($collection_name, $infodb_directory_path);
     91}
     92
     93
     94
     95
     96sub read_infodb_file
    6997{
    7098    my $infodb_type = shift(@_);
    7199    my $infodb_file_path = shift(@_);
     100    my $infodb_map = shift(@_);
    72101
    73102    if ($infodb_type eq "sqlite")
    74103    {
    75     return &open_infodb_write_handle_sqlite($infodb_file_path);
    76     }
    77 
    78     # Use GDBM if the infodb type is empty or not one of the values above
    79     return &open_infodb_write_handle_gdbm($infodb_file_path);
    80 }
    81 
    82 
    83 sub read_infodb_file
     104    return &read_infodb_file_sqlite($infodb_file_path, $infodb_map);
     105    }
     106    elsif ($infodb_type eq "gdbm") {
     107    return &read_infodb_file_gdbm($infodb_file_path, $infodb_map);
     108    }
     109
     110    # Use text (gzipped) version ready for convertion to GDBM
     111    # if the infodb type is empty or not one of the values above return
     112
     113    return &read_infodb_file_gdbm_txtgz($infodb_file_path, $infodb_map);
     114}
     115
     116
     117sub write_infodb_entry
    84118{
    85119    my $infodb_type = shift(@_);
    86     my $infodb_file_path = shift(@_);
     120    my $infodb_handle = shift(@_);
     121    my $infodb_key = shift(@_);
    87122    my $infodb_map = shift(@_);
    88123
    89124    if ($infodb_type eq "sqlite")
    90125    {
    91     return &read_infodb_file_sqlite($infodb_file_path, $infodb_map);
    92     }
    93 
    94     # Use GDBM if the infodb type is empty or not one of the values above
    95     return &read_infodb_file_gdbm($infodb_file_path, $infodb_map);
    96 }
    97 
    98 
    99 sub write_infodb_entry
    100 {
    101     my $infodb_type = shift(@_);
    102     my $infodb_handle = shift(@_);
    103     my $infodb_key = shift(@_);
    104     my $infodb_map = shift(@_);
    105 
    106     if ($infodb_type eq "sqlite")
    107     {
    108126    return &write_infodb_entry_sqlite($infodb_handle, $infodb_key, $infodb_map);
    109127    }
    110 
    111     # Use GDBM if the infodb type is empty or not one of the values above
    112     return &write_infodb_entry_gdbm($infodb_handle, $infodb_key, $infodb_map);
    113 }
    114 
    115 
    116 
    117 # ----------------------------------------------------------------------------------------
    118 #   GDBM IMPLEMENTATION
    119 # ----------------------------------------------------------------------------------------
    120 
    121 sub close_infodb_write_handle_gdbm
     128    elsif ($infodb_type eq "gdbm") {
     129    return &write_infodb_entry_gdbm($infodb_handle, $infodb_key, $infodb_map);
     130    }
     131
     132    # Use text (gzipped) version ready for convertion to GDBM
     133    # if the infodb type is empty or not one of the values above return
     134    return &write_infodb_entry_gdbm_txtgz($infodb_handle, $infodb_key, $infodb_map);
     135}
     136
     137
     138
     139# -----------------------------------------------------------------------------
     140#   GDBM TXT-GZ IMPLEMENTATION
     141# -----------------------------------------------------------------------------
     142
     143sub open_infodb_write_handle_gdbm_txtgz
     144{
     145    # Keep infodb in GDBM neutral form => save data as compressed text file,
     146    # read for txt2db to be run on it later (i.e. by the runtime system,
     147    # first time the collection is ever accessed).  This makes it easier
     148    # distribute pre-built collections to various architectures.
     149    #
     150    # NB: even if two architectures are little endian (e.g. Intel and
     151    # ARM procesors) GDBM does *not* guarantee that the database generated on
     152    # one will work on the other
     153
     154    my $infodb_file_path = shift(@_);
     155
     156    # Greenstone ships with gzip for windows, on $PATH
     157
     158    my $infodb_file_handle = undef;
     159    if (!open($infodb_file_handle, "| gzip - > \"$infodb_file_path\""))
     160    {
     161    return undef;
     162    }
     163
     164    return $infodb_file_handle;
     165}
     166
     167sub close_infodb_write_handle_gdbm_txtgz
    122168{
    123169    my $infodb_handle = shift(@_);
     
    127173
    128174
    129 sub get_infodb_file_path_gdbm
     175sub get_infodb_file_path_gdbm_txtgz
    130176{
    131177    my $collection_name = shift(@_);
    132178    my $infodb_directory_path = shift(@_);
    133179
    134     my $infodb_file_extension = (&util::is_little_endian() ? ".ldb" : ".bdb");
    135     my $infodb_file_name = &util::get_dirsep_tail($collection_name) . $infodb_file_extension;
     180    my $infodb_file_name = &util::get_dirsep_tail($collection_name).".txt.gz";
    136181    return &util::filename_cat($infodb_directory_path, $infodb_file_name);
    137182}
    138183
    139184
    140 sub open_infodb_write_handle_gdbm
    141 {
    142     my $infodb_file_path = shift(@_);
    143 
    144     my $txt2db_exe = &util::filename_cat("$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}", "txt2db" . &util::get_os_exe());
    145     my $infodb_file_handle = undef;
    146     if (!-e "$txt2db_exe" || !open($infodb_file_handle, "| \"$txt2db_exe\" \"$infodb_file_path\""))
    147     {
    148     return undef;
    149     }
    150 
    151     return $infodb_file_handle;
    152 }
    153 
    154 
    155 sub read_infodb_file_gdbm
    156 {
    157     my $infodb_file_path = shift(@_);
    158     my $infodb_map = shift(@_);
    159 
    160     open (PIPEIN, "db2txt \"$infodb_file_path\" |") || die "couldn't open pipe from db2txt\n";
     185sub read_infodb_file_gdbm_txtgz
     186{
     187    my $infodb_file_path = shift(@_);
     188    my $infodb_map = shift(@_);
     189
     190    my $cmd = "gzip --decompress \"$infodb_file_path\"";
     191
     192    open (PIPEIN, "$cmd |")
     193    || die "Error: Couldn't open pipe from gzip: $!\n  $cmd\n";
     194
    161195    my $infodb_line = "";
    162196    my $infodb_key = "";
     
    184218
    185219   
    186 sub write_infodb_entry_gdbm
    187 {
     220sub write_infodb_entry_gdbm_txtgz
     221{
     222
    188223    my $infodb_handle = shift(@_);
    189224    my $infodb_key = shift(@_);
     
    209244
    210245
    211 # ----------------------------------------------------------------------------------------
     246# -----------------------------------------------------------------------------
     247#   GDBM IMPLEMENTATION
     248# -----------------------------------------------------------------------------
     249
     250sub open_infodb_write_handle_gdbm
     251{
     252    my $infodb_file_path = shift(@_);
     253
     254    my $txt2db_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",$ENV{'GSDLOS'}, "txt2db" . &util::get_os_exe());
     255    my $infodb_file_handle = undef;
     256    if (!-e "$txt2db_exe" || !open($infodb_file_handle, "| \"$txt2db_exe\" \"$infodb_file_path\""))
     257    {
     258    return undef;
     259    }
     260
     261    return $infodb_file_handle;
     262}
     263
     264sub close_infodb_write_handle_gdbm
     265{
     266    my $infodb_handle = shift(@_);
     267
     268    close($infodb_handle);
     269}
     270
     271
     272sub get_infodb_file_path_gdbm
     273{
     274    my $collection_name = shift(@_);
     275    my $infodb_directory_path = shift(@_);
     276
     277    my $infodb_file_extension = (&util::is_little_endian() ? ".ldb" : ".bdb");
     278    my $infodb_file_name = &util::get_dirsep_tail($collection_name) . $infodb_file_extension;
     279    return &util::filename_cat($infodb_directory_path, $infodb_file_name);
     280}
     281
     282
     283
     284
     285sub read_infodb_file_gdbm
     286{
     287    my $infodb_file_path = shift(@_);
     288    my $infodb_map = shift(@_);
     289
     290    open (PIPEIN, "db2txt \"$infodb_file_path\" |") || die "couldn't open pipe from db2txt\n";
     291    my $infodb_line = "";
     292    my $infodb_key = "";
     293    my $infodb_value = "";
     294    while (defined ($infodb_line = <PIPEIN>))
     295    {
     296    if ($infodb_line =~ /^\[([^\]]+)\]$/)
     297    {
     298        $infodb_key = $1;
     299    }
     300    elsif ($infodb_line =~ /^-{70}$/)
     301    {
     302        $infodb_map->{$infodb_key} = $infodb_value;
     303        $infodb_key = "";
     304        $infodb_value = "";
     305    }
     306    else
     307    {
     308        $infodb_value .= $infodb_line;
     309    }
     310    }
     311
     312    close (PIPEIN);
     313}
     314
     315   
     316sub write_infodb_entry_gdbm
     317{
     318    # With infodb_handle already set up, works the same as _gdbm_txtgz version
     319    write_infodb_entry_gdbm_txtgz(@_);
     320}
     321
     322
     323
     324# -----------------------------------------------------------------------------
    212325#   SQLITE IMPLEMENTATION
    213 # ----------------------------------------------------------------------------------------
     326# -----------------------------------------------------------------------------
     327
     328sub open_infodb_write_handle_sqlite
     329{
     330    my $infodb_file_path = shift(@_);
     331
     332    my $sqlite3_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",$ENV{'GSDLOS'}, "sqlite3" . &util::get_os_exe());
     333    my $infodb_handle = undef;
     334    if (!-e "$sqlite3_exe" || !open($infodb_handle, "| \"$sqlite3_exe\" \"$infodb_file_path\""))
     335    {
     336    return undef;
     337    }
     338
     339    print $infodb_handle "CREATE TABLE IF NOT EXISTS data (key TEXT PRIMARY KEY, value TEXT);\n";
     340    print $infodb_handle "CREATE TABLE IF NOT EXISTS document_metadata (id INTEGER PRIMARY KEY, docOID TEXT, element TEXT, value TEXT);\n";
     341
     342    # This is crucial for efficiency when importing large amounts of data
     343    print $infodb_handle "CREATE INDEX IF NOT EXISTS dmd ON document_metadata(docOID);\n";
     344
     345    # This is very important for efficiency, otherwise each command will be actioned one at a time
     346    print $infodb_handle "BEGIN TRANSACTION;\n";
     347
     348    return $infodb_handle;
     349}
     350
     351
    214352
    215353sub close_infodb_write_handle_sqlite
     
    237375}
    238376
    239 
    240 sub open_infodb_write_handle_sqlite
    241 {
    242     my $infodb_file_path = shift(@_);
    243 
    244     my $sqlite3_exe = &util::filename_cat("$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}", "sqlite3" . &util::get_os_exe());
    245     my $infodb_handle = undef;
    246     if (!-e "$sqlite3_exe" || !open($infodb_handle, "| \"$sqlite3_exe\" \"$infodb_file_path\""))
    247     {
    248     return undef;
    249     }
    250 
    251     print $infodb_handle "CREATE TABLE IF NOT EXISTS data (key TEXT PRIMARY KEY, value TEXT);\n";
    252     print $infodb_handle "CREATE TABLE IF NOT EXISTS document_metadata (id INTEGER PRIMARY KEY, docOID TEXT, element TEXT, value TEXT);\n";
    253 
    254     # This is crucial for efficiency when importing large amounts of data
    255     print $infodb_handle "CREATE INDEX IF NOT EXISTS dmd ON document_metadata(docOID);\n";
    256 
    257     # This is very important for efficiency, otherwise each command will be actioned one at a time
    258     print $infodb_handle "BEGIN TRANSACTION;\n";
    259 
    260     return $infodb_handle;
    261 }
    262377
    263378
  • gsdl/trunk/perllib/plugouts/BasPlugout.pm

    r16252 r17087  
    3535use printusage;
    3636use parse2;
     37use GDBMUtils;
     38
    3739
    3840# suppress the annoying "subroutine redefined" warning that various
     
    402404    ##############################
    403405    $self->saveas($doc_obj,$doc_dir);
    404    
     406    $self->archiveinf_gdbm($doc_obj,$doc_dir);
     407
    405408}
    406409
     
    630633}
    631634
     635
     636sub archiveinf_gdbm
     637{
     638    my $self = shift (@_);
     639    my ($doc_obj) = @_;
     640
     641    my $verbosity = $self->{'verbosity'};
     642
     643    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
     644    if (defined $collect_dir) {
     645    my $dirsep_regexp = &util::get_os_dirsep();
     646
     647    if ($collect_dir !~ /$dirsep_regexp$/) {
     648        # ensure there is a slash at the end
     649        $collect_dir .= &util::get_dirsep();
     650    }
     651    }
     652
     653    my $oid = $doc_obj->get_OID();
     654    my $source_filename = $doc_obj->get_source_filename();
     655
     656    my $oid_files = { 'src-file' => $source_filename,
     657              'assoc-files' => [] };
     658   
     659    my @reverse_lookups = ($source_filename);
     660
     661    foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
     662    my $real_filename = $assoc_file_rec->[0];
     663    my $full_afile = $assoc_file_rec->[1];
     664
     665    # for some reasons the image associate file has / before the full path
     666    $real_filename =~ s/^\\(.*)/$1/i;
     667    if (-e $real_filename) {
     668
     669        if (defined $collect_dir) {
     670        $real_filename =~ s/^$collect_dir//;
     671        }
     672
     673        push(@reverse_lookups,$real_filename);
     674
     675        push(@{$oid_files->{'assoc-files'}},$full_afile);
     676
     677    }
     678    else {
     679        print STDERR "Warning: archiveinf_gdbm()\n  $real_filename does not appear to be on the file system\n";
     680    }
     681    }
     682
     683    # better not to commit to a particular db implementation, but
     684    # for simplicity, will use GDBM for now.
     685
     686    my $output_dir = $self->{'output_dir'};
     687    my $db_ext = &util::is_little_endian() ? ".ldb" : ".bdb";
     688
     689    my $doc_db = &util::filename_cat($output_dir,"archiveinf-doc$db_ext");
     690    my $src_db = &util::filename_cat($output_dir,"archiveinf-src$db_ext");
     691
     692    my $doc_db_text = "";
     693    $doc_db_text .= "<src-file>$oid_files->{'src-file'}\n";
     694    foreach my $af (@{$oid_files->{'assoc-files'}}) {
     695    $doc_db_text .= "<assoc-file>$af\n";
     696    }
     697    chomp($doc_db_text); # remove trailing \n
     698
     699    &GDBMUtils::gdbmDatabaseSet($doc_db,$oid,$doc_db_text);
     700
     701    foreach my $rl (@reverse_lookups) {
     702    &GDBMUtils::gdbmDatabaseAppend($src_db,$rl,"<oid>$oid\n");
     703    }
     704   
     705}
     706
     707
    632708sub set_sortmeta {
    633709    my $self = shift (@_);
Note: See TracChangeset for help on using the changeset viewer.