Changeset 12706


Ignore:
Timestamp:
2006-09-07T16:38:12+12:00 (18 years ago)
Author:
mdewsnip
Message:

Added a "-records_per_folder" option to explode_metadata_database.pl, which splits the exploded records across multiple folders, starting a new folder each time the given number of records has been written.

Location:
trunk/gsdl
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/explode_metadata_database.pl

    r12545 r12706  
    7171    'type' => "string",
    7272    'reqd' => "no"},
     73      { 'name' => "records_per_folder",
     74    'desc' => "{explode.records_per_folder}",
     75    'type' => "int",
     76    'range' => "0,",
     77    'deft' => "100",
     78    'reqd' => "no" },
    7379      { 'name' => "verbosity",
    7480    'desc' => "{import.verbosity}",
     
    9399{
    94100    my ($language, $input_encoding, $metadata_set, $plugin,
    95     $document_field, $document_prefix, $document_suffix, $verbosity);
     101    $document_field, $document_prefix, $document_suffix, $records_per_folder, $verbosity);
    96102
    97103    my $xml = 0;
     
    174180    # Use the plugin's read_file function to avoid duplicating code
    175181    $plugobj->read_file($filename, $input_encoding, undef, \$text);
    176 
    177182    # is there any text in the file??
    178183    die "\n" unless length($text);
     184
    179185    # Create a directory to store the document files...
    180     my ($documents_directory) = ($filename =~ /(.*)\.[^\.]+$/);
    181     if (-d $documents_directory) {
    182     die "Error: document directory $documents_directory already exists (bailing).\n";
    183     }
    184     &util::mk_dir($documents_directory);
    185 
    186     # ...and a metadata.xml file for the document metadata (extracted from the database)
    187     my $documents_metadata_xml_file = &util::filename_cat($documents_directory, "metadata.xml");
    188     if (-e $documents_metadata_xml_file) {
    189     die "Error: document metadata.xml file $documents_metadata_xml_file already exists (bailing).\n";
    190     }
    191 
    192     # Start the metadata.xml file
    193     open(METADATA_XML_FILE, ">$documents_metadata_xml_file");
    194     print METADATA_XML_FILE
    195     "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" .
    196     "<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" .
    197     "<DirectoryMetadata>\n";
     186    my ($documents_directory_base) = ($filename =~ /(.*)\.[^\.]+$/);
    198187
    199188    # Split the text into records, using the plugin's split_exp
    200189    my $split_exp = $plugobj->{'split_exp'};
    201190    my @metadata_records = split(/$split_exp/, $text);
    202     print STDERR "Number of records: " . @metadata_records . "\n";
     191    print STDERR "Number of records: " . scalar(@metadata_records) . "\n";
    203192
    204193    # Write the metadata from each record to the metadata.xml file
    205     my $record_number = 0;
     194    my $record_number = 1;
     195    my $documents_directory;
    206196    foreach my $record_text (@metadata_records) {
    207    
     197    # Check if we need to start a new directory for these records
     198    if (($record_number % $records_per_folder) == 1) {
     199        $documents_directory = $documents_directory_base;
     200        if (scalar(@metadata_records) > $records_per_folder) {
     201        $documents_directory .= "." . sprintf("%8.8d", $record_number);
     202        }
     203        if (-d $documents_directory) {
     204        die "Error: document directory $documents_directory already exists (bailing).\n";
     205        }
     206        &util::mk_dir($documents_directory);
     207
     208        my $documents_metadata_xml_file = &util::filename_cat($documents_directory, "metadata.xml");
     209        if (-e $documents_metadata_xml_file) {
     210        die "Error: documents metadata.xml file $documents_metadata_xml_file already exists (bailing).\n";
     211        }
     212
     213        # Start the metadata.xml file
     214        open(METADATA_XML_FILE, ">$documents_metadata_xml_file");
     215        print METADATA_XML_FILE
     216        "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" .
     217        "<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" .
     218        "<DirectoryMetadata>\n";
     219    }
     220
    208221    # Use the plugin's process function to avoid duplicating code
    209222    my $doc_obj = new doc($filename, "nonindexed_doc");
     
    227240    # do we need to create a dummy doc??
    228241    if (not defined $document_file) {
    229         $record_number = $record_number + 1;
    230242        $document_file = sprintf("%8.8d", $record_number) . ".nul";
    231 
    232243        open(DUMMY_FILE, ">$documents_directory/$document_file");
    233244        close(DUMMY_FILE);
     
    235246
    236247    &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
    237     }
    238 
    239     # Finish and close the metadata.xml file
    240     print METADATA_XML_FILE "\n</DirectoryMetadata>\n";
    241     close(METADATA_XML_FILE);
     248
     249    if (($record_number % $records_per_folder) == 0 || $record_number == scalar(@metadata_records)) {
     250        # Finish and close the metadata.xml file
     251        print METADATA_XML_FILE "\n</DirectoryMetadata>\n";
     252        close(METADATA_XML_FILE);
     253    }
     254    $record_number = $record_number + 1;
     255    }
    242256
    243257    # Explode means just that: the original file is deleted
  • trunk/gsdl/perllib/strings.properties

    r12692 r12706  
    177177explode.plugin: Plugin to use for exploding
    178178explode.params: [options] filename
     179explode.records_per_folder: The number of records to put in each subfolder.
     180
    179181# -- exportcol.pl --
    180182
Note: See TracChangeset for help on using the changeset viewer.