Changeset 16790


Ignore:
Timestamp:
2008-08-14T16:39:09+12:00 (13 years ago)
Author:
davidb
Message:

Support for exploding OAI records. Added at the request of John Rose.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/bin/script/explode_metadata_database.pl

    r15074 r16790  
    1616use parse2;
    1717use FileHandle;
     18
     19use File::Spec;
     20use File::Basename;
    1821
    1922my $unicode_list =
     
    9598        'args' => $arguments };
    9699
    97        
     100
     101
    98102sub main
    99103{
     
    131135    }
    132136
     137   
    133138    # There should be one arg left after parsing (the filename)
    134139    # Or the user may have specified -h, in which case we output the usage
     
    177182    }
    178183
    179     my $text = "";
    180     # Use the plugin's read_file function to avoid duplicating code
    181     $plugobj->read_file($filename, $input_encoding, undef, \$text);
    182     # is there any text in the file??
    183     die "\n" unless length($text);
    184 
    185184    # Create a directory to store the document files...
    186     my ($documents_directory_base) = ($filename =~ /(.*)\.[^\.]+$/);
    187 
    188     # Split the text into records, using the plugin's split_exp
     185    my ($exploded_base_dir) = ($filename =~ /(.*)\.[^\.]+$/);
     186
     187    my $orig_base_dir = &File::Basename::dirname($filename);
     188
     189
    189190    my $split_exp = $plugobj->{'split_exp'};
    190     my @metadata_records = split(/$split_exp/, $text);
    191     print STDERR "Number of records: " . scalar(@metadata_records) . "\n";
    192 
    193     # Write the metadata from each record to the metadata.xml file
    194     my $record_number = 1;
    195     my $documents_directory;
    196     foreach my $record_text (@metadata_records) {
    197     # Check if we need to start a new directory for these records
    198     if (($record_number % $records_per_folder) == 1) {
    199         $documents_directory = $documents_directory_base;
    200         if (scalar(@metadata_records) > $records_per_folder) {
    201         $documents_directory .= "." . sprintf("%8.8d", $record_number);
    202         }
    203         if (-d $documents_directory) {
    204         die "Error: document directory $documents_directory already exists (bailing).\n";
    205         }
    206         &util::mk_dir($documents_directory);
    207 
    208         my $documents_metadata_xml_file = &util::filename_cat($documents_directory, "metadata.xml");
    209         if (-e $documents_metadata_xml_file) {
    210         die "Error: documents metadata.xml file $documents_metadata_xml_file already exists (bailing).\n";
    211         }
    212 
    213         # Start the metadata.xml file
    214         open(METADATA_XML_FILE, ">$documents_metadata_xml_file");
    215         print METADATA_XML_FILE
    216         "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" .
    217         "<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" .
    218         "<DirectoryMetadata>\n";
    219     }
    220 
    221     # Use the plugin's process function to avoid duplicating code
     191    if (defined $split_exp) {
     192    # Read in file, and then split and process individual records
     193
     194    my $text = "";
     195    # Use the plugin's read_file function to avoid duplicating code
     196    $plugobj->read_file($filename, $input_encoding, undef, \$text);
     197    # is there any text in the file??
     198    die "\n" unless length($text);
     199
     200    # Split the text into records, using the plugin's split_exp
     201
     202    my @metadata_records = split(/$split_exp/, $text);
     203    print STDERR "Number of records: " . scalar(@metadata_records) . "\n";
     204   
     205    # Write the metadata from each record to the metadata.xml file
     206    my $record_number = 1;
     207    foreach my $record_text (@metadata_records) {
     208       
     209        # Check if we need to start a new directory for these records
     210        my $documents_directory;
     211        check_need_new_directory($exploded_base_dir,$record_number,$records_per_folder,
     212                     \@metadata_records,\$documents_directory);
     213       
     214        # Use the plugin's process function to avoid duplicating code
     215        my $doc_obj = new doc($filename, "nonindexed_doc");
     216        $plugobj->process(\$record_text, undef, undef, $filename, undef, $doc_obj, 0);
     217       
     218       
     219        # Try to get a doc to attach the metadata to
     220        # If no match found, create a dummy .nul file
     221        attach_metadata_or_nul_doc($document_field, $doc_obj, $record_number,
     222                       $documents_directory, $orig_base_dir,
     223                       $document_prefix, $document_suffix, $metadata_set, $verbosity);
     224               
     225       
     226        check_close_directory($record_number,$records_per_folder,\@metadata_records);
     227       
     228        $record_number = $record_number + 1;
     229    }
     230    }
     231    else {
     232    # Call metadata_read to set up associated metadata
     233
     234    my $pluginfo = undef;
     235    my $metadata = {};
     236
     237    my $processor = undef;
     238    my $maxdocs = undef;
     239    my $gli = undef;
     240
     241    my $extrametakeys = {};
     242    my $extrametadata = {};
     243
     244
     245    $plugobj->metadata_read($pluginfo, "", $filename, $metadata,
     246                $extrametakeys, $extrametadata, $processor, $maxdocs, $gli);
     247
     248
     249    my $documents_directory = need_new_directory($exploded_base_dir);
     250
     251    # Attach metadata to object
     252    # => use the plugin's extra_metadata function to avoid duplicating code
    222253    my $doc_obj = new doc($filename, "nonindexed_doc");
    223     $plugobj->process(\$record_text, undef, undef, $filename, undef, $doc_obj, 0);
    224     # Get all the metadata assigned to this record
    225     my $record_metadata = $doc_obj->get_all_metadata($doc_obj->get_top_section());
    226     my $document_file;
    227    
    228     # try to get a doc to attach the metadata to
    229     if (defined $document_field) {
    230         foreach my $pair (@$record_metadata) {
    231         my ($field, $value) = (@$pair);
    232         $value =~ s/\\\\/\\/g;
    233 
    234         # Does this metadata element specify a document to obtain?
    235         if ($field eq $document_field) {
    236             my $document_file_full = $document_prefix . $value . $document_suffix;
    237             $document_file = &obtain_document($document_file_full, $documents_directory, $verbosity);
    238             &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
    239         }
    240         }
    241     }
    242     # Create a dummy .nul file if we haven't obtained any documents for this record
    243     if (not defined $document_file) {
    244         $document_file = sprintf("%8.8d", $record_number) . ".nul";
    245         open(DUMMY_FILE, ">$documents_directory/$document_file");
    246         close(DUMMY_FILE);
    247         &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
    248     }
    249 
    250     if (($record_number % $records_per_folder) == 0 || $record_number == scalar(@metadata_records)) {
    251         # Finish and close the metadata.xml file
    252         print METADATA_XML_FILE "\n</DirectoryMetadata>\n";
    253         close(METADATA_XML_FILE);
    254     }
    255     $record_number = $record_number + 1;
    256     }
     254
     255    $plugobj->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
     256
     257    # Try to get a doc to attach the metadata to
     258    # If no match found, create a dummy .nul file
     259    attach_metadata_or_make_nul_doc($document_field, $doc_obj, undef,
     260                    $documents_directory, $orig_base_dir,
     261                    $document_prefix, $document_suffix, $metadata_set, $verbosity);
     262
     263
     264    close_directory();
     265    }
     266
    257267
    258268    # Explode means just that: the original file is deleted
    259269    &util::rm($filename);
    260270    $plugobj->clean_up_after_exploding();
    261 }
     271
     272}
     273
     274
     275sub need_new_directory
     276{
     277    my ($exploded_base_dir) = @_;
     278   
     279    my $documents_directory = $exploded_base_dir;
     280
     281    if (-d $documents_directory) {
     282    die "Error: document directory $documents_directory already exists (bailing).\n";
     283    }
     284    &util::mk_dir($documents_directory);
     285
     286    my $documents_metadata_xml_file = &util::filename_cat($documents_directory, "metadata.xml");
     287    if (-e $documents_metadata_xml_file) {
     288    die "Error: documents metadata.xml file $documents_metadata_xml_file already exists (bailing).\n";
     289    }
     290
     291    # Start the metadata.xml file
     292    open(METADATA_XML_FILE, ">$documents_metadata_xml_file");
     293    print METADATA_XML_FILE
     294    "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" .
     295    "<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" .
     296    "<DirectoryMetadata>\n";
     297
     298    return $documents_directory;
     299}
     300
     301sub check_need_new_directory
     302{
     303    my ($exploded_base_dir,$record_number, $records_per_folder,$metadata_records,
     304    $documents_dir_ref) = @_;
     305   
     306
     307    # Check if we need to start a new directory for these records
     308    if (($record_number % $records_per_folder) == 1) {
     309    my $documents_directory = $exploded_base_dir;
     310
     311    if (scalar(@$metadata_records) > $records_per_folder) {
     312        $documents_directory .= "." . sprintf("%8.8d", $record_number);
     313    }
     314
     315    $$documents_dir_ref = need_new_directory($documents_directory);
     316    }
     317}
     318
     319
     320
     321
     322
     323sub attach_metadata_or_make_nul_doc
     324{
     325    my ($document_field, $doc_obj, $record_number,
     326    $documents_directory, $orig_base_dir,
     327    $document_prefix, $document_suffix, $metadata_set, $verbosity) = @_;
     328
     329    my $record_metadata = $doc_obj->get_all_metadata($doc_obj->get_top_section());
     330    my $document_file;
     331
     332    # try to get a doc to attach the metadata to
     333    if (defined $document_field) {
     334    foreach my $pair (@$record_metadata) {
     335        my ($field, $value) = (@$pair);
     336
     337        $value =~ s/\\\\/\\/g;
     338       
     339        # Does this metadata element specify a document to obtain?
     340        if ($field eq $document_field) {
     341        my $document_file_full = $document_prefix . $value . $document_suffix;
     342
     343        $document_file = &obtain_document($document_file_full, $documents_directory, $orig_base_dir, $verbosity);
     344        &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
     345        }
     346    }
     347    }
     348   
     349    # Create a dummy .nul file if we haven't obtained any documents for this record
     350    if (not defined $document_file) {
     351
     352    if (defined ($record_number)) {
     353        $document_file = sprintf("%8.8d", $record_number) . ".nul";
     354    }
     355    else {
     356        $document_file = "doc.nul";
     357    }
     358
     359    open(DUMMY_FILE, ">$documents_directory/$document_file");
     360    close(DUMMY_FILE);
     361    &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
     362    }
     363
     364}
     365
     366sub close_directory
     367{
     368    # Finish and close the metadata.xml file
     369    print METADATA_XML_FILE "\n</DirectoryMetadata>\n";
     370    close(METADATA_XML_FILE);
     371
     372}
     373
     374
     375sub check_close_directory
     376{
     377    my ($record_number,$records_per_folder,$metadata_records) = @_;
     378
     379    if (($record_number % $records_per_folder) == 0 || $record_number == scalar(@$metadata_records)) {
     380    # Finish and close the metadata.xml file
     381    close_directory();
     382    }
     383}
     384       
    262385
    263386
     
    322445sub obtain_document
    323446{
    324     my $document_file_full = shift(@_);
    325     my $documents_directory = shift(@_);
    326     my $verbosity = shift(@_);
     447    my ($document_file_full,$documents_directory,$orig_base_dir,$verbosity) = @_;
    327448   
    328449    print STDERR "Obtaining document file $document_file_full...\n" if ($verbosity > 1);
     
    332453
    333454    # Document specified is on the web
    334     if ($document_file_full =~ /^http:/ || $document_file_full =~ /^ftp:/) {
     455    if ($document_file_full =~ /^https?:/ || $document_file_full =~ /^ftp:/) {
    335456    $document_file_full =~ /([^\/]+)$/;
    336457    $document_file_name = $1;
     
    351472    else {
    352473    my $dir_sep = &util::get_os_dirsep();
    353     $document_file_full =~ /(.+$dir_sep)?(.*)$/;
     474
     475    $document_file_full =~ m/(.+$dir_sep)?(.*)$/;
    354476    $document_file_name = $2;
     477
     478
     479    my $is_absolute = File::Spec->file_name_is_absolute($document_file_full);
     480    print STDERR "doc file full = $document_file_full\n";
     481
     482    if (!$is_absolute) {
     483        $document_file_full
     484        = &util::filename_cat($orig_base_dir,$document_file_full);
     485    }
     486
    355487    $local_document_file = &util::filename_cat($documents_directory, $document_file_name);
    356488
    357     # Only bother trying to copy the file if it contained some path information
    358     if ($document_file_full ne $document_file_name) {
    359         &util::cp($document_file_full, $documents_directory);
    360 
    361         # Check the document was obtained successfully
    362         if (!-e $local_document_file) {
    363         print STDERR "WARNING: Could not obtain document file $document_file_full\n";
     489    &util::cp($document_file_full, $documents_directory);
     490
     491    # Check the document was obtained successfully
     492    if (!-e $local_document_file) {
     493        print STDERR "WARNING: Could not obtain document file $document_file_full\n";
     494    }
     495    else {
     496        if ($document_file_full =~ m/^$orig_base_dir.*/) {
     497        # file local to metadata record
     498        # => copy has been made successfully, so remove original
     499        &util::rm($document_file_full);
    364500        }
    365501    }
     
    377513
    378514&main(@ARGV);
     515
Note: See TracChangeset for help on using the changeset viewer.