Changeset 16790

Show
Ignore:
Timestamp:
14.08.2008 16:39:09 (11 years ago)
Author:
davidb
Message:

Support for exploding oai records. Added at the request of John Rose.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/bin/script/explode_metadata_database.pl

    r15074 r16790  
    1616use parse2; 
    1717use FileHandle; 
     18 
     19use File::Spec; 
     20use File::Basename; 
    1821 
    1922my $unicode_list = 
     
    9598        'args' => $arguments }; 
    9699 
    97          
     100 
     101 
    98102sub main 
    99103{ 
     
    131135    } 
    132136 
     137     
    133138    # There should be one arg left after parsing (the filename) 
    134139    # Or the user may have specified -h, in which case we output the usage 
     
    177182    } 
    178183 
    179     my $text = ""; 
    180     # Use the plugin's read_file function to avoid duplicating code 
    181     $plugobj->read_file($filename, $input_encoding, undef, \$text); 
    182     # is there any text in the file?? 
    183     die "\n" unless length($text); 
    184  
    185184    # Create a directory to store the document files... 
    186     my ($documents_directory_base) = ($filename =~ /(.*)\.[^\.]+$/); 
    187  
    188     # Split the text into records, using the plugin's split_exp 
     185    my ($exploded_base_dir) = ($filename =~ /(.*)\.[^\.]+$/); 
     186 
     187    my $orig_base_dir = &File::Basename::dirname($filename); 
     188 
     189 
    189190    my $split_exp = $plugobj->{'split_exp'}; 
    190     my @metadata_records = split(/$split_exp/, $text); 
    191     print STDERR "Number of records: " . scalar(@metadata_records) . "\n"; 
    192  
    193     # Write the metadata from each record to the metadata.xml file 
    194     my $record_number = 1; 
    195     my $documents_directory; 
    196     foreach my $record_text (@metadata_records) { 
    197     # Check if we need to start a new directory for these records 
    198     if (($record_number % $records_per_folder) == 1) { 
    199         $documents_directory = $documents_directory_base; 
    200         if (scalar(@metadata_records) > $records_per_folder) { 
    201         $documents_directory .= "." . sprintf("%8.8d", $record_number); 
    202         } 
    203         if (-d $documents_directory) { 
    204         die "Error: document directory $documents_directory already exists (bailing).\n"; 
    205         } 
    206         &util::mk_dir($documents_directory); 
    207  
    208         my $documents_metadata_xml_file = &util::filename_cat($documents_directory, "metadata.xml"); 
    209         if (-e $documents_metadata_xml_file) { 
    210         die "Error: documents metadata.xml file $documents_metadata_xml_file already exists (bailing).\n"; 
    211         } 
    212  
    213         # Start the metadata.xml file 
    214         open(METADATA_XML_FILE, ">$documents_metadata_xml_file"); 
    215         print METADATA_XML_FILE 
    216         "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" . 
    217         "<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" . 
    218         "<DirectoryMetadata>\n"; 
    219     } 
    220  
    221     # Use the plugin's process function to avoid duplicating code 
     191    if (defined $split_exp) { 
     192    # Read in file, and then split and process individual records 
     193 
     194    my $text = ""; 
     195    # Use the plugin's read_file function to avoid duplicating code 
     196    $plugobj->read_file($filename, $input_encoding, undef, \$text); 
     197    # is there any text in the file?? 
     198    die "\n" unless length($text); 
     199 
     200    # Split the text into records, using the plugin's split_exp 
     201 
     202    my @metadata_records = split(/$split_exp/, $text); 
     203    print STDERR "Number of records: " . scalar(@metadata_records) . "\n"; 
     204     
     205    # Write the metadata from each record to the metadata.xml file 
     206    my $record_number = 1; 
     207    foreach my $record_text (@metadata_records) { 
     208         
     209        # Check if we need to start a new directory for these records 
     210        my $documents_directory; 
     211        check_need_new_directory($exploded_base_dir,$record_number,$records_per_folder, 
     212                     \@metadata_records,\$documents_directory); 
     213         
     214        # Use the plugin's process function to avoid duplicating code 
     215        my $doc_obj = new doc($filename, "nonindexed_doc"); 
     216        $plugobj->process(\$record_text, undef, undef, $filename, undef, $doc_obj, 0); 
     217         
     218         
     219        # Try to get a doc to attach the metadata to 
     220        # If no match found, create a dummy .nul file 
     221        attach_metadata_or_nul_doc($document_field, $doc_obj, $record_number, 
     222                       $documents_directory, $orig_base_dir, 
     223                       $document_prefix, $document_suffix, $metadata_set, $verbosity); 
     224                 
     225         
     226        check_close_directory($record_number,$records_per_folder,\@metadata_records); 
     227         
     228        $record_number = $record_number + 1; 
     229    } 
     230    } 
     231    else { 
     232    # Call metadata_read to set up associated metadata  
     233 
     234    my $pluginfo = undef; 
     235    my $metadata = {}; 
     236 
     237    my $processor = undef; 
     238    my $maxdocs = undef; 
     239    my $gli = undef; 
     240 
     241    my $extrametakeys = {}; 
     242    my $extrametadata = {}; 
     243 
     244 
     245    $plugobj->metadata_read($pluginfo, "", $filename, $metadata,  
     246                $extrametakeys, $extrametadata, $processor, $maxdocs, $gli); 
     247 
     248 
     249    my $documents_directory = need_new_directory($exploded_base_dir); 
     250 
     251    # Attach metadata to object 
     252    # => use the plugin's extra_metadata function to avoid duplicating code 
    222253    my $doc_obj = new doc($filename, "nonindexed_doc"); 
    223     $plugobj->process(\$record_text, undef, undef, $filename, undef, $doc_obj, 0); 
    224     # Get all the metadata assigned to this record 
    225     my $record_metadata = $doc_obj->get_all_metadata($doc_obj->get_top_section()); 
    226     my $document_file; 
    227      
    228     # try to get a doc to attach the metadata to 
    229     if (defined $document_field) { 
    230         foreach my $pair (@$record_metadata) { 
    231         my ($field, $value) = (@$pair); 
    232         $value =~ s/\\\\/\\/g; 
    233  
    234         # Does this metadata element specify a document to obtain? 
    235         if ($field eq $document_field) { 
    236             my $document_file_full = $document_prefix . $value . $document_suffix; 
    237             $document_file = &obtain_document($document_file_full, $documents_directory, $verbosity); 
    238             &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set); 
    239         } 
    240         } 
    241     } 
    242     # Create a dummy .nul file if we haven't obtained any documents for this record 
    243     if (not defined $document_file) { 
    244         $document_file = sprintf("%8.8d", $record_number) . ".nul"; 
    245         open(DUMMY_FILE, ">$documents_directory/$document_file"); 
    246         close(DUMMY_FILE); 
    247         &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set); 
    248     } 
    249  
    250     if (($record_number % $records_per_folder) == 0 || $record_number == scalar(@metadata_records)) { 
    251         # Finish and close the metadata.xml file 
    252         print METADATA_XML_FILE "\n</DirectoryMetadata>\n"; 
    253         close(METADATA_XML_FILE); 
    254     } 
    255     $record_number = $record_number + 1; 
    256     } 
     254 
     255    $plugobj->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata); 
     256 
     257    # Try to get a doc to attach the metadata to 
     258    # If no match found, create a dummy .nul file 
     259    attach_metadata_or_make_nul_doc($document_field, $doc_obj, undef,  
     260                    $documents_directory, $orig_base_dir, 
     261                    $document_prefix, $document_suffix, $metadata_set, $verbosity); 
     262 
     263 
     264    close_directory(); 
     265    } 
     266 
    257267 
    258268    # Explode means just that: the original file is deleted 
    259269    &util::rm($filename); 
    260270    $plugobj->clean_up_after_exploding(); 
    261 } 
     271 
     272} 
     273 
     274 
     275sub need_new_directory 
     276{ 
     277    my ($exploded_base_dir) = @_; 
     278     
     279    my $documents_directory = $exploded_base_dir; 
     280 
     281    if (-d $documents_directory) { 
     282    die "Error: document directory $documents_directory already exists (bailing).\n"; 
     283    } 
     284    &util::mk_dir($documents_directory); 
     285 
     286    my $documents_metadata_xml_file = &util::filename_cat($documents_directory, "metadata.xml"); 
     287    if (-e $documents_metadata_xml_file) { 
     288    die "Error: documents metadata.xml file $documents_metadata_xml_file already exists (bailing).\n"; 
     289    } 
     290 
     291    # Start the metadata.xml file 
     292    open(METADATA_XML_FILE, ">$documents_metadata_xml_file"); 
     293    print METADATA_XML_FILE 
     294    "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" . 
     295    "<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" . 
     296    "<DirectoryMetadata>\n"; 
     297 
     298    return $documents_directory; 
     299} 
     300 
     301sub check_need_new_directory 
     302{ 
     303    my ($exploded_base_dir,$record_number, $records_per_folder,$metadata_records, 
     304    $documents_dir_ref) = @_; 
     305     
     306 
     307    # Check if we need to start a new directory for these records 
     308    if (($record_number % $records_per_folder) == 1) { 
     309    my $documents_directory = $exploded_base_dir; 
     310 
     311    if (scalar(@$metadata_records) > $records_per_folder) { 
     312        $documents_directory .= "." . sprintf("%8.8d", $record_number); 
     313    } 
     314 
     315    $$documents_dir_ref = need_new_directory($documents_directory); 
     316    } 
     317} 
     318 
     319 
     320 
     321 
     322 
     323sub attach_metadata_or_make_nul_doc 
     324{ 
     325    my ($document_field, $doc_obj, $record_number,  
     326    $documents_directory, $orig_base_dir, 
     327    $document_prefix, $document_suffix, $metadata_set, $verbosity) = @_; 
     328 
     329    my $record_metadata = $doc_obj->get_all_metadata($doc_obj->get_top_section()); 
     330    my $document_file; 
     331 
     332    # try to get a doc to attach the metadata to 
     333    if (defined $document_field) { 
     334    foreach my $pair (@$record_metadata) { 
     335        my ($field, $value) = (@$pair); 
     336 
     337        $value =~ s/\\\\/\\/g; 
     338         
     339        # Does this metadata element specify a document to obtain? 
     340        if ($field eq $document_field) { 
     341        my $document_file_full = $document_prefix . $value . $document_suffix; 
     342 
     343        $document_file = &obtain_document($document_file_full, $documents_directory, $orig_base_dir, $verbosity); 
     344        &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set); 
     345        } 
     346    } 
     347    } 
     348     
     349    # Create a dummy .nul file if we haven't obtained any documents for this record 
     350    if (not defined $document_file) { 
     351 
     352    if (defined ($record_number)) { 
     353        $document_file = sprintf("%8.8d", $record_number) . ".nul"; 
     354    } 
     355    else { 
     356        $document_file = "doc.nul"; 
     357    } 
     358 
     359    open(DUMMY_FILE, ">$documents_directory/$document_file"); 
     360    close(DUMMY_FILE); 
     361    &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set); 
     362    } 
     363 
     364} 
     365 
     366sub close_directory 
     367{ 
     368    # Finish and close the metadata.xml file 
     369    print METADATA_XML_FILE "\n</DirectoryMetadata>\n"; 
     370    close(METADATA_XML_FILE); 
     371 
     372} 
     373 
     374 
     375sub check_close_directory 
     376{ 
     377    my ($record_number,$records_per_folder,$metadata_records) = @_; 
     378 
     379    if (($record_number % $records_per_folder) == 0 || $record_number == scalar(@$metadata_records)) { 
     380    # Finish and close the metadata.xml file 
     381    close_directory(); 
     382    } 
     383} 
     384         
    262385 
    263386 
     
    322445sub obtain_document 
    323446{ 
    324     my $document_file_full = shift(@_); 
    325     my $documents_directory = shift(@_); 
    326     my $verbosity = shift(@_); 
     447    my ($document_file_full,$documents_directory,$orig_base_dir,$verbosity) = @_; 
    327448     
    328449    print STDERR "Obtaining document file $document_file_full...\n" if ($verbosity > 1); 
     
    332453 
    333454    # Document specified is on the web 
    334     if ($document_file_full =~ /^http:/ || $document_file_full =~ /^ftp:/) { 
     455    if ($document_file_full =~ /^https?:/ || $document_file_full =~ /^ftp:/) { 
    335456    $document_file_full =~ /([^\/]+)$/; 
    336457    $document_file_name = $1; 
     
    351472    else { 
    352473    my $dir_sep = &util::get_os_dirsep(); 
    353     $document_file_full =~ /(.+$dir_sep)?(.*)$/; 
     474 
     475    $document_file_full =~ m/(.+$dir_sep)?(.*)$/; 
    354476    $document_file_name = $2; 
     477 
     478 
     479    my $is_absolute = File::Spec->file_name_is_absolute($document_file_full); 
     480    print STDERR "doc file full = $document_file_full\n"; 
     481 
     482    if (!$is_absolute) { 
     483        $document_file_full  
     484        = &util::filename_cat($orig_base_dir,$document_file_full); 
     485    } 
     486 
    355487    $local_document_file = &util::filename_cat($documents_directory, $document_file_name); 
    356488 
    357     # Only bother trying to copy the file if it contained some path information 
    358     if ($document_file_full ne $document_file_name) { 
    359         &util::cp($document_file_full, $documents_directory); 
    360  
    361         # Check the document was obtained successfully 
    362         if (!-e $local_document_file) { 
    363         print STDERR "WARNING: Could not obtain document file $document_file_full\n"; 
     489    &util::cp($document_file_full, $documents_directory); 
     490 
     491    # Check the document was obtained successfully 
     492    if (!-e $local_document_file) { 
     493        print STDERR "WARNING: Could not obtain document file $document_file_full\n"; 
     494    } 
     495    else { 
     496        if ($document_file_full =~ m/^$orig_base_dir.*/) { 
     497        # file local to metadata record 
     498        # => copy has been made successfully, so remove original 
     499        &util::rm($document_file_full); 
    364500        } 
    365501    } 
     
    377513 
    378514&main(@ARGV); 
     515