Changeset 9147


Ignore:
Timestamp:
2005-02-23T14:01:04+13:00 (19 years ago)
Author:
kjdon
Message:

Added many more options, mainly to do with downloading files.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/explode_metadata_database.pl

    r9121 r9147  
    2525    'desc' => "{explode.encoding}",
    2626    'type' => "enum",
    27     'deft' => "iso_8859_1",
     27    'deft' => "auto",
    2828    'list' => $unicode_list,
    29     'reqd' => "no" } ,
     29    'reqd' => "no" },
    3030      { 'name' => "metadata_set",
    3131    'desc' => "{explode.metadata_set}",
    3232    'type' => "string",
    33     'reqd' => "no" } ,
     33    'reqd' => "no",
     34    'hiddengli' => "yes"},
    3435      { 'name' => "plugin",
    3536    'desc' => "{explode.plugin}",
    3637    'type' => "string",
    37     'reqd' => "yes" },
     38    'reqd' => "yes",
     39    'hiddengli' => "yes"},
     40      { 'name' => "document_field",
     41    'desc' => "{explode.document_field}",
     42    'type' => "string",
     43    'reqd' => "no"},
     44       { 'name' => "document_prefix",
     45    'desc' => "{explode.document_prefix}",
     46    'type' => "string",
     47    'reqd' => "no"},
     48      { 'name' => "document_suffix",
     49    'desc' => "{explode.document_suffix}",
     50    'type' => "string",
     51    'reqd' => "no"},
    3852      { 'name' => "filename_field",
    3953    'desc' => "{explode.filename_field}",
    4054    'type' => "string",
    41     'reqd' => "no"}
     55    'reqd' => "no"}     
    4256      ];
    4357   
     
    4963sub main
    5064{
    51     my ($encoding, $metadata_set, $plugin, $filename_field);
    52 
    53    
     65    my ($encoding, $metadata_set, $plugin, $filename_field,
     66    $document_field, $document_prefix, $document_suffix);
     67
     68    my $xml = 0;
    5469    # Parse command line arguments
    5570    if (!parsargv::parse(\@ARGV,
     71             'language/.*/', \$language,
    5672             'input_encoding/.*/auto', \$encoding,
    5773             'metadata_set/.*/', \$metadata_set,
    5874             'plugin/.*/', \$plugin,
    59              'filename_field/.*/', \$filename_field)) {
     75             'filename_field/.*/', \$filename_field,
     76             'document_field/.*/', \$document_field,
     77             'document_prefix/.*/', \$document_prefix,
     78             'document_suffix/.*/', \$document_suffix,
     79             q^xml^, \$xml)) {
    6080    &PrintUsage::print_txt_usage($options, "{explode.params}");
    6181    die "\n";
     82    }
     83   
     84    # If $language has been specified, load the appropriate resource bundle
     85    # (Otherwise, the default resource bundle will be loaded automatically)
     86    if ($language) {
     87    &gsprintf::load_language_specific_resource_bundle($language);
     88    }
     89
     90    if ($xml) {
     91        &PrintUsage::print_xml_usage($options);
     92    print "\n";
     93    return;
    6294    }
    6395
     
    89121
    90122    #check filename field
    91    
     123    if (defined $filename_field && $filename_field eq "") {
     124    undef $filename_field;
     125    }
    92126    my $plugobj;
    93127    require "$plugin.pm";
     
    137171    my $doc_obj = new doc($filename, "nonindexed_doc");
    138172    $plugobj->process(\$record_text, undef, undef, $filename, undef, $doc_obj, 0);
    139     # try to get a file name
     173    # Get all the metadata assigned to this record
     174    my $record_metadata = $doc_obj->get_all_metadata($doc_obj->get_top_section());
    140175    my $document_file;
    141     if (defined $filename_field) {
    142         my $meta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $filename_field);
    143         if (defined $meta) {
    144         $meta =~ s/&\w{1,10};//g; # remove entities
    145         $document_file = "$meta.nul";
    146         my $num = 0;
    147         while (-e "$documents_directory/$document_file") {
    148             $num++;
    149             $document_file = "$meta$num.nul";
     176   
     177    # try to get a doc to attach the metadata to
     178    if (defined $document_field) {
     179        foreach my $pair (@$record_metadata) {
     180        my ($field, $value) = (@$pair);
     181       
     182        # Does this metadata element specify a document to obtain?
     183        if ($field eq $document_field) {
     184            my $document_file_full = $document_prefix . $value . $document_suffix;
     185            $document_file = &obtain_document($self, $document_file_full, $documents_directory);
     186        }
     187        }
     188    }
     189    # do we need to create a dummy doc??
     190    if (not defined $document_file) {
     191        # try to get a file name
     192        if (defined $filename_field) {
     193        my $meta = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $filename_field);
     194        if (defined $meta) {
     195            $meta =~ s/&\w{1,10};//g; # remove entities
     196            $document_file = "$meta.nul";
     197            my $num = 0;
     198            while (-e "$documents_directory/$document_file") {
     199            $num++;
     200            $document_file = "$meta$num.nul";
     201            }
     202        } else {
     203            $record_number = $record_number + 1;
     204            $document_file = sprintf("%4.4d", $record_number) . ".nul";
    150205        }
    151206        } else {
     
    153208        $document_file = sprintf("%4.4d", $record_number) . ".nul";
    154209        }
    155     } else {
    156         $record_number = $record_number + 1;
    157         $document_file = sprintf("%4.4d", $record_number) . ".nul";
     210        open(DUMMY_FILE, ">$documents_directory/$document_file");
     211        close(DUMMY_FILE);
     212
    158213    }
    159214   
    160     open(DUMMY_FILE, ">$documents_directory/$document_file");
    161     close(DUMMY_FILE);
    162 
    163     # Look at all the metadata assigned to this record
    164     my $record_metadata = $doc_obj->get_all_metadata($cursection);
     215
    165216    &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
    166217    }
     
    171222
    172223    # Explode means just that: the original file is deleted
    173     #&util::rm($filename);
     224    &util::rm($filename);
    174225}
    175226
     
    231282}
    232283
     284sub obtain_document
     285{
     286    my $self = shift(@_);
     287    my $document_file_full = shift(@_);
     288    my $documents_directory = shift(@_);
     289
     290    my $outhandle = $self->{'outhandle'};
     291    print $outhandle "Obtaining document file $document_file_full...\n"
     292    if ($self->{'verbosity'} > 1);
     293
     294    my $document_file_name;
     295    my $local_document_file;
     296
     297    # Document specified is on the web
     298    if ($document_file_full =~ /^http:/ || $document_file_full =~ /^ftp:/) {
     299    $document_file_full =~ /([^\/]+)$/;
     300    $document_file_name = $1;
     301    $local_document_file = &util::filename_cat($documents_directory, $document_file_name);
     302
     303    my $wget_options = "--quiet";
     304    $wget_options = "--verbose" if ($self->{'verbosity'} > 2);
     305    $wget_options .= " --timestamping";  # Only re-download files if they're newer
     306    `wget $wget_options $document_file_full --output-document $local_document_file`;
     307    }
     308    # Document specified is on the disk
     309    else {
     310    my $dir_sep = &util::get_os_dirsep();
     311    $document_file_full =~ /(.+$dir_sep)?(.*)$/;
     312    $document_file_name = $2;
     313    $local_document_file = &util::filename_cat($documents_directory, $document_file_name);
     314
     315    &util::cp($document_file_full, $documents_directory);
     316    }
     317
     318    # Check the document was obtained successfully
     319    if (!-e $local_document_file) {
     320    print STDERR "WARNING: Could not obtain document file $document_file_full\n";
     321    return undef;
     322    }
     323
     324    return $document_file_name;
     325}
    233326
    234327&main(@ARGV);
Note: See TracChangeset for help on using the changeset viewer.