Changeset 8563


Ignore:
Timestamp:
2004-11-16T13:17:06+13:00 (19 years ago)
Author:
mdewsnip
Message:

Ripped all the code for obtaining referenced documents and exploding the database out into a new Perl script (bin/script/process_metadata_databases.pl) and changed the subfield character from "." to "^".

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/ISISPlug.pm

    r8121 r8563  
    6565    'type' => "string",
    6666    'reqd' => "no",
    67     'deft' => ", " },
    68       { 'name' => "document_field",
    69     'desc' => "{ISISPlug.document_field}",
    70     'type' => "string",
    71     'reqd' => "no",
    72     'deft' => "" },
    73       { 'name' => "document_prefix",
    74     'desc' => "{ISISPlug.document_prefix}",
    75     'type' => "string",
    76     'reqd' => "no",
    77     'deft' => "" },
    78       { 'name' => "document_suffix",
    79     'desc' => "{ISISPlug.document_suffix}",
    80     'type' => "string",
    81     'reqd' => "no",
    82     'deft' => "" }
     67    'deft' => ", " }
    8368      ];
    8469
     
    10893
    10994
    110 sub new {
     95sub new
     96{
    11197    my $class = shift(@_);
    11298
     
    115101             q^subfield_separator/.*/, ^, \$self->{'subfield_separator'},
    116102             q^entry_separator/.*/<br>^, \$self->{'entry_separator'},
    117              q^document_field/.*/^, \$self->{'document_field'},
    118              q^document_prefix/.*/^, \$self->{'document_prefix'},
    119              q^document_suffix/.*/^, \$self->{'document_suffix'},
    120103             "allow_extra_options")) {
    121104    print STDERR "\nIncorrect options passed to ISISPlug, check your collect.cfg configuration file\n";
     
    132115
    133116
    134 sub read
    135 {
    136     my $self = shift(@_);
    137     my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
    138 
    139     my $result = &SplitPlug::read($self, @_);
    140     if ($file =~ /$self->{'process_exp'}/ && $self->{'document_field'}) {
    141     &end_metadata_xml_file($self->{'documents_metadata_xml_file'});
    142     }
    143 
    144     return $result;
    145 }
    146 
    147 
    148 sub read_file {
     117sub read_file
     118{
    149119    my $self = shift (@_);
    150120    my ($filename, $encoding, $language, $textref) = @_;
    151121
    152     my ($databasename) = ($filename =~ /([^\.]+)\.mst$/i);
     122    my ($databasename) = ($filename =~ /(.*)\.mst$/i);
    153123
    154124    # Check the associated .fdt and .xrf files exist
     
    168138
    169139    my $reader = new multiread();
    170     $reader->set_handle ('ISISPlug::FILE');
    171     $reader->set_encoding ($encoding);
    172     $reader->read_file ($textref);
     140    $reader->set_handle('ISISPlug::FILE');
     141    $reader->set_encoding($encoding);
     142    $reader->read_file($textref);
    173143
    174144    close(FILE);
     
    182152    # Remove the line at the start so it is split and processed properly
    183153    $$textref =~ s/^----------\n//;
    184 
    185     # Obtain the documents specified in the CDS/ISIS database, if requested
    186     if ($self->{'document_field'}) {
    187     # Create a directory to store the document files
    188     $self->{'documents_directory'} = $databasename . ".all";
    189     if (-e $self->{'documents_directory'}) {
    190         &util::rm_r($self->{'documents_directory'});
    191     }
    192     &util::mk_dir($self->{'documents_directory'});
    193 
    194     # ...and a metadata.xml file for the document metadata (extracted from the database)
    195     $self->{'documents_metadata_xml_file'} = &util::filename_cat($self->{'documents_directory'}, "metadata.xml");
    196     if (-e $self->{'documents_metadata_xml_file'}) {
    197         &util::rm($self->{'documents_metadata_xml_file'});
    198     }
    199     &begin_metadata_xml_file($self->{'documents_metadata_xml_file'});
    200     }
    201154}
    202155
     
    244197        my $subfieldname = "";
    245198        if ($rawtagvalue =~ s/^\^([a-z])//) {
    246             $subfieldname = "." . $1;
     199            $subfieldname = "^$1";
    247200        }
    248201
     
    251204        my $metadatafieldname = $tagname . $subfieldname;
    252205        my $metadatafieldvalue = $1;
    253         # print "Metadata: $metadatafieldname -> $metadatafieldvalue\n";
    254206
    255207        # Handle Keywords specially
     
    278230    }
    279231
    280     # print "Metadata: $tagname.all -> $completetagvalue\n";
    281     $doc_obj->add_utf8_metadata($cursection, $tagname . ".all", $completetagvalue);
    282     }
    283     # print "\n";
    284     # Add fileFormat as the metadata
    285     $doc_obj->add_metadata($cursection, "FileFormat", "CDS/ISIS");
     232    $doc_obj->add_utf8_metadata($cursection, $tagname . "^all", $completetagvalue);
     233    }
     234
    286235    # Add the full record as the document text
    287236    $$textref =~ s/\</&lt;/g;
    288237    $$textref =~ s/\>/&gt;/g;
    289     $doc_obj->add_utf8_text ($cursection, $$textref);
    290 
    291     # Obtain the documents specified in the CDS/ISIS database, if requested
    292     if ($self->{'document_field'}) {
    293     my $document_field = $self->{'document_field'};
    294     my $document_prefix = $self->{'document_prefix'} || "";
    295     my $document_suffix = $self->{'document_suffix'} || "";
    296 
    297     my $documents_directory = $self->{'documents_directory'};
    298     my $document_obtained = 0;
    299 
    300     # Look at all the metadata assigned to this record
    301     my $record_metadata = $doc_obj->get_all_metadata($cursection);
    302     foreach my $pair (@$record_metadata) {
    303         my ($field, $value) = (@$pair);
    304 
    305         # Does this metadata element specify a document to obtain?
    306         if ($field eq $document_field) {
    307         my $document_file_full = $document_prefix . $value . $document_suffix;
    308 
    309         my $document_file = &obtain_document($self, $document_file_full, $documents_directory);
    310         if ($document_file) {
    311             $document_obtained = 1;
    312             &write_metadata_xml_file($self->{'documents_metadata_xml_file'},
    313                          $document_file, $record_metadata);
    314         }
    315         }
    316     }
    317 
    318     # If there was a document obtained for this record we don't want the record as well
    319     if ($document_obtained) {
    320         return 0;
    321     }
    322     }
     238    $doc_obj->add_utf8_text($cursection, $$textref);
     239
     240    # Add FileFormat metadata
     241    $doc_obj->add_utf8_metadata($cursection, "FileFormat", "CDS/ISIS");
    323242
    324243    # Record was processed successfully (and there was no document obtained)
     
    364283
    365284
    366 sub obtain_document
    367 {
    368     my $self = shift(@_);
    369     my $document_file_full = shift(@_);
    370     my $documents_directory = shift(@_);
    371 
    372     my $outhandle = $self->{'outhandle'};
    373     print $outhandle "Obtaining document file $document_file_full...\n"
    374     if ($self->{'verbosity'} > 1);
    375 
    376     my $document_file_name;
    377     my $local_document_file;
    378 
    379     # Document specified is on the web
    380     if ($document_file_full =~ /^http:/ || $document_file_full =~ /^ftp:/) {
    381     $document_file_full =~ /([^\/]+)$/;
    382     $document_file_name = $1;
    383     $local_document_file = &util::filename_cat($documents_directory, $document_file_name);
    384 
    385     my $wget_options = "--quiet";
    386     $wget_options = "--verbose" if ($self->{'verbosity'} > 2);
    387     $wget_options .= " --timestamping";  # Only re-download files if they're newer
    388     `wget $wget_options $document_file_full --output-document $local_document_file`;
    389     }
    390     # Document specified is on the disk
    391     else {
    392     my $dir_sep = &util::get_os_dirsep();
    393     $document_file_full =~ /(.+$dir_sep)?(.*)$/;
    394     $document_file_name = $2;
    395     $local_document_file = &util::filename_cat($documents_directory, $document_file_name);
    396 
    397     &util::cp($document_file_full, $documents_directory);
    398     }
    399 
    400     # Check the document was obtained successfully
    401     if (!-e $local_document_file) {
    402     print STDERR "WARNING: Could not obtain document file $document_file_full\n";
    403     return undef;
    404     }
    405 
    406     return $document_file_name;
    407 }
    408 
    409 
    410 sub begin_metadata_xml_file
    411 {
    412     my $metadata_xml_file = shift(@_);
    413 
    414     open(METADATA_XML_FILE, ">$metadata_xml_file");
    415     print METADATA_XML_FILE
    416     "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" .
    417     "<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" .
    418     "<DirectoryMetadata>\n";
    419     close(METADATA_XML_FILE);
    420 }
    421 
    422 
    423 sub write_metadata_xml_file
    424 {
    425     my $metadata_xml_file = shift(@_);
    426     my $file_name = shift(@_);
    427     my $record_metadata = shift(@_);
    428 
    429     # Make $file_name XML-safe
    430     $file_name =~ s/</&lt;/g;
    431     $file_name =~ s/>/&gt;/g;
    432 
    433     open(METADATA_XML_FILE, ">>$metadata_xml_file");
    434 
    435     print METADATA_XML_FILE
    436     "\n" .
    437         "  <FileSet>\n" .
    438     "    <FileName>$file_name</FileName>\n" .
    439     "    <Description>\n";
    440 
    441     foreach my $pair (@$record_metadata) {
    442     my ($field, $value) = (@$pair);
    443 
    444     # We're only interested in metadata from the database
    445     next if ($field eq "gsdlsourcefilename");
    446     next if ($field eq "gsdldoctype");
    447     next if ($field eq "Language");
    448     next if ($field eq "Encoding");
    449     next if ($field eq "Identifier");
    450     next if ($field eq "Source");
    451     next if ($field eq "SourceSegment");
    452     next if ($field eq "Plugin");
    453 
    454     # Make $value XML-safe
    455     $value =~ s/</&lt;/g;
    456     $value =~ s/>/&gt;/g;
    457    
    458     print METADATA_XML_FILE "      <Metadata name=\"$field\">$value</Metadata>\n";
    459     }
    460 
    461     print METADATA_XML_FILE
    462     "    </Description>\n" .
    463         "  </FileSet>\n";
    464 
    465     close(METADATA_XML_FILE);
    466 }
    467 
    468 
    469 sub end_metadata_xml_file
    470 {
    471     my $metadata_xml_file = shift(@_);
    472 
    473     open(METADATA_XML_FILE, ">>$metadata_xml_file");
    474     print METADATA_XML_FILE "\n</DirectoryMetadata>\n";
    475     close(METADATA_XML_FILE);
    476 }
    477 
    478 
    4792851;
Note: See TracChangeset for help on using the changeset viewer.