Changeset 2813 for trunk/gsdl


Ignore:
Timestamp:
2001-11-01T22:03:46+13:00 (22 years ago)
Author:
sjboddie
Message:

Altered RecPlug's -use_metadata_files option to use better XML files and
the XML::Parser module to parse them

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/RecPlug.pm

    r2795 r2813  
    2828
    2929# RecPlug has one option: use_metadata_files.  When this is set, it will
    30 # check each directory for an XML file called "metadata" that specifies
    31 # metadata for the files (and subdirectories) in the directory.  It will
    32 # also look in any file of the form *.metadata for metadata about the file
    33 # with the same prefix.
    34 #
    35 # Here's an example of a metadata file that cuses theree metadata structures
     30# check each directory for an XML file called "metadata.xml" that specifies
     31# metadata for the files (and subdirectories) in the directory.
     32#
     33# Here's an example of a metadata file that uses three FileSet structures
    3634# (ignore the # characters):
    3735
    38 #<metadata>
    39 #  <filename>nugget.*</filename>
    40 #  <Title>Nugget Point, The Catlins</Title>
    41 #  <Place mode=accumulate>Nugget Point</Place>
    42 #</metadata>
    43 #
    44 #<metadata>
    45 #  <filename>nugget-point-1.jpg</filename>
    46 #  <Title>Nugget Point Lighthouse, The Catlins</Title>
    47 #  <Subject>Lighthouse</Subject>
    48 #</metadata>
    49 #
    50 #<metadata>
    51 #  <filename>kaka-point-dir</filename>
    52 #  <Title>Kaka Point, The Catlins</Title>
    53 #</metadata>
     36#<?xml version="1.0" encoding="UTF-8" standalone="no"?>
     37#<!DOCTYPE GreenstoneDirectoryMetadata SYSTEM "http://greenstone.org/dtd/GreenstoneDirectoryMetadata/1.0/GreenstoneDirectoryMetadata.dtd">
     38#<DirectoryMetadata>
     39#  <FileSet>
     40#    <FileName>nugget.*</FileName>
     41#    <Description>
     42#      <Metadata name="Title">Nugget Point, The Catlins</Metadata>
     43#      <Metadata name="Place" mode="accumulate">Nugget Point</Metadata>
     44#    </Description>
     45#  </FileSet>
     46#  <FileSet>
     47#    <FileName>nugget-point-1.jpg</FileName>
     48#    <Description>
     49#      <Metadata name="Title">Nugget Point Lighthouse, The Catlins</Metadata>
     50#      <Metadata name="Subject">Lighthouse</Metadata>
     51#    </Description>
     52#  </FileSet>
     53#  <FileSet>
     54#    <FileName>kaka-point-dir</FileName>
     55#    <Description>
     56#      <Metadata name="Title">Kaka Point, The Catlins</Metadata>
     57#    </Description>
     58#  </FileSet>
     59#</DirectoryMetadata>
    5460
    5561# Metadata elements are read and applied to files in the order they appear
    56 # in the file.  The directory's "metadata" file is erad first, and then any
    57 # other files of the form "*.metadata" are read in alphabetical order.
    58 #
    59 # The filename element describes the subfiles in the directory that the
    60 # metadata applies to as a perl regular expression, so
    61 # <filename>nugget.*</filename> indicates that the first metadata record
    62 # applies to every subfile that starts with "nugget".  For these files, a
     62# in the file.
     63#
     64# The FileName element describes the subfiles in the directory that the
     65# metadata applies to as a perl regular expression (a FileSet group may
     66# contain multiple FileName elements). So, <FileName>nugget.*</FileName>
     67# indicates that the metadata records in the following Description block
     68# apply to every subfile that starts with "nugget".  For these files, a
    6369# Title metadata element is set, overriding any old value that the Title
    6470# might have had.
     
    6672# Occasionally, we want to have multiple metadata values applied to a
    6773# document; in this case we use the "mode=accumulate" attribute of the
    68 # particular metadata item.  In the first metadata element above, the
    69 # "Place" metadata is accumulating, and is therefore given several values.
    70 # If we wanted to override these values and use a single metadata element
    71 # again, we could write <Place mode=override>New Zealand</Place> instead.
    72 # Remember: every element is assumed to be in override mode unless you
    73 # specify otherwise, so if you want to accumulate metadata for some field,
    74 # every occurance must have "mode=accumulate" specified.
    75 #
    76 # The second metadata element applies to a specific file, called
    77 # nugget-point-1.jpg.  This element overrides the Title set in the first
    78 # element above, and adds a "Subject" ,etadata field.
    79 #
    80 # The third and fional metadata element sets metadata for a subdirectory
    81 # rather than a file.  The metadata specified (a Title) will be passed into
    82 # the subdirectory and applied to every file that occurs in the
    83 # subdirectory (and to every subsubdirectory and its contents, and so on)
    84 # unless the metadata is explictly overridden later in the import.
     74# particular Metadata element.  In the second metadata element of the first
     75# FileSet above, the "Place" metadata is accumulating, and may therefore be
     76# given several values.  If we wanted to override these values and use a
     77# single metadata element again, we could set the mode attribute to
     78# "override" instead.  Remember: every element is assumed to be in override
     79# mode unless you specify otherwise, so if you want to accumulate metadata
     80# for some field, every occurrence must have "mode=accumulate" specified.
     81#
     82# The second FileSet element above applies to a specific file, called
     83# nugget-point-1.jpg.  This element overrides the Title metadata set in the
     84# first FileSet, and adds a "Subject" metadata field.
     85#
     86# The third and final FileSet sets metadata for a subdirectory rather than
     87# a file.  The metadata specified (a Title) will be passed into the
     88# subdirectory and applied to every file that occurs in the subdirectory
     89# (and to every subsubdirectory and its contents, and so on) unless the
     90# metadata is explicitly overridden later in the import.
    8591
    8692
     
    95101BEGIN {
    96102    @ISA = ('BasPlug');
    97 }
     103    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
     104}
     105
     106use XML::Parser;
    98107
    99108sub print_usage {
     
    108117}
    109118
     119my ($self);
    110120sub new {
    111121    my $class = shift (@_);
    112     my $self = new BasPlug ($class, @_);
    113 
     122   
     123    # $self is global for use within subroutines called by XML::Parser
     124    $self = new BasPlug ($class, @_);
     125   
    114126    if (!parsargv::parse(\@_,
    115127             q^use_metadata_files^, \$self->{'use_metadata_files'},
     
    120132    die "\n";
    121133    }
    122 
     134   
     135    if ($self->{'use_metadata_files'}) {
     136    # create XML::Parser object for parsing metadata.xml files
     137    my $parser = new XML::Parser('Style' => 'Stream',
     138                     'Handlers' => {'Char' => \&Char,
     139                            'Doctype' => \&Doctype
     140                            });
     141    $self->{'parser'} = $parser;
     142    $self->{'in_filename'} = 0;
     143   
     144    }
     145   
    123146    return bless $self, $class;
    124147}
     
    127150sub is_recursive {
    128151    my $self = shift (@_);
    129 
     152   
    130153    return 1;
    131154}
     
    133156sub get_default_block_exp {
    134157    my $self = shift (@_);
    135 
     158   
    136159    return 'CVS';
    137160}
     
    149172    my $self = shift (@_);
    150173    my ($pluginfo, $base_dir, $file, $in_metadata, $processor, $maxdocs) = @_;
    151 
     174   
    152175    my $outhandle = $self->{'outhandle'};
    153176    my $verbosity = $self->{'verbosity'};
    154177    my $read_metadata_files = $self->{'use_metadata_files'};
    155 
     178   
    156179    # Calculate the directory name and ensure it is a directory and
    157180    # that it is not explicitly blocked.
     
    160183    return undef unless (-d $dirname);
    161184    return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/);
    162 
     185   
    163186    # check to make sure we're not reading the archives or index directory
    164187    my $gsdlhome = quotemeta($ENV{'GSDLHOME'});
    165     if ($dirname =~ m%^${gsdlhome}/.*?/import.*?/(archives|index)$%) {
    166         print $outhandle "RecPlug: $dirname appears to be a reference to a Greenstone collection, skipping.\n";
     188    if ($dirname =~ m/^$gsdlhome\/.*?\/import.*?\/(archives|index)$/) {
     189    print $outhandle "RecPlug: $dirname appears to be a reference to a Greenstone collection, skipping.\n";
    167190        return 0;
    168191    }
    169 
     192   
    170193    # check to see we haven't got a cyclic path...
    171194    if ($dirname =~ m%(/.*){,41}%) {
     
    173196    return 0;
    174197    }
    175 
     198   
    176199    # check to see we haven't got a cyclic path...
    177200    if ($dirname =~ m%.*?import/(.+?)/import/\1.*%) {
     
    179202    return 0;
    180203    }
    181 
     204   
    182205    if (($verbosity > 2) && ((scalar keys %$in_metadata) > 0)) {
    183206        print $outhandle "RecPlug: metadata passed in: ",
    184                      join(", ", keys %$in_metadata), "\n";
    185     }
    186 
     207    join(", ", keys %$in_metadata), "\n";
     208    }
     209   
    187210    # Recur over directory contents.
    188211    my (@dir, $subfile);
    189212    my $count = 0;
    190213    print $outhandle "RecPlug: getting directory $dirname\n" if ($verbosity);
    191 
     214   
    192215    # find all the files in the directory
    193216    if (!opendir (DIR, $dirname)) {
     
    197220    @dir = readdir (DIR);
    198221    closedir (DIR);
    199 
     222   
    200223    # read XML metadata files (if supplied)
    201224    my $additionalmetadata = 0;      # is there extra metadata available?
    202225    my %extrametadata;               # maps from filespec to extra metadata keys
    203226    my @extrametakeys;               # keys of %extrametadata in order read
    204 
     227   
    205228    if ($read_metadata_files) {
    206229
    207     # first read the directory "metadata" file
    208     my $metadatafile = &util::filename_cat ($dirname, 'metadata');
     230    # read the directory "metadata.xml" file
     231    my $metadatafile = &util::filename_cat ($dirname, 'metadata.xml');
    209232    if (-e $metadatafile) {
    210233        print $outhandle "RecPlug: found metadata in $metadatafile\n"
    211234        if ($verbosity);
    212         &read_metadata_file($metadatafile, \%extrametadata, \@extrametakeys);
    213         $additionalmetadata = 1;
    214     }
    215 
    216     # then read any files with names of the form *.metadata
    217     foreach $subfile (sort @dir) {
    218         next unless ($subfile =~ /^.*\.metadata$/);
    219         $metadatafile = &util::filename_cat ($dirname, $subfile);
    220         print $outhandle "RecPlug: found metadata in $metadatafile\n"
    221         if ($verbosity);
    222         &read_metadata_file($metadatafile, \%extrametadata, \@extrametakeys);
     235        $self->read_metadata_xml_file($metadatafile, \%extrametadata, \@extrametakeys);
    223236        $additionalmetadata = 1;
    224237    }
     
    231244    last if ($maxdocs != -1 && $count >= $maxdocs);
    232245    next if ($subfile =~ /^\.\.?$/);
    233     next if ($read_metadata_files && $subfile =~ /metadata$/);
    234     print "RecPlug: preparing metadata for $subfile\n" if ($verbosity > 2);
     246    next if ($read_metadata_files && $subfile =~ /metadata\.xml$/);
     247    print $outhandle "RecPlug: preparing metadata for $subfile\n" if ($verbosity > 2);
    235248
    236249    # Make a copy of $in_metadata to pass to $subfile
     
    265278
    266279# Read a manually-constructed metadata file and store the data
    267 # it contains in the $metadataerf structure.
     280# it contains in the $metadataref structure.
    268281#
    269282# (metadataref is a reference to a hash whose keys are filenames
    270283# and whose values are metadata hash structures.) 
    271284
    272 sub read_metadata_file {
     285sub read_metadata_xml_file {
     286    my $self = shift(@_);
    273287    my ($filename, $metadataref, $metakeysref) = @_;
    274    
    275     my ($metadatafiletext, $metatext);
    276     my ($target, $targetdataref, $default_target, $tag, $key, $value);
    277 
    278     # Read the file
    279     open(MTDT, "<$filename");
    280     $metadatafiletext = join(' ', <MTDT>);
    281     $metadatafiletext =~ s/\s+/ /go;
    282     close MTDT;
    283    
    284     # set default filespec for *.metadata files
    285     if ($filename =~ /\.metadata$/) {
    286     $default_target = $filename;
    287     $default_target =~ s/.*\///o;
    288     $default_target =~ s/\.metadata$//;
    289     } else {
    290     $default_target = '';
    291     }
    292 
    293     # split the file into sections on "metadata" tag
    294     foreach $metatext (split(/\<metadata\>/, $metadatafiletext)) {
    295     # print "metadata text: $metatext\n";
    296    
    297     # split the metadata set into sections on each field tag
    298     $target = $default_target;
    299     $targetdataref = {};
    300     foreach $tag (split(/</, $metatext)) {
    301         next if ($tag =~ m"^/");
    302         next if ($tag !~ m/>/);
    303        
    304         ($key, $value) = split(/>/, $tag);
    305         # print "$key -> $value\n";
    306 
    307         if ($key eq 'filename') {
    308         $target = $value;
     288    $self->{'metadataref'} = $metadataref;
     289    $self->{'metakeysref'} = $metakeysref;
     290   
     291    eval {
     292    $self->{'parser'}->parsefile($filename);
     293    };
     294    if ($@) {
     295    my $outhandle = $self->{'outhandle'};
     296    print $outhandle "RecPlug: Warning: Ignoring $filename because it is not a well formed metadata.xml file\n";
     297    return;
     298    }
     299}
     300
     301sub Doctype {
     302    my ($expat, $name, $sysid, $pubid, $internal) = @_;
     303    die if ($name ne "GreenstoneDirectoryMetadata");
     304}
     305
     306sub StartTag {
     307    my ($expat, $element) = @_;
     308   
     309    if ($element eq "FileSet") {
     310    $self->{'saved_targets'} = [];
     311    $self->{'saved_metadata'} = {};
     312    }
     313    elsif ($element eq "FileName") {
     314    $self->{'in_filename'} = 1;
     315    }
     316    elsif ($element eq "Metadata") {
     317    $self->{'metadata_name'} = $_{'name'};
     318    if ((defined $_{'mode'}) && ($_{'mode'} eq "accumulate")) {
     319        $self->{'metadata_accumulate'} = 1;
     320    } else {
     321        $self->{'metadata_accumulate'} = 0;
     322    }
     323    }
     324}
     325
     326sub EndTag {
     327    my ($expat, $element) = @_;
     328
     329    if ($element eq "FileSet") {
     330    push (@{$self->{'metakeysref'}}, @{$self->{'saved_targets'}});
     331    foreach my $target (@{$self->{'saved_targets'}}) {
     332        $self->{'metadataref'}->{$target} = $self->{'saved_metadata'};
     333    }
     334    }
     335    elsif ($element eq "FileName") {
     336    $self->{'in_filename'} = 0;
     337    }
     338    elsif ($element eq "Metadata") {
     339    $self->{'metadata_name'} = "";
     340    }
     341
     342}
     343
     344sub Text {
     345
     346    if ($self->{'in_filename'}) {
     347    # $_ == FileName content
     348    push (@{$self->{'saved_targets'}}, $_);
     349    }
     350    elsif (defined ($self->{'metadata_name'}) && $self->{'metadata_name'} ne "") {
     351    # $_ == Metadata content
     352    my $mname = $self->{'metadata_name'};
     353    if (defined $self->{'saved_metadata'}->{$mname}) {
     354        if ($self->{'metadata_accumulate'}) {
     355        # accumulate mode - add value to existing value(s)
     356        if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") {
     357            push (@{$self->{'saved_metadata'}->{$mname}}, $_);
     358        } else {
     359            $self->{'saved_metadata'}->{$mname} =
     360            [$self->{'saved_metadata'}->{$mname}, $_];
     361        }
    309362        } else {
    310 
    311         # a metadata field can be flagged as accumulated or overridden
    312         my $accumulateflag = 0;
    313         my $overrideflag = 0;
    314         if ($key =~ / mode=a.*/io) {
    315             $accumulateflag = 1;
    316         } elsif ($key =~ / mode=o.*/io) {
    317             $overrideflag = 1;
    318         }
    319         $key =~ s/ mode=.*$//io;
    320 
    321         # set the metadata value, using an array for accumulating fields
    322         # and a scalar for override fields
    323         if ($accumulateflag) {
    324             # the accumulate flag directs us to accumulate metadata values
    325             if (!defined $targetdataref->{$key}) {
    326             # there is no existing value for this field
    327             $targetdataref->{$key} = [$value];
    328             } elsif (ref ($targetdataref->{$key}) eq "ARRAY") {
    329             # we already have an array of values for this field
    330             my $aref = $targetdataref->{$key};
    331             push @$aref, $value;
    332             } else {
    333             # we have a scalar for this field - convert to array
    334             $targetdataref->{$key} = [$targetdataref->{$key}, $value];
    335             }
    336         } elsif ($overrideflag) {
    337             # the override flag directs us to override exising values
    338             $targetdataref->{$key} = $value;
    339         } elsif (!defined $targetdataref->{$key}) {
    340             # there is no flag, and no existing value: default to override mode
    341             # In the future, I should let the user specify the default mode.
    342             $targetdataref->{$key} = $value;
    343         } elsif (ref ($targetdataref->{$key}) eq "ARRAY") {
    344             # there is no flag, and we're already in accumulate mode
    345             my $aref = $targetdataref->{$key};
    346             push @$aref, $value;
    347         } else {
    348             # there is no flag, and we're already in override mode
    349             $targetdataref->{$key} = $value;
    350         }
     363        # override mode
     364        $self->{'saved_metadata'}->{$mname} = $_;
    351365        }
    352     }
    353 
    354     # store this metadata information in the metadata ref
    355     if ($target) {
    356         push @$metakeysref, $target;
    357         $metadataref->{$target} = $targetdataref;
    358     }
    359     }
    360 }
    361 
     366    } else {
     367        if ($self->{'metadata_accumulate'}) {
     368        # accumulate mode - add value into (currently empty) array
     369        $self->{'saved_metadata'}->{$mname} = [$_];
     370        } else {
     371        # override mode
     372        $self->{'saved_metadata'}->{$mname} = $_;
     373        }
     374    }
     375    }
     376}
     377
     378# This Char function overrides the one in XML::Parser::Stream to overcome a
     379# problem where $expat->{Text} is treated as the return value, slowing
     380# things down significantly in some cases.
     381sub Char {
     382  $_[0]->{'Text'} .= $_[1];
     383  return undef;
     384}
    362385
    363386# Combine two metadata structures.  Given two references to metadata
     
    438461
    4394621;
    440 
    441 
    442 
Note: See TracChangeset for help on using the changeset viewer.