Changeset 2228


Ignore:
Timestamp:
2001-03-27T14:49:14+12:00 (23 years ago)
Author:
paynter
Message:

The -use_metadata_files option tells RecPlug to read any metadata XML files
it finds in the import directories and to add their metadata to the files
they describe. A few other changes: metadata is passed recursively down
the directory tree, collect.cfg-line arguments are parsed correctly, and
block expressions are inherited from BasPlug and blocked appropriately
(default block_exp: CVS).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/RecPlug.pm

    r1810 r2228  
    2424###########################################################################
    2525
    26 # plugin which recurses through directories processing
    27 # each file it finds
     26# RecPlug is a plugin which recurses through directories processing
     27# each file it finds.
     28
     29# RecPlug has one option: use_metadata_files.  When this is set, it will
     30# check each directory for an XML file called "metadata" that specifies
     31# metadata for the files (and subdirectories) in the directory.  It will
     32# also look in any file of the form *.metadata for metadata about the file
     33# with the same prefix.
     34#
     35# Here's an example of a metadata file that cuses theree metadata structures
     36# (ignore the # characters):
     37
     38#<metadata>
     39#  <filename>nugget.*</filename>
     40#  <Title>Nugget Point, The Catlins</Title>
     41#  <Place mode=accumulate>Nugget Point</Place>
     42#</metadata>
     43#
     44#<metadata>
     45#  <filename>nugget-point-1.jpg</filename>
     46#  <Title>Nugget Point Lighthouse, The Catlins</Title>
     47#  <Subject>Lighthouse</Subject>
     48#</metadata>
     49#
     50#<metadata>
     51#  <filename>kaka-point-dir</filename>
     52#  <Title>Kaka Point, The Catlins</Title>
     53#</metadata>
     54
     55# Metadata elements are read and applied to files in the order they appear
     56# in the file.  The directory's "metadata" file is erad first, and then any
     57# other files of the form "*.metadata" are read in alphabetical order.
     58#
     59# The filename element describes the subfiles in the directory that the
     60# metadata applies to as a perl regular expression, so
     61# <filename>nugget.*</filename> indicates that the first metadata record
     62# applies to every subfile that starts with "nugget".  For these files, a
     63# Title metadata element is set, overriding any old value that the Title
     64# might have had.
     65#
     66# Occasionally, we want to have multiple metadata values applied to a
     67# document; in this case we use the "mode=accumulate" attribute of the
     68# particular metadata item.  In the first metadata element above, the
     69# "Place" metadata is accumulating, and is therefore given several values.
     70# If we wanted to override these values and use a single metadata element
     71# again, we could write <Place mode=override>New Zealand</Place> instead.
     72# Remember: every element is assumed to be in override mode unless you
     73# specify otherwise, so if you want to accumulate metadata for some field,
     74# every occurance must have "mode=accumulate" specified.
     75#
     76# The second metadata element applies to a specific file, called
     77# nugget-point-1.jpg.  This element overrides the Title set in the first
     78# element above, and adds a "Subject" ,etadata field.
     79#
     80# The third and fional metadata element sets metadata for a subdirectory
     81# rather than a file.  The metadata specified (a Title) will be passed into
     82# the subdirectory and applied to every file that occurs in the
     83# subdirectory (and to every subsubdirectory and its contents, and so on)
     84# unless the metadata is explictly overridden later in the import.
     85
     86
    2887
    2988package RecPlug;
     
    3897}
    3998
     99sub print_usage {
     100    my ($plugin_name) = @_;
     101
     102    print STDERR "
     103  usage: plugin RecPlug [options]
     104
     105   -use_metadata_files  Read metadata from metadata XML files.
     106
     107"
     108}
     109
    40110sub new {
    41     my ($class) = @_;
    42     my $self = new BasPlug ("RecPlug", @_);
    43 
    44     $self->{'exclude_tail_dirs'} = []; # empty by default
     111    my $class = shift (@_);
     112    my $self = new BasPlug ($class, @_);
     113
     114    if (!parsargv::parse(\@_,
     115             q^use_metadata_files^, \$self->{'use_metadata_files'},
     116             "allow_extra_options")) {
     117    print STDERR "\nRecPlug uses an incorrect option.\n";
     118    print STDERR "Check your collect.cfg configuration file.\n\n";
     119        &print_usage("RecPlug");
     120    die "\n";
     121    }
    45122
    46123    return bless $self, $class;
     
    54131}
    55132
     133sub get_default_block_exp {
     134    my $self = shift (@_);
     135
     136    return 'CVS';
     137}
    56138
    57139# return number of files processed, undef if can't process
    58140# Note that $base_dir might be "" and that $file might
    59141# include directories
     142
     143# This function passes around metadata hash structures.  Metadata hash
     144# structures are hashes that map from a (scalar) key (the metadata element
     145# name) to either a scalar metadata value or a reference to an array of
     146# such values.
     147
    60148sub read {
    61149    my $self = shift (@_);
    62     my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
     150    my ($pluginfo, $base_dir, $file, $in_metadata, $processor, $maxdocs) = @_;
     151
    63152    my $outhandle = $self->{'outhandle'};
    64 
    65     foreach my $etd ( @{$self->{'exclude_tail_dirs'}} )
    66     {
    67     return 0 if ($file =~ m/$etd/);
    68     }   
    69 
    70     my (@dir, $subfile);
    71     my $count = 0;
    72 
    73     # see if this is a directory
     153    my $verbosity = $self->{'verbosity'};
     154    my $read_metadata_files = $self->{'use_metadata_files'};
     155
     156    # Calculate the directory name and ensure it is a directory and
     157    # that it is not explicitly blocked.
     158    $file =~ s/^[\/\\]+//;
    74159    my $dirname = &util::filename_cat ($base_dir, $file);
    75 
    76     # check to make sure we're not reading our own archives
    77     # or index directory
    78     my $gsdlhome = quotemeta ($ENV{'GSDLHOME'});
     160    return undef unless (-d $dirname);
     161    return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/);
     162
     163
     164    # check to make sure we're not reading the archives or index directory
     165    my $gsdlhome = quotemeta($ENV{'GSDLHOME'});
    79166    if ($dirname =~ m%^${gsdlhome}/.*?/import.*?/(archives|index)$%) {
    80     print $outhandle "RecPlug: $dirname appears to be a reference to a Greenstone collection, skipping.\n";
    81     return 1;
     167        print $outhandle "RecPlug: $dirname appears to be a reference to a Greenstone collection, skipping.\n";
     168        return 0;
    82169    }
    83170
     
    85172    if ($dirname =~ m%(/.*){,41}%) {
    86173    print $outhandle "RecPlug: $dirname is 40 directories deep, is this a recursive path? if not increase constant in RecPlug.pm.\n";
    87     return 1;
     174    return 0;
    88175    }
    89176
    90177    # check to see we haven't got a cyclic path...
    91178    if ($dirname =~ m%.*?import/(.+?)/import/\1.*%) {
    92     print $outhandle "RecPlug: $dirname appears to a recursive loop ...\n";
    93     return 1;
    94     }
    95 
    96 
    97     if (-d $dirname) {
    98 
    99     if ($dirname =~ m|/CVS$|) {
    100         print $outhandle "RecPlug: $dirname is a CVS directory, skipping.\n";
    101         return 1;
    102     }
    103     # read all the files in the directory
    104         if (!opendir (DIR, $dirname))
    105         {
    106             print $outhandle "RecPlug: WARNING - couldn't read directory $dirname\n";
    107             return;
    108         }
    109 
    110     @dir = readdir (DIR);
    111     closedir (DIR);
    112 
    113     print $outhandle "RecPlug: getting directory $dirname\n";
    114 
    115     # process each file
    116     foreach $subfile (@dir) {
    117         last if ($maxdocs != -1 && $count >= $maxdocs);
    118 
    119         if ($subfile !~ /^\.\.?$/) {
    120         # note: metadata is not carried on to the next level
    121         $count += &plugin::read ($pluginfo, $base_dir, &util::filename_cat($file, $subfile),
    122                      {}, $processor, $maxdocs);
     179    print $outhandle "RecPlug: $dirname appears to be in a recursive loop...\n";
     180    return 0;
     181    }
     182
     183    if (($verbosity > 2) && ((scalar keys %$in_metadata) > 0)) {
     184        print $outhandle "RecPlug: metadata passed in: ",
     185                     join(", ", keys %$in_metadata), "\n";
     186    }
     187
     188    # Recur over directory contents.
     189    my (@dir, $subfile);
     190    my $count = 0;
     191    print $outhandle "RecPlug: getting directory $dirname\n" if ($verbosity);
     192
     193    # find all the files in the directory
     194    if (!opendir (DIR, $dirname)) {
     195    print $outhandle "RecPlug: WARNING - couldn't read directory $dirname\n";
     196    return undef;
     197    }
     198    @dir = readdir (DIR);
     199    closedir (DIR);
     200
     201    # read XML metadata files (if supplied)
     202    my $additionalmetadata = 0;      # is there extra metadata available?
     203    my %extrametadata;               # maps from filespec to extra metadata keys
     204    my @extrametakeys;               # keys of %extrametadata in order read
     205
     206    if ($read_metadata_files) {
     207
     208    # first read the directory "metadata" file
     209    my $metadatafile = &util::filename_cat ($dirname, 'metadata');
     210    if (-e $metadatafile) {
     211        print $outhandle "RecPlug: found metadata in $metadatafile\n"
     212        if ($verbosity);
     213        &read_metadata_file($metadatafile, \%extrametadata, \@extrametakeys);
     214        $additionalmetadata = 1;
     215    }
     216
     217    # then read any files with names of the form *.metadata
     218    foreach $subfile (sort @dir) {
     219        next unless ($subfile =~ /^.*\.metadata$/);
     220        $metadatafile = &util::filename_cat ($dirname, $subfile);
     221        print $outhandle "RecPlug: found metadata in $metadatafile\n"
     222        if ($verbosity);
     223        &read_metadata_file($metadatafile, \%extrametadata, \@extrametakeys);
     224        $additionalmetadata = 1;
     225    }
     226    }
     227
     228    # import each of the files in the directory
     229    my $out_metadata;
     230    foreach $subfile (@dir) {
     231       
     232    last if ($maxdocs != -1 && $count >= $maxdocs);
     233    next if ($subfile =~ /^\.\.?$/);
     234    next if ($read_metadata_files && $subfile =~ /metadata$/);
     235    print "RecPlug: preparing metadata for $subfile\n" if ($verbosity > 2);
     236
     237    # Make a copy of $in_metadata to pass to $subfile
     238    $out_metadata = {};
     239    &combine_metadata_structures($out_metadata, $in_metadata);
     240
     241    # Next add metadata read in XML files (if it is supplied)
     242    if ($additionalmetadata == 1) {
     243
     244        my ($filespec, $mdref);
     245        foreach $filespec (@extrametakeys) {
     246        if ($subfile =~ /$filespec/) {
     247            print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n"
     248            if ($verbosity > 2);
     249            $mdref = $extrametadata{$filespec};
     250            &combine_metadata_structures($out_metadata, $mdref);
     251        }
    123252        }
    124253    }
    125     return $count;
    126     }
    127 
    128     # wasn't a directory, someone else will have to process it
    129     return undef;
     254   
     255    # Recursively read each $subfile
     256    print $outhandle "RecPlug recurring: $subfile\n" if ($verbosity > 2);
     257    $count += &plugin::read ($pluginfo, $base_dir,
     258                 &util::filename_cat($file, $subfile),
     259                 $out_metadata, $processor, $maxdocs);
     260    }
     261    return $count;
     262   
     263}
     264
     265
     266
     267# Read a manually-constructed metadata file and store the data
     268# it contains in the $metadataerf structure.
     269#
     270# (metadataref is a reference to a hash whose keys are filenames
     271# and whose values are metadata hash structures.) 
     272
     273sub read_metadata_file {
     274    my ($filename, $metadataref, $metakeysref) = @_;
     275   
     276    my ($metadatafiletext, $metatext);
     277    my ($target, $targetdataref, $default_target, $tag, $key, $value);
     278
     279    # Read the file
     280    open(MTDT, "<$filename");
     281    $metadatafiletext = join(' ', <MTDT>);
     282    $metadatafiletext =~ s/\s+/ /go;
     283    close MTDT;
     284   
     285    # set default filespec for *.metadata files
     286    if ($filename =~ /\.metadata$/) {
     287    $default_target = $filename;
     288    $default_target =~ s/.*\///o;
     289    $default_target =~ s/\.metadata$//;
     290    } else {
     291    $default_target = '';
     292    }
     293
     294    # split the file into sections on "metadata" tag
     295    foreach $metatext (split(/\<metadata\>/, $metadatafiletext)) {
     296    # print "metadata text: $metatext\n";
     297   
     298    # split the metadata set into sections on each field tag
     299    $target = $default_target;
     300    $targetdataref = {};
     301    foreach $tag (split(/</, $metatext)) {
     302        next if ($tag =~ m"^/");
     303        next if ($tag !~ m/>/);
     304       
     305        ($key, $value) = split(/>/, $tag);
     306        # print "$key -> $value\n";
     307
     308        if ($key eq 'filename') {
     309        $target = $value;
     310        } else {
     311
     312        # a metadata field can be flagged as accumulated or overridden
     313        my $accumulateflag = 0;
     314        my $overrideflag = 0;
     315        if ($key =~ / mode=a.*/io) {
     316            $accumulateflag = 1;
     317        } elsif ($key =~ / mode=o.*/io) {
     318            $overrideflag = 1;
     319        }
     320        $key =~ s/ mode=.*$//io;
     321
     322        # set the metadata value, using an array for accumulating fields
     323        # and a scalar for override fields
     324        if ($accumulateflag) {
     325            # the accumulate flag directs us to accumulate metadata values
     326            if (!defined $targetdataref->{$key}) {
     327            # there is no existing value for this field
     328            $targetdataref->{$key} = [$value];
     329            } elsif (ref ($targetdataref->{$key}) eq "ARRAY") {
     330            # we already have an array of values for this field
     331            my $aref = $targetdataref->{$key};
     332            push @$aref, $value;
     333            } else {
     334            # we have a scalar for this field - convert to array
     335            $targetdataref->{$key} = [$targetdataref->{$key}, $value];
     336            }
     337        } elsif ($overrideflag) {
     338            # the override flag directs us to override exising values
     339            $targetdataref->{$key} = $value;
     340        } elsif (!defined $targetdataref->{$key}) {
     341            # there is no flag, and no existing value: default to override mode
     342            # In the future, I should let the user specify the default mode.
     343            $targetdataref->{$key} = $value;
     344        } elsif (ref ($targetdataref->{$key}) eq "ARRAY") {
     345            # there is no flag, and we're already in accumulate mode
     346            my $aref = $targetdataref->{$key};
     347            push @$aref, $value;
     348        } else {
     349            # there is no flag, and we're already in override mode
     350            $targetdataref->{$key} = $value;
     351        }
     352        }
     353    }
     354
     355    # store this metadata information in the metadata ref
     356    if ($target) {
     357        push @$metakeysref, $target;
     358        $metadataref->{$target} = $targetdataref;
     359    }
     360    }
     361}
     362
     363
     364# Combine two metadata structures.  Given two references to metadata
     365# element structures, add every field of the second ($mdref2) to the first
     366# ($mdref1). 
     367#
     368# Afterwards $mdref1 will be updated, and $mdref2 will be unchanged.
     369#
     370# We have to be acreful about the way we merge metadata when one metadata
     371# structure is in "override" mode and one is in "merge" mode.  In fact, we
     372# use the mode from the second structure, $mdref2, because it is generally
     373# defined later (lower in the directory structure) and is therefore more
     374# "local" to the document concerned.
     375#
     376# Another issue is the use of references to pass metadata around.  If we
     377# simply copy one metadata structure reference to another, then we're
     378# effectively justr copyinga pointer, and changes to the new referene
     379# will affect the old (copied) one also.  This also applies to ARRAY
     380# references used as metadata element values (hence the "clonedata"
     381# function below).
     382
     383sub combine_metadata_structures {
     384    my ($mdref1, $mdref2) = @_;
     385    my ($key, $value1, $value2);
     386
     387    foreach $key (keys %$mdref2) {
     388
     389    $value1 = $mdref1->{$key};
     390    $value2 = $mdref2->{$key};
     391   
     392    # If there is no existing value for this metadata field in
     393    # $mdref1, so we simply copy the value from $mdref2 over.
     394    if (!defined $value1) {
     395        $mdref1->{$key} = &clonedata($value2);
     396    }
     397    # Otherwise we have to add the new values to the existing ones.
     398    # If the second structure is accumulated, then acculate all the
     399    # values into the first structure
     400    elsif ((ref $value2) eq "ARRAY") {
     401        # If the first metadata element is a scalar we have to
     402        # convert it into an array before we add anything more.
     403        if ((ref $value1) ne 'ARRAY') {
     404        $mdref1->{$key} = [$value1];
     405        $value1 = $mdref1->{$key};
     406        }
     407        # Now add the value(s) from the second array to the first
     408        $value2 = &clonedata($value2);
     409        push @$value1, @$value2;
     410    }
     411    # Finally, If the second structure is not an array erference, we
     412    # know it is in override mode, so override the first structure.
     413    else {
     414        $mdref1->{$key} = &clonedata($value2);
     415    }
     416    }
     417}
     418
     419
     420# Make a "cloned" copy of a metadata value. 
     421# This is trivial for a simple scalar value,
     422# but not for an array reference.
     423
     424sub clonedata {
     425    my ($value) = @_;
     426    my $result;
     427
     428    if ((ref $value) eq 'ARRAY') {
     429    $result = [];
     430    foreach my $item (@$value) {
     431        push @$result, $item;
     432    }
     433    } else {
     434    $result = $value;
     435    }
     436    return $result;
    130437}
    131438
    132439
    1334401;
     441
     442
     443
Note: See TracChangeset for help on using the changeset viewer.