Changeset 2813
- Timestamp: 2001-11-01T22:03:46+13:00 (22 years ago)
- Files: 1 edited
Legend:
- Unmodified
- Added
- Removed
trunk/gsdl/perllib/plugins/RecPlug.pm
r2795 r2813 28 28 29 29 # RecPlug has one option: use_metadata_files. When this is set, it will 30 # check each directory for an XML file called "metadata" that specifies 31 # metadata for the files (and subdirectories) in the directory. It will 32 # also look in any file of the form *.metadata for metadata about the file 33 # with the same prefix. 34 # 35 # Here's an example of a metadata file that cuses theree metadata structures 30 # check each directory for an XML file called "metadata.xml" that specifies 31 # metadata for the files (and subdirectories) in the directory. 32 # 33 # Here's an example of a metadata file that uses three FileSet structures 36 34 # (ignore the # characters): 37 35 38 #<metadata> 39 # <filename>nugget.*</filename> 40 # <Title>Nugget Point, The Catlins</Title> 41 # <Place mode=accumulate>Nugget Point</Place> 42 #</metadata> 43 # 44 #<metadata> 45 # <filename>nugget-point-1.jpg</filename> 46 # <Title>Nugget Point Lighthouse, The Catlins</Title> 47 # <Subject>Lighthouse</Subject> 48 #</metadata> 49 # 50 #<metadata> 51 # <filename>kaka-point-dir</filename> 52 # <Title>Kaka Point, The Catlins</Title> 53 #</metadata> 36 #<?xml version="1.0" encoding="UTF-8" standalone="no"?> 37 #<!DOCTYPE GreenstoneDirectoryMetadata SYSTEM "http://greenstone.org/dtd/GreenstoneDirectoryMetadata/1.0/GreenstoneDirectoryMetadata.dtd"> 38 #<DirectoryMetadata> 39 # <FileSet> 40 # <FileName>nugget.*</FileName> 41 # <Description> 42 # <Metadata name="Title">Nugget Point, The Catlins</Metadata> 43 # <Metadata name="Place" mode="accumulate">Nugget Point</Metadata> 44 # </Description> 45 # </FileSet> 46 # <FileSet> 47 # <FileName>nugget-point-1.jpg</FileName> 48 # <Description> 49 # <Metadata name="Title">Nugget Point Lighthouse, The Catlins</Metadata> 50 # <Metadata name="Subject">Lighthouse</Metadata> 51 # </Description> 52 # </FileSet> 53 # <FileSet> 54 # <FileName>kaka-point-dir</FileName> 55 # <Description> 56 # <Metadata name="Title">Kaka Point, The 
Catlins</Metadata> 57 # </Description> 58 # </FileSet> 59 #</DirectoryMetadata> 54 60 55 61 # Metadata elements are read and applied to files in the order they appear 56 # in the file. The directory's "metadata" file is erad first, and then any57 # other files of the form "*.metadata" are read in alphabetical order.58 # 59 # The filename element describes the subfiles in the directory that the60 # metadata applies to as a perl regular expression, so61 # <filename>nugget.*</filename> indicates that the first metadata record62 # appl iesto every subfile that starts with "nugget". For these files, a62 # in the file. 63 # 64 # The FileName element describes the subfiles in the directory that the 65 # metadata applies to as a perl regular expression (a FileSet group may 66 # contain multiple FileName elements). So, <FileName>nugget.*</FileName> 67 # indicates that the metadata records in the following Description block 68 # apply to every subfile that starts with "nugget". For these files, a 63 69 # Title metadata element is set, overriding any old value that the Title 64 70 # might have had. … … 66 72 # Occasionally, we want to have multiple metadata values applied to a 67 73 # document; in this case we use the "mode=accumulate" attribute of the 68 # particular metadata item. In the first metadata element above, the69 # "Place" metadata is accumulating, and is therefore given several values.70 # If we wanted to override these values and use a single metadata element71 # again, we could write <Place mode=override>New Zealand</Place> instead.72 # Remember: every element is assumed to be in override mode unless you73 # specify otherwise, so if you want to accumulate metadata for some field,74 # every occurance must have "mode=accumulate" specified.75 # 76 # The second metadata elementapplies to a specific file, called77 # nugget-point-1.jpg. 
This element overrides the Title set in the first78 # element above, and adds a "Subject" ,etadata field.79 # 80 # The third and fi onal metadata element sets metadata for a subdirectory81 # rather than a file. The metadata specified (a Title) will be passed into82 # the subdirectory and applied to every file that occurs in the83 # subdirectory (and to every subsubdirectory and its contents, and so on)84 # unless themetadata is explictly overridden later in the import.74 # particular Metadata element. In the second metadata element of the first 75 # FileSet above, the "Place" metadata is accumulating, and may therefore be 76 # given several values. If we wanted to override these values and use a 77 # single metadata element again, we could set the mode attribute to 78 # "override" instead. Remember: every element is assumed to be in override 79 # mode unless you specify otherwise, so if you want to accumulate metadata 80 # for some field, every occurance must have "mode=accumulate" specified. 81 # 82 # The second FileSet element above applies to a specific file, called 83 # nugget-point-1.jpg. This element overrides the Title metadata set in the 84 # first FileSet, and adds a "Subject" metadata field. 85 # 86 # The third and final FileSet sets metadata for a subdirectory rather than 87 # a file. The metadata specified (a Title) will be passed into the 88 # subdirectory and applied to every file that occurs in the subdirectory 89 # (and to every subsubdirectory and its contents, and so on) unless the 90 # metadata is explictly overridden later in the import. 
85 91 86 92 … … 95 101 BEGIN { 96 102 @ISA = ('BasPlug'); 97 } 103 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan"); 104 } 105 106 use XML::Parser; 98 107 99 108 sub print_usage { … … 108 117 } 109 118 119 my ($self); 110 120 sub new { 111 121 my $class = shift (@_); 112 my $self = new BasPlug ($class, @_); 113 122 123 # $self is global for use within subroutines called by XML::Parser 124 $self = new BasPlug ($class, @_); 125 114 126 if (!parsargv::parse(\@_, 115 127 q^use_metadata_files^, \$self->{'use_metadata_files'}, … … 120 132 die "\n"; 121 133 } 122 134 135 if ($self->{'use_metadata_files'}) { 136 # create XML::Parser object for parsing metadata.xml files 137 my $parser = new XML::Parser('Style' => 'Stream', 138 'Handlers' => {'Char' => \&Char, 139 'Doctype' => \&Doctype 140 }); 141 $self->{'parser'} = $parser; 142 $self->{'in_filename'} = 0; 143 144 } 145 123 146 return bless $self, $class; 124 147 } … … 127 150 sub is_recursive { 128 151 my $self = shift (@_); 129 152 130 153 return 1; 131 154 } … … 133 156 sub get_default_block_exp { 134 157 my $self = shift (@_); 135 158 136 159 return 'CVS'; 137 160 } … … 149 172 my $self = shift (@_); 150 173 my ($pluginfo, $base_dir, $file, $in_metadata, $processor, $maxdocs) = @_; 151 174 152 175 my $outhandle = $self->{'outhandle'}; 153 176 my $verbosity = $self->{'verbosity'}; 154 177 my $read_metadata_files = $self->{'use_metadata_files'}; 155 178 156 179 # Calculate the directory name and ensure it is a directory and 157 180 # that it is not explicitly blocked. 
… … 160 183 return undef unless (-d $dirname); 161 184 return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/); 162 185 163 186 # check to make sure we're not reading the archives or index directory 164 187 my $gsdlhome = quotemeta($ENV{'GSDLHOME'}); 165 if ($dirname =~ m %^${gsdlhome}/.*?/import.*?/(archives|index)$%) {166 188 if ($dirname =~ m/^$gsdlhome\/.*?\/import.*?\/(archives|index)$/) { 189 print $outhandle "RecPlug: $dirname appears to be a reference to a Greenstone collection, skipping.\n"; 167 190 return 0; 168 191 } 169 192 170 193 # check to see we haven't got a cyclic path... 171 194 if ($dirname =~ m%(/.*){,41}%) { … … 173 196 return 0; 174 197 } 175 198 176 199 # check to see we haven't got a cyclic path... 177 200 if ($dirname =~ m%.*?import/(.+?)/import/\1.*%) { … … 179 202 return 0; 180 203 } 181 204 182 205 if (($verbosity > 2) && ((scalar keys %$in_metadata) > 0)) { 183 206 print $outhandle "RecPlug: metadata passed in: ", 184 185 } 186 207 join(", ", keys %$in_metadata), "\n"; 208 } 209 187 210 # Recur over directory contents. 188 211 my (@dir, $subfile); 189 212 my $count = 0; 190 213 print $outhandle "RecPlug: getting directory $dirname\n" if ($verbosity); 191 214 192 215 # find all the files in the directory 193 216 if (!opendir (DIR, $dirname)) { … … 197 220 @dir = readdir (DIR); 198 221 closedir (DIR); 199 222 200 223 # read XML metadata files (if supplied) 201 224 my $additionalmetadata = 0; # is there extra metadata available? 
202 225 my %extrametadata; # maps from filespec to extra metadata keys 203 226 my @extrametakeys; # keys of %extrametadata in order read 204 227 205 228 if ($read_metadata_files) { 206 229 207 # first read the directory "metadata" file208 my $metadatafile = &util::filename_cat ($dirname, 'metadata ');230 # read the directory "metadata.xml" file 231 my $metadatafile = &util::filename_cat ($dirname, 'metadata.xml'); 209 232 if (-e $metadatafile) { 210 233 print $outhandle "RecPlug: found metadata in $metadatafile\n" 211 234 if ($verbosity); 212 &read_metadata_file($metadatafile, \%extrametadata, \@extrametakeys); 213 $additionalmetadata = 1; 214 } 215 216 # then read any files with names of the form *.metadata 217 foreach $subfile (sort @dir) { 218 next unless ($subfile =~ /^.*\.metadata$/); 219 $metadatafile = &util::filename_cat ($dirname, $subfile); 220 print $outhandle "RecPlug: found metadata in $metadatafile\n" 221 if ($verbosity); 222 &read_metadata_file($metadatafile, \%extrametadata, \@extrametakeys); 235 $self->read_metadata_xml_file($metadatafile, \%extrametadata, \@extrametakeys); 223 236 $additionalmetadata = 1; 224 237 } … … 231 244 last if ($maxdocs != -1 && $count >= $maxdocs); 232 245 next if ($subfile =~ /^\.\.?$/); 233 next if ($read_metadata_files && $subfile =~ /metadata $/);234 print "RecPlug: preparing metadata for $subfile\n" if ($verbosity > 2);246 next if ($read_metadata_files && $subfile =~ /metadata\.xml$/); 247 print $outhandle "RecPlug: preparing metadata for $subfile\n" if ($verbosity > 2); 235 248 236 249 # Make a copy of $in_metadata to pass to $subfile … … 265 278 266 279 # Read a manually-constructed metadata file and store the data 267 # it contains in the $metadata erf structure.280 # it contains in the $metadataref structure. 268 281 # 269 282 # (metadataref is a reference to a hash whose keys are filenames 270 283 # and whose values are metadata hash structures.) 
271 284 272 sub read_metadata_file { 285 sub read_metadata_xml_file { 286 my $self = shift(@_); 273 287 my ($filename, $metadataref, $metakeysref) = @_; 274 275 my ($metadatafiletext, $metatext); 276 my ($target, $targetdataref, $default_target, $tag, $key, $value); 277 278 # Read the file 279 open(MTDT, "<$filename"); 280 $metadatafiletext = join(' ', <MTDT>); 281 $metadatafiletext =~ s/\s+/ /go; 282 close MTDT; 283 284 # set default filespec for *.metadata files 285 if ($filename =~ /\.metadata$/) { 286 $default_target = $filename; 287 $default_target =~ s/.*\///o; 288 $default_target =~ s/\.metadata$//; 289 } else { 290 $default_target = ''; 291 } 292 293 # split the file into sections on "metadata" tag 294 foreach $metatext (split(/\<metadata\>/, $metadatafiletext)) { 295 # print "metadata text: $metatext\n"; 296 297 # split the metadata set into sections on each field tag 298 $target = $default_target; 299 $targetdataref = {}; 300 foreach $tag (split(/</, $metatext)) { 301 next if ($tag =~ m"^/"); 302 next if ($tag !~ m/>/); 303 304 ($key, $value) = split(/>/, $tag); 305 # print "$key -> $value\n"; 306 307 if ($key eq 'filename') { 308 $target = $value; 288 $self->{'metadataref'} = $metadataref; 289 $self->{'metakeysref'} = $metakeysref; 290 291 eval { 292 $self->{'parser'}->parsefile($filename); 293 }; 294 if ($@) { 295 my $outhandle = $self->{'outhandle'}; 296 print $outhandle "RecPlug: Warning: Ignoring $filename because it is not a well formed metadata.xml file\n"; 297 return; 298 } 299 } 300 301 sub Doctype { 302 my ($expat, $name, $sysid, $pubid, $internal) = @_; 303 die if ($name ne "GreenstoneDirectoryMetadata"); 304 } 305 306 sub StartTag { 307 my ($expat, $element) = @_; 308 309 if ($element eq "FileSet") { 310 $self->{'saved_targets'} = []; 311 $self->{'saved_metadata'} = {}; 312 } 313 elsif ($element eq "FileName") { 314 $self->{'in_filename'} = 1; 315 } 316 elsif ($element eq "Metadata") { 317 $self->{'metadata_name'} = $_{'name'}; 318 if 
((defined $_{'mode'}) && ($_{'mode'} eq "accumulate")) { 319 $self->{'metadata_accumulate'} = 1; 320 } else { 321 $self->{'metadata_accumulate'} = 0; 322 } 323 } 324 } 325 326 sub EndTag { 327 my ($expat, $element) = @_; 328 329 if ($element eq "FileSet") { 330 push (@{$self->{'metakeysref'}}, @{$self->{'saved_targets'}}); 331 foreach my $target (@{$self->{'saved_targets'}}) { 332 $self->{'metadataref'}->{$target} = $self->{'saved_metadata'}; 333 } 334 } 335 elsif ($element eq "FileName") { 336 $self->{'in_filename'} = 0; 337 } 338 elsif ($element eq "Metadata") { 339 $self->{'metadata_name'} = ""; 340 } 341 342 } 343 344 sub Text { 345 346 if ($self->{'in_filename'}) { 347 # $_ == FileName content 348 push (@{$self->{'saved_targets'}}, $_); 349 } 350 elsif (defined ($self->{'metadata_name'}) && $self->{'metadata_name'} ne "") { 351 # $_ == Metadata content 352 my $mname = $self->{'metadata_name'}; 353 if (defined $self->{'saved_metadata'}->{$mname}) { 354 if ($self->{'metadata_accumulate'}) { 355 # accumulate mode - add value to existing value(s) 356 if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") { 357 push (@{$self->{'saved_metadata'}->{$mname}}, $_); 358 } else { 359 $self->{'saved_metadata'}->{$mname} = 360 [$self->{'saved_metadata'}->{$mname}, $_]; 361 } 309 362 } else { 310 311 # a metadata field can be flagged as accumulated or overridden 312 my $accumulateflag = 0; 313 my $overrideflag = 0; 314 if ($key =~ / mode=a.*/io) { 315 $accumulateflag = 1; 316 } elsif ($key =~ / mode=o.*/io) { 317 $overrideflag = 1; 318 } 319 $key =~ s/ mode=.*$//io; 320 321 # set the metadata value, using an array for accumulating fields 322 # and a scalar for override fields 323 if ($accumulateflag) { 324 # the accumulate flag directs us to accumulate metadata values 325 if (!defined $targetdataref->{$key}) { 326 # there is no existing value for this field 327 $targetdataref->{$key} = [$value]; 328 } elsif (ref ($targetdataref->{$key}) eq "ARRAY") { 329 # we already 
have an array of values for this field 330 my $aref = $targetdataref->{$key}; 331 push @$aref, $value; 332 } else { 333 # we have a scalar for this field - convert to array 334 $targetdataref->{$key} = [$targetdataref->{$key}, $value]; 335 } 336 } elsif ($overrideflag) { 337 # the override flag directs us to override exising values 338 $targetdataref->{$key} = $value; 339 } elsif (!defined $targetdataref->{$key}) { 340 # there is no flag, and no existing value: default to override mode 341 # In the future, I should let the user specify the default mode. 342 $targetdataref->{$key} = $value; 343 } elsif (ref ($targetdataref->{$key}) eq "ARRAY") { 344 # there is no flag, and we're already in accumulate mode 345 my $aref = $targetdataref->{$key}; 346 push @$aref, $value; 347 } else { 348 # there is no flag, and we're already in override mode 349 $targetdataref->{$key} = $value; 350 } 363 # override mode 364 $self->{'saved_metadata'}->{$mname} = $_; 351 365 } 352 } 353 354 # store this metadata information in the metadata ref 355 if ($target) { 356 push @$metakeysref, $target; 357 $metadataref->{$target} = $targetdataref; 358 } 359 } 360 } 361 366 } else { 367 if ($self->{'metadata_accumulate'}) { 368 # accumulate mode - add value into (currently empty) array 369 $self->{'saved_metadata'}->{$mname} = [$_]; 370 } else { 371 # override mode 372 $self->{'saved_metadata'}->{$mname} = $_; 373 } 374 } 375 } 376 } 377 378 # This Char function overrides the one in XML::Parser::Stream to overcome a 379 # problem where $expat->{Text} is treated as the return value, slowing 380 # things down significantly in some cases. 381 sub Char { 382 $_[0]->{'Text'} .= $_[1]; 383 return undef; 384 } 362 385 363 386 # Combine two metadata structures. Given two references to metadata … … 438 461 439 462 1; 440 441 442
Note: See TracChangeset for help on using the changeset viewer.