- Timestamp:
- 2001-03-27T14:49:14+12:00 (23 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/RecPlug.pm
r1810 r2228 24 24 ########################################################################### 25 25 26 # plugin which recurses through directories processing 27 # each file it finds 26 # RecPlug is a plugin which recurses through directories processing 27 # each file it finds. 28 29 # RecPlug has one option: use_metadata_files. When this is set, it will 30 # check each directory for an XML file called "metadata" that specifies 31 # metadata for the files (and subdirectories) in the directory. It will 32 # also look in any file of the form *.metadata for metadata about the file 33 # with the same prefix. 34 # 35 # Here's an example of a metadata file that cuses theree metadata structures 36 # (ignore the # characters): 37 38 #<metadata> 39 # <filename>nugget.*</filename> 40 # <Title>Nugget Point, The Catlins</Title> 41 # <Place mode=accumulate>Nugget Point</Place> 42 #</metadata> 43 # 44 #<metadata> 45 # <filename>nugget-point-1.jpg</filename> 46 # <Title>Nugget Point Lighthouse, The Catlins</Title> 47 # <Subject>Lighthouse</Subject> 48 #</metadata> 49 # 50 #<metadata> 51 # <filename>kaka-point-dir</filename> 52 # <Title>Kaka Point, The Catlins</Title> 53 #</metadata> 54 55 # Metadata elements are read and applied to files in the order they appear 56 # in the file. The directory's "metadata" file is erad first, and then any 57 # other files of the form "*.metadata" are read in alphabetical order. 58 # 59 # The filename element describes the subfiles in the directory that the 60 # metadata applies to as a perl regular expression, so 61 # <filename>nugget.*</filename> indicates that the first metadata record 62 # applies to every subfile that starts with "nugget". For these files, a 63 # Title metadata element is set, overriding any old value that the Title 64 # might have had. 65 # 66 # Occasionally, we want to have multiple metadata values applied to a 67 # document; in this case we use the "mode=accumulate" attribute of the 68 # particular metadata item. In the first metadata element above, the 69 # "Place" metadata is accumulating, and is therefore given several values. 70 # If we wanted to override these values and use a single metadata element 71 # again, we could write <Place mode=override>New Zealand</Place> instead. 72 # Remember: every element is assumed to be in override mode unless you 73 # specify otherwise, so if you want to accumulate metadata for some field, 74 # every occurance must have "mode=accumulate" specified. 75 # 76 # The second metadata element applies to a specific file, called 77 # nugget-point-1.jpg. This element overrides the Title set in the first 78 # element above, and adds a "Subject" ,etadata field. 79 # 80 # The third and fional metadata element sets metadata for a subdirectory 81 # rather than a file. The metadata specified (a Title) will be passed into 82 # the subdirectory and applied to every file that occurs in the 83 # subdirectory (and to every subsubdirectory and its contents, and so on) 84 # unless the metadata is explictly overridden later in the import. 85 86 28 87 29 88 package RecPlug; … … 38 97 } 39 98 99 sub print_usage { 100 my ($plugin_name) = @_; 101 102 print STDERR " 103 usage: plugin RecPlug [options] 104 105 -use_metadata_files Read metadata from metadata XML files. 106 107 " 108 } 109 40 110 sub new { 41 my ($class) = @_; 42 my $self = new BasPlug ("RecPlug", @_); 43 44 $self->{'exclude_tail_dirs'} = []; # empty by default 111 my $class = shift (@_); 112 my $self = new BasPlug ($class, @_); 113 114 if (!parsargv::parse(\@_, 115 q^use_metadata_files^, \$self->{'use_metadata_files'}, 116 "allow_extra_options")) { 117 print STDERR "\nRecPlug uses an incorrect option.\n"; 118 print STDERR "Check your collect.cfg configuration file.\n\n"; 119 &print_usage("RecPlug"); 120 die "\n"; 121 } 45 122 46 123 return bless $self, $class; … … 54 131 } 55 132 133 sub get_default_block_exp { 134 my $self = shift (@_); 135 136 return 'CVS'; 137 } 56 138 57 139 # return number of files processed, undef if can't process 58 140 # Note that $base_dir might be "" and that $file might 59 141 # include directories 142 143 # This function passes around metadata hash structures. Metadata hash 144 # structures are hashes that map from a (scalar) key (the metadata element 145 # name) to either a scalar metadata value or a reference to an array of 146 # such values. 147 60 148 sub read { 61 149 my $self = shift (@_); 62 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_; 150 my ($pluginfo, $base_dir, $file, $in_metadata, $processor, $maxdocs) = @_; 151 63 152 my $outhandle = $self->{'outhandle'}; 64 65 foreach my $etd ( @{$self->{'exclude_tail_dirs'}} ) 66 { 67 return 0 if ($file =~ m/$etd/); 68 } 69 70 my (@dir, $subfile); 71 my $count = 0; 72 73 # see if this is a directory 153 my $verbosity = $self->{'verbosity'}; 154 my $read_metadata_files = $self->{'use_metadata_files'}; 155 156 # Calculate the directory name and ensure it is a directory and 157 # that it is not explicitly blocked. 158 $file =~ s/^[\/\\]+//; 74 159 my $dirname = &util::filename_cat ($base_dir, $file); 75 76 # check to make sure we're not reading our own archives 77 # or index directory 78 my $gsdlhome = quotemeta ($ENV{'GSDLHOME'}); 160 return undef unless (-d $dirname); 161 return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/); 162 163 164 # check to make sure we're not reading the archives or index directory 165 my $gsdlhome = quotemeta($ENV{'GSDLHOME'}); 79 166 if ($dirname =~ m%^${gsdlhome}/.*?/import.*?/(archives|index)$%) { 80 81 return 1;167 print $outhandle "RecPlug: $dirname appears to be a reference to a Greenstone collection, skipping.\n"; 168 return 0; 82 169 } 83 170 … … 85 172 if ($dirname =~ m%(/.*){,41}%) { 86 173 print $outhandle "RecPlug: $dirname is 40 directories deep, is this a recursive path? if not increase constant in RecPlug.pm.\n"; 87 return 1;174 return 0; 88 175 } 89 176 90 177 # check to see we haven't got a cyclic path... 91 178 if ($dirname =~ m%.*?import/(.+?)/import/\1.*%) { 92 print $outhandle "RecPlug: $dirname appears to a recursive loop ...\n"; 93 return 1; 94 } 95 96 97 if (-d $dirname) { 98 99 if ($dirname =~ m|/CVS$|) { 100 print $outhandle "RecPlug: $dirname is a CVS directory, skipping.\n"; 101 return 1; 102 } 103 # read all the files in the directory 104 if (!opendir (DIR, $dirname)) 105 { 106 print $outhandle "RecPlug: WARNING - couldn't read directory $dirname\n"; 107 return; 108 } 109 110 @dir = readdir (DIR); 111 closedir (DIR); 112 113 print $outhandle "RecPlug: getting directory $dirname\n"; 114 115 # process each file 116 foreach $subfile (@dir) { 117 last if ($maxdocs != -1 && $count >= $maxdocs); 118 119 if ($subfile !~ /^\.\.?$/) { 120 # note: metadata is not carried on to the next level 121 $count += &plugin::read ($pluginfo, $base_dir, &util::filename_cat($file, $subfile), 122 {}, $processor, $maxdocs); 179 print $outhandle "RecPlug: $dirname appears to be in a recursive loop...\n"; 180 return 0; 181 } 182 183 if (($verbosity > 2) && ((scalar keys %$in_metadata) > 0)) { 184 print $outhandle "RecPlug: metadata passed in: ", 185 join(", ", keys %$in_metadata), "\n"; 186 } 187 188 # Recur over directory contents. 189 my (@dir, $subfile); 190 my $count = 0; 191 print $outhandle "RecPlug: getting directory $dirname\n" if ($verbosity); 192 193 # find all the files in the directory 194 if (!opendir (DIR, $dirname)) { 195 print $outhandle "RecPlug: WARNING - couldn't read directory $dirname\n"; 196 return undef; 197 } 198 @dir = readdir (DIR); 199 closedir (DIR); 200 201 # read XML metadata files (if supplied) 202 my $additionalmetadata = 0; # is there extra metadata available? 203 my %extrametadata; # maps from filespec to extra metadata keys 204 my @extrametakeys; # keys of %extrametadata in order read 205 206 if ($read_metadata_files) { 207 208 # first read the directory "metadata" file 209 my $metadatafile = &util::filename_cat ($dirname, 'metadata'); 210 if (-e $metadatafile) { 211 print $outhandle "RecPlug: found metadata in $metadatafile\n" 212 if ($verbosity); 213 &read_metadata_file($metadatafile, \%extrametadata, \@extrametakeys); 214 $additionalmetadata = 1; 215 } 216 217 # then read any files with names of the form *.metadata 218 foreach $subfile (sort @dir) { 219 next unless ($subfile =~ /^.*\.metadata$/); 220 $metadatafile = &util::filename_cat ($dirname, $subfile); 221 print $outhandle "RecPlug: found metadata in $metadatafile\n" 222 if ($verbosity); 223 &read_metadata_file($metadatafile, \%extrametadata, \@extrametakeys); 224 $additionalmetadata = 1; 225 } 226 } 227 228 # import each of the files in the directory 229 my $out_metadata; 230 foreach $subfile (@dir) { 231 232 last if ($maxdocs != -1 && $count >= $maxdocs); 233 next if ($subfile =~ /^\.\.?$/); 234 next if ($read_metadata_files && $subfile =~ /metadata$/); 235 print "RecPlug: preparing metadata for $subfile\n" if ($verbosity > 2); 236 237 # Make a copy of $in_metadata to pass to $subfile 238 $out_metadata = {}; 239 &combine_metadata_structures($out_metadata, $in_metadata); 240 241 # Next add metadata read in XML files (if it is supplied) 242 if ($additionalmetadata == 1) { 243 244 my ($filespec, $mdref); 245 foreach $filespec (@extrametakeys) { 246 if ($subfile =~ /$filespec/) { 247 print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n" 248 if ($verbosity > 2); 249 $mdref = $extrametadata{$filespec}; 250 &combine_metadata_structures($out_metadata, $mdref); 251 } 123 252 } 124 253 } 125 return $count; 126 } 127 128 # wasn't a directory, someone else will have to process it 129 return undef; 254 255 # Recursively read each $subfile 256 print $outhandle "RecPlug recurring: $subfile\n" if ($verbosity > 2); 257 $count += &plugin::read ($pluginfo, $base_dir, 258 &util::filename_cat($file, $subfile), 259 $out_metadata, $processor, $maxdocs); 260 } 261 return $count; 262 263 } 264 265 266 267 # Read a manually-constructed metadata file and store the data 268 # it contains in the $metadataerf structure. 269 # 270 # (metadataref is a reference to a hash whose keys are filenames 271 # and whose values are metadata hash structures.) 272 273 sub read_metadata_file { 274 my ($filename, $metadataref, $metakeysref) = @_; 275 276 my ($metadatafiletext, $metatext); 277 my ($target, $targetdataref, $default_target, $tag, $key, $value); 278 279 # Read the file 280 open(MTDT, "<$filename"); 281 $metadatafiletext = join(' ', <MTDT>); 282 $metadatafiletext =~ s/\s+/ /go; 283 close MTDT; 284 285 # set default filespec for *.metadata files 286 if ($filename =~ /\.metadata$/) { 287 $default_target = $filename; 288 $default_target =~ s/.*\///o; 289 $default_target =~ s/\.metadata$//; 290 } else { 291 $default_target = ''; 292 } 293 294 # split the file into sections on "metadata" tag 295 foreach $metatext (split(/\<metadata\>/, $metadatafiletext)) { 296 # print "metadata text: $metatext\n"; 297 298 # split the metadata set into sections on each field tag 299 $target = $default_target; 300 $targetdataref = {}; 301 foreach $tag (split(/</, $metatext)) { 302 next if ($tag =~ m"^/"); 303 next if ($tag !~ m/>/); 304 305 ($key, $value) = split(/>/, $tag); 306 # print "$key -> $value\n"; 307 308 if ($key eq 'filename') { 309 $target = $value; 310 } else { 311 312 # a metadata field can be flagged as accumulated or overridden 313 my $accumulateflag = 0; 314 my $overrideflag = 0; 315 if ($key =~ / mode=a.*/io) { 316 $accumulateflag = 1; 317 } elsif ($key =~ / mode=o.*/io) { 318 $overrideflag = 1; 319 } 320 $key =~ s/ mode=.*$//io; 321 322 # set the metadata value, using an array for accumulating fields 323 # and a scalar for override fields 324 if ($accumulateflag) { 325 # the accumulate flag directs us to accumulate metadata values 326 if (!defined $targetdataref->{$key}) { 327 # there is no existing value for this field 328 $targetdataref->{$key} = [$value]; 329 } elsif (ref ($targetdataref->{$key}) eq "ARRAY") { 330 # we already have an array of values for this field 331 my $aref = $targetdataref->{$key}; 332 push @$aref, $value; 333 } else { 334 # we have a scalar for this field - convert to array 335 $targetdataref->{$key} = [$targetdataref->{$key}, $value]; 336 } 337 } elsif ($overrideflag) { 338 # the override flag directs us to override exising values 339 $targetdataref->{$key} = $value; 340 } elsif (!defined $targetdataref->{$key}) { 341 # there is no flag, and no existing value: default to override mode 342 # In the future, I should let the user specify the default mode. 343 $targetdataref->{$key} = $value; 344 } elsif (ref ($targetdataref->{$key}) eq "ARRAY") { 345 # there is no flag, and we're already in accumulate mode 346 my $aref = $targetdataref->{$key}; 347 push @$aref, $value; 348 } else { 349 # there is no flag, and we're already in override mode 350 $targetdataref->{$key} = $value; 351 } 352 } 353 } 354 355 # store this metadata information in the metadata ref 356 if ($target) { 357 push @$metakeysref, $target; 358 $metadataref->{$target} = $targetdataref; 359 } 360 } 361 } 362 363 364 # Combine two metadata structures. Given two references to metadata 365 # element structures, add every field of the second ($mdref2) to the first 366 # ($mdref1). 367 # 368 # Afterwards $mdref1 will be updated, and $mdref2 will be unchanged. 369 # 370 # We have to be acreful about the way we merge metadata when one metadata 371 # structure is in "override" mode and one is in "merge" mode. In fact, we 372 # use the mode from the second structure, $mdref2, because it is generally 373 # defined later (lower in the directory structure) and is therefore more 374 # "local" to the document concerned. 375 # 376 # Another issue is the use of references to pass metadata around. If we 377 # simply copy one metadata structure reference to another, then we're 378 # effectively justr copyinga pointer, and changes to the new referene 379 # will affect the old (copied) one also. This also applies to ARRAY 380 # references used as metadata element values (hence the "clonedata" 381 # function below). 382 383 sub combine_metadata_structures { 384 my ($mdref1, $mdref2) = @_; 385 my ($key, $value1, $value2); 386 387 foreach $key (keys %$mdref2) { 388 389 $value1 = $mdref1->{$key}; 390 $value2 = $mdref2->{$key}; 391 392 # If there is no existing value for this metadata field in 393 # $mdref1, so we simply copy the value from $mdref2 over. 394 if (!defined $value1) { 395 $mdref1->{$key} = &clonedata($value2); 396 } 397 # Otherwise we have to add the new values to the existing ones. 398 # If the second structure is accumulated, then acculate all the 399 # values into the first structure 400 elsif ((ref $value2) eq "ARRAY") { 401 # If the first metadata element is a scalar we have to 402 # convert it into an array before we add anything more. 403 if ((ref $value1) ne 'ARRAY') { 404 $mdref1->{$key} = [$value1]; 405 $value1 = $mdref1->{$key}; 406 } 407 # Now add the value(s) from the second array to the first 408 $value2 = &clonedata($value2); 409 push @$value1, @$value2; 410 } 411 # Finally, If the second structure is not an array erference, we 412 # know it is in override mode, so override the first structure. 413 else { 414 $mdref1->{$key} = &clonedata($value2); 415 } 416 } 417 } 418 419 420 # Make a "cloned" copy of a metadata value. 421 # This is trivial for a simple scalar value, 422 # but not for an array reference. 423 424 sub clonedata { 425 my ($value) = @_; 426 my $result; 427 428 if ((ref $value) eq 'ARRAY') { 429 $result = []; 430 foreach my $item (@$value) { 431 push @$result, $item; 432 } 433 } else { 434 $result = $value; 435 } 436 return $result; 130 437 } 131 438 132 439 133 440 1; 441 442 443
Note:
See TracChangeset
for help on using the changeset viewer.