- Timestamp:
- 2015-10-09T13:29:06+13:00 (9 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs2-extensions/parallel-building/trunk/src/perllib/plugins/DirectoryPlugin.pm
r29260 r30289 34 34 use plugin; 35 35 use util; 36 use FileUtils; 36 37 use metadatautil; 37 38 … … 41 42 no strict 'subs'; 42 43 44 use Encode::Locale; 43 45 use Encode; 46 use Unicode::Normalize; 44 47 45 48 BEGIN { … … 79 82 80 83 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists); 81 84 85 print STDERR "INFO: This DirectoryPlugin supports version 2 manifest files\n"; 86 82 87 if ($self->{'info_only'}) { 83 88 # don't worry about any options or initialisations etc … … 89 94 die "ERROR: DirectoryPlugin -use_metadata_files option has been deprecated. Please remove the option and add MetadataXMLPlug to your plugin list instead!\n"; 90 95 } 91 96 92 97 $self->{'num_processed'} = 0; 93 98 $self->{'num_not_processed'} = 0; … … 134 139 my $archives_inf = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir); 135 140 136 if ( &FileUtils::fileExists($archives_inf)) {137 $self->{'inf_timestamp'} = &FileUtils::file_lastmodified($archives_inf);141 if ( -e $archives_inf ) { 142 $self->{'inf_timestamp'} = -M $archives_inf; 138 143 } 139 144 } … … 172 177 my $self = shift (@_); 173 178 174 return '(?i)(CVS|\.svn|Thumbs\.db|OIDcount| ~)$';179 return '(?i)(CVS|\.svn|Thumbs\.db|OIDcount|\.DS_Store|~)$'; 175 180 } 176 181 … … 179 184 my $self = shift(@_); 180 185 my ($dirname) = @_; 181 182 # replace -d with function in util library 183 return undef unless (&FileUtils::directoryExists($dirname)); 186 187 return undef unless (-d $dirname); 184 188 185 189 return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/); … … 259 263 260 264 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path); 261 ### print STDERR "*** DirectoryPlugin::file_is_blocked $filename_full_path\n"; 262 263 if ($ENV{'GSDLOS'} =~ m/^windows$/) { 265 266 if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) { 264 267 # on windows, all block paths are lowercased. 265 268 my $lower_filename = lc ($filename_full_path); … … 295 298 # that it is not explicitly blocked. 296 299 my $dirname = $file; 297 $dirname = & util::filename_cat($base_dir, $file) if $base_dir =~ /\w/;300 $dirname = &FileUtils::filenameConcatenate($base_dir, $file) if $base_dir =~ /\w/; 298 301 299 302 my $directory_ok = $self->check_directory_path($dirname); 300 303 return $directory_ok unless (defined $directory_ok && $directory_ok == 1); 301 304 302 print $outhandle " File scan checking directory: $dirname\n";305 print $outhandle "Global file scan checking directory: $dirname\n"; 303 306 304 307 $block_hash->{'all_files'} = {} unless defined $block_hash->{'all_files'}; … … 308 311 $block_hash->{'shared_fileroot'} = {} unless defined $block_hash->{'shared_fileroot'}; 309 312 310 # Recur over directory contents. 313 # Recur over directory contents. 314 my (@dir, $subfile); 315 #my $count = 0; 316 311 317 print $outhandle "DirectoryPlugin block: getting directory $dirname\n" if ($verbosity > 2); 312 318 313 319 # find all the files in the directory 314 my @dir = @{&FileUtils::readDirectory($dirname)}; 315 if (scalar(@dir) == 0) 316 { 317 if ($gli) 318 { 319 print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n"; 320 } 321 print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n"; 322 return -1; # error in processing 323 } 324 320 if (!opendir (DIR, $dirname)) { 321 if ($gli) { 322 print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n"; 323 } 324 print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n"; 325 return -1; # error in processing 326 } 327 @dir = sort readdir (DIR); 328 closedir (DIR); 329 325 330 for (my $i = 0; $i < scalar(@dir); $i++) { 326 331 my $raw_subfile = $dir[$i]; … … 328 333 329 334 my $this_file_base_dir = $base_dir; 330 my $raw_file_subfile = & util::filename_cat($file, $raw_subfile);335 my $raw_file_subfile = &FileUtils::filenameConcatenate($file, $raw_subfile); 331 336 332 337 # Recursively read each $raw_subfile 333 print $outhandle "DirectoryPlugin block recurring: $raw_file_subfile\n" if ($verbosity > 2); 338 print $outhandle "DirectoryPlugin block recurring: ". Encode::decode("utf8", $raw_file_subfile) ."\n" if ($verbosity > 2); 339 print $outhandle "DirectoryPlugin block recurring: ". Encode::decode(locale =>$raw_file_subfile) ."\n" if ($verbosity > 2); 334 340 335 341 #$count += &plugin::file_block_read ($pluginfo, $this_file_base_dir, … … 369 375 my $self = shift (@_); 370 376 my ($pluginfo, $base_dir, $file, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli) = @_; 371 372 377 my $outhandle = $self->{'outhandle'}; 373 378 my $verbosity = $self->{'verbosity'}; … … 380 385 } else { 381 386 $dirname = $file; 382 $dirname = & util::filename_cat($base_dir, $file) if $base_dir =~ /\w/;387 $dirname = &FileUtils::filenameConcatenate($base_dir, $file) if $base_dir =~ /\w/; 383 388 } 384 389 … … 393 398 394 399 # Recur over directory contents. 400 my @dir; 401 395 402 print $outhandle "DirectoryPlugin read: getting directory $dirname\n" if ($verbosity > 2); 396 403 397 404 # find all the files in the directory 398 my @dir = @{&FileUtils::readDirectory($dirname)}; 399 if (scalar(@dir) == 0) 400 { 401 if ($gli) 402 { 403 print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n"; 404 } 405 print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n"; 406 return -1; # error in processing 407 } 408 map { $_ = &unicode::raw_filename_to_url_encoded($_) } @dir; 409 410 # Re-order the files in the list so any directories ending with .all are 411 # moved to the end 405 if (!opendir (DIR, $dirname)) { 406 if ($gli) { 407 print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n"; 408 } 409 print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n"; 410 return -1; # error in processing 411 } 412 @dir = sort readdir (DIR); 413 map { $_ = &unicode::raw_filename_to_url_encoded($_); } @dir; 414 closedir (DIR); 415 # Re-order the files in the list so any directories ending with .all are moved to the end 412 416 for (my $i = scalar(@dir) - 1; $i >= 0; $i--) { 413 if (-d & util::filename_cat($dirname, $dir[$i]) && $dir[$i] =~ /\.all$/) {417 if (-d &FileUtils::filenameConcatenate($dirname, $dir[$i]) && $dir[$i] =~ /\.all$/) { 414 418 push(@dir, splice(@dir, $i, 1)); 415 419 } 416 420 } 417 421 422 # Chain through to the rest of the read function (now split off and named 423 # read_phase2) 424 my $count = $self->read_phase2($pluginfo, $dirname, \@dir, $base_dir, $file, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli); 425 426 return $count; 427 } 428 429 sub read_phase2 430 { 431 my $self = shift (@_); 432 my ($pluginfo, $dirname, $dir_ref, $base_dir, $file, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli) = @_; 433 # These were defined in read (phase 1) 434 my @dir = @{$dir_ref}; 435 my $subfile; 436 437 my $outhandle = $self->{'outhandle'}; 438 my $verbosity = $self->{'verbosity'}; 439 418 440 # setup the metadata structures. we do a metadata_read pass to see if there is any additional metadata, then pass it to read 419 441 … … 428 450 my $base_dir_regexp = $base_dir; 429 451 $base_dir_regexp =~ s/\//$os_dirsep/g; 430 452 431 453 # Want to get relative path of local_dirname within the base_directory 432 # but with URL style slashes. 454 # but with URL style slashes. 433 455 my $local_dirname = &util::filename_within_directory_url_format($dirname, $base_dir); 434 456 … … 460 482 my $raw_subfile = &unicode::url_encoded_to_raw_filename($subfile); 461 483 462 my $raw_file_subfile = & util::filename_cat($file, $raw_subfile);463 my $raw_full_filename = & util::filename_cat($this_file_base_dir, $raw_file_subfile);484 my $raw_file_subfile = &FileUtils::filenameConcatenate($file, $raw_subfile); 485 my $raw_full_filename = &FileUtils::filenameConcatenate($this_file_base_dir, $raw_file_subfile); 464 486 465 487 if ($self->file_is_blocked($block_hash,$raw_full_filename)) { … … 512 534 # Re-read the files in the directory to see if there are any new files 513 535 last if (!opendir (DIR, $dirname)); 514 my @dirnow = readdir (DIR);536 my @dirnow = sort readdir (DIR); 515 537 map { $_ = &unicode::raw_filename_to_url_encoded($_) } @dirnow; 516 538 closedir (DIR); … … 540 562 my $this_file_base_dir = $base_dir; 541 563 my $raw_subfile = &unicode::url_encoded_to_raw_filename($subfile); 542 543 my $raw_file_subfile = &util::filename_cat($file, $raw_subfile); 564 # get the canonical unicode version of the filename. This may not match 565 # the filename on the file system. We will use it to compare to regex 566 # in the metadata table. 567 my $unicode_subfile = &util::raw_filename_to_unicode($dirname, $raw_subfile); 568 my $raw_file_subfile = &FileUtils::filenameConcatenate($file, $raw_subfile); 544 569 my $raw_full_filename 545 = & util::filename_cat($this_file_base_dir,$raw_file_subfile);570 = &FileUtils::filenameConcatenate($this_file_base_dir,$raw_file_subfile); 546 571 547 572 if ($self->file_is_blocked($block_hash,$raw_full_filename)) { … … 549 574 next; 550 575 } 551 #print STDERR "processing $raw_full_filename\n";576 print STDERR "** DirectoryPlugin processing $raw_full_filename\n"; 552 577 # Follow Windows shortcuts 553 if ($raw_subfile =~ /(?i)\.lnk$/ && $ENV{'GSDLOS'} =~ /^windows$/i) {578 if ($raw_subfile =~ m/(?i)\.lnk$/ && (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin"))) { 554 579 require Win32::Shortcut; 555 my $shortcut = new Win32::Shortcut(& util::filename_cat($dirname, $raw_subfile));580 my $shortcut = new Win32::Shortcut(&FileUtils::filenameConcatenate($dirname, $raw_subfile)); 556 581 if ($shortcut) { 557 582 # The file to be processed is now the target of the shortcut … … 592 617 } 593 618 594 # $subfile by this point is url-encoded => all ASCII chars => no need to encode as UTF8 595 596 # Next add metadata read in XML files (if it is supplied) 619 ### Now we need to look up the metadata table to see if there is any 620 # extra metadata for us. We need the canonical unicode version here. 597 621 if ($additionalmetadata == 1) { 598 622 foreach my $filespec (@extrametakeys) { 599 ## use the url-encoded filename to do the filename comparison 600 601 if ($subfile =~ /^$filespec$/) { 602 print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n" 623 if ($unicode_subfile =~ /^$filespec$/) { 624 print $outhandle "File \"$unicode_subfile\" matches filespec \"$filespec\"\n" 603 625 if ($verbosity > 2); 604 626 my $mdref = &extrametautil::getmetadata(\%extrametadata, $filespec); … … 624 646 if (defined $self->{'inf_timestamp'}) { 625 647 # Look to see if it's a completely new file 648 626 649 if (!$block_hash->{'new_files'}->{$raw_full_filename}) { 627 650 # Not a new file, must be an existing file 628 651 # Let' see if it's newer than the last import.pl 629 if (! &util::dir_exists($raw_full_filename)) { 652 653 654 if (! -d $raw_full_filename) { 630 655 if (!$block_hash->{'reindex_files'}->{$raw_full_filename}) { 631 656 # filename has been around for longer than inf_timestamp 632 print $outhandle "**** Skipping $ subfile\n" if ($verbosity >3);657 print $outhandle "**** Skipping $unicode_subfile\n" if ($verbosity >3); 633 658 next; 634 659 } … … 648 673 649 674 # Recursively read each $subfile 650 print $outhandle "DirectoryPlugin recurring: $ subfile\n" if ($verbosity > 2);675 print $outhandle "DirectoryPlugin recurring: $unicode_subfile\n" if ($verbosity > 2); 651 676 652 677 $count += &plugin::read ($pluginfo, $this_file_base_dir, … … 663 688 } 664 689 690 # Manifest files, version 2, provide an explicit listing of the documents to be 691 # processed by Greenstone. This allows a user to avoid expensive file tree 692 # searches - a crucial requirement for very-large scale collections and 693 # parallel processing. However, we still want to leverage the metadata parsing 694 # functionality found here in DirectoryPlugin. Thus we have this special call 695 # to read that expects a single file. The normal read function starts by 696 # listing the files in a given directory and then performs a number of actions 697 # over them (including recursing down into any further directories found). We 698 # circumvent that behaviour by 'pretending' to already have a directory listing 699 # containing at most two file - the file passed in, and an accompanying 700 # metadata.xml file if one exists. 701 sub read_for_manifest_v2 702 { 703 my $self = shift (@_); 704 my ($pluginfo, $file, $block_hash, $processor, $gli) = @_; 705 my $base_dir = ''; 706 my $in_metadata = {}; 707 my $maxdocs = -1; 708 my $total_count = 0; 709 # Ensure we have the full path of the file to process 710 my $full_path = $file; 711 if ($base_dir =~ /\w/) 712 { 713 $full_path = &FileUtils::filenameConcatenate($base_dir, $file); 714 } 715 # Unlike the vanilla read(), directories are unacceptable 716 if (!-f $full_path) 717 { 718 return 0; 719 } 720 # Now split the full path into a directory and a filename 721 my ($dirname, $the_file) = $full_path =~ /^(.*)\/([^\/]+)$/; 722 # We will prepopulate a 'directory listing' with this file 723 my @dir = ($the_file); 724 # See if there is an accompanying 725 my $metadata_xml_path = $dirname . '/metadata.xml'; 726 if (-f $metadata_xml_path) 727 { 728 unshift(@dir, 'metadata.xml'); 729 } 730 # Chain through to the normal read process, but with out 'forged' directory 731 # listing so as to avoid all the costs of actually listing / recursing. 732 my $count = $self->read_phase2($pluginfo, $dirname, \@dir, $base_dir, $dirname, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli); 733 # We don't return count, but test that it is 1 exactly. 734 if ($count != 1) 735 { 736 print STDERR "ERROR! The count of documents processed from a single call to DirectoryPlugin::read_for_manifest_v2() is not 1.\n"; 737 } 738 } 739 665 740 1;
Note:
See TracChangeset
for help on using the changeset viewer.