- Timestamp:
- 2013-08-19T20:42:40+12:00 (11 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/nightly-tasks/diffcol/trunk/diffcol/diffcol.pl
r28078 r28086 542 542 # so far, only doc.xml files need special Windows processing (db files' OS-sensitivity are handled in gdbdiff.pm) 543 543 # Returns true if the doc.xml contains windows style slashes in the gsdlsourcefilename meta field 544 sub isDoc XMLFileWindows544 sub isDocOrMETSXMLFileWindows 545 545 { 546 546 my ($file_contents) = @_; … … 553 553 # for doc.xml: 554 554 # <Metadata name="gsdlsourcefilename">import/html_files/cleves.html</Metadata> 555 if($file_contents =~ m@< Metadata name="gsdlsourcefilename">([^>]*)</Metadata>@m) {556 $gsdlsourcefilename = $ 1;555 if($file_contents =~ m@<(.*?:)?Metadata name="gsdlsourcefilename">([^>]*)</(.*?:)?Metadata>@m) { 556 $gsdlsourcefilename = $2; 557 557 if($gsdlsourcefilename =~ m/\\/) { # windows slashes detected. 558 558 return 1; … … 627 627 { 628 628 # allow for a namespace prefix to <Metadata> as happens in GreenstoneMETS docmets.xml files, e.g. <gsdl3:Metadata></gsdl3:Metadata> 629 my $ignore_line_re = "<(.*?:)?Metadata name=\"(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|ImageSize|FileSize )\">.*</(.*?:)?Metadata>\\s*\\n*";629 my $ignore_line_re = "<(.*?:)?Metadata name=\"(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|ImageSize|FileSize|ex.Composite.LightValue)\">.*</(.*?:)?Metadata>\\s*\\n*"; 630 630 631 631 my $strResult; … … 649 649 $model_contents =~ s/$ignore_line_re//g; 650 650 $test_contents =~ s/$ignore_line_re//g; 651 652 653 # doc.xml needs to additionally be normalised, before comparing a windows test with a linux model or vice-versa 654 if($strModel =~ m/doc\.xml$/) { 655 # equalise/normalise the two doc.xml files for OS differences, if there are any 656 my $testIsWin = &isDocXMLFileWindows($test_contents); 657 my $modelIsWin = &isDocXMLFileWindows($model_contents); 658 659 if($testIsWin != $modelIsWin) { # one of the 2 collections is built on windows, the 
other on linux, so need to make newlines constant 660 661 my $win_contents = $testIsWin ? \$test_contents : \$model_contents; 662 my $lin_contents = $testIsWin ? \$model_contents : \$test_contents; 663 664 # remove all carriage returns \r - introduced into doc.xml by multiread after pdf converted to html 665 $$win_contents =~ s@[\r]@@g; 666 667 # make all single windows slashes into single unix slashes 668 $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g; 669 # make windows \r newlines into constant \n newlines. Already handled when \r got replaced 670 #$$win_contents =~ s@\r\n@\n@mg; # #http://stackoverflow.com/questions/650743/in-perl-how-to-do-you-remove-m-from-a-file 671 672 #FOR MAC: old macs use CR carriage return (see http://www.perlmonks.org/?node_id=745018), so replace with \n?) 673 # $$win_contents =~ s@\r@\n@mg; 674 651 652 653 # equalise/normalise the two doc.xml/docmets.xml files for OS differences, if there are any 654 # before comparing a windows test with a linux model or vice-versa 655 my $testIsWin = &isDocOrMETSXMLFileWindows($test_contents); 656 my $modelIsWin = &isDocOrMETSXMLFileWindows($model_contents); 657 658 if($testIsWin != $modelIsWin) { # one of the 2 collections is built on windows, the other on linux, so need to make newlines constant 659 660 my $win_contents = $testIsWin ? \$test_contents : \$model_contents; 661 my $lin_contents = $testIsWin ? \$model_contents : \$test_contents; 662 663 # remove all carriage returns \r - introduced into doc.xml by multiread after pdf converted to html 664 $$win_contents =~ s@[\r]@@g; 665 666 # make all single windows slashes into single unix slashes 667 # the 1 char look-ahead requires a double pass, otherwise import\3\3.pdf will get replaced with import/3\3.pdf 668 $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g; 669 $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g; 670 671 # make windows \r newlines into constant \n newlines. 
Already handled when \r got replaced 672 #$$win_contents =~ s@\r\n@\n@mg; # #http://stackoverflow.com/questions/650743/in-perl-how-to-do-you-remove-m-from-a-file 673 674 #FOR MAC: old macs use CR carriage return (see http://www.perlmonks.org/?node_id=745018), so replace with \n?) 675 # $$win_contents =~ s@\r@\n@mg; 676 677 if($strModel =~ m/doc\.xml$/) { # processing particular to doc.xml 675 678 # remove solitary, stray carriage returns \r in the linux doc.xml, as occurs in the tudor collection owing to the source material 676 679 # containing solitary carriage returns instead of linefeed … … 682 685 # Doing so is okay, since we're not modifying the doc.xml in the model or test collections, just normalising them in-memory for comparison 683 686 $$lin_contents =~ s@([^\\])\\([^\\])@$1\/$2@g; 687 $$lin_contents =~ s@([^\\])\\([^\\])@$1\/$2@g; 684 688 685 689 # Advanced Beatles collection, … … 687 691 # while windows contains: IMG SRC=_httpextlink_&amp;rl=1&amp;href=http://\\"http://www.boskowan.com/ 688 692 # Normalising to windows version for doing a diff 689 $$lin_contents =~ s@href=http:///@href=http://@g; 690 } 691 692 693 # tmp dirs have subdirs with random numbers in name, remove randomly named subdir portion of path 694 # these tmpdirs are located inside the collection directory 695 $model_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g; 696 $test_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g; 697 698 # remove all absolute paths upto collect folder from <Metadata /> elements 699 $model_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g; 700 $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g; 701 702 # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox 703 # These tmpdirs are located inside the toplevel *greenstone* directory 704 (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g; 705 $gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'}; 706 
my $tmpfile_regex = "<Metadata name=\"URL\">http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})</Metadata>"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long 707 708 if($test_contents =~ m@$tmpfile_regex@) { 709 # found a match, replace the tmp file name with "random", keeping the original file extension 710 # in <Metadata name="OrigSource|URL|UTF8URL|gsdlconvertedfilename"> 711 712 my ($old_tmp_filename, $ext) = ($1, $2); 713 my $new_tmp_filename = "random"; 714 715 ## The following does not work in the Multimedia collection, since there's a subfolder to tmp (the timestamp folder) which contains the output file. 716 #$tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?$old_tmp_filename($ext</Metadata>)"; 717 $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?.*?($ext</Metadata>)"; 718 if($5) { 719 $test_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg; 720 } else { # OrigSource contains only the filename 721 $test_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg; 722 } 723 724 # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename 725 $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)(.*)?(/tmp/)?.*?($ext</Metadata>)"; 726 if($5) { 727 $model_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg; 728 } else { # OrigSource contains only the filename 729 $model_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg; 730 } 731 } 732 733 # my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory 734 # if($strModel =~ m/(HASH010d.dir)/) { # list the HASH dirs for which you want the doc.xml file generated 693 $$lin_contents =~ s@href=http:///@href=http://@g; 694 } 695 } 696 697 # processing particular to doc.xml 698 if($strModel =~ m/doc\.xml$/) { 
699 # tmp dirs have subdirs with random numbers in name, remove randomly named subdir portion of path 700 # these tmpdirs are located inside the collection directory 701 $model_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g; 702 $test_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g; 703 704 # remove all absolute paths upto collect folder from <Metadata /> elements 705 $model_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g; 706 $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g; 707 708 # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox 709 # These tmpdirs are located inside the toplevel *greenstone* directory 710 (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g; 711 $gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'}; 712 my $tmpfile_regex = "<Metadata name=\"URL\">http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})</Metadata>"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long 713 714 if($test_contents =~ m@$tmpfile_regex@) { 715 # found a match, replace the tmp file name with "random", keeping the original file extension 716 # in <Metadata name="OrigSource|URL|UTF8URL|gsdlconvertedfilename"> 717 718 my ($old_tmp_filename, $ext) = ($1, $2); 719 my $new_tmp_filename = "random"; 720 721 ## The following does not work in the Multimedia collection, since there's a subfolder to tmp (the timestamp folder) which contains the output file. 
722 #$tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?$old_tmp_filename($ext</Metadata>)"; 723 $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?.*?($ext</Metadata>)"; 724 if($5) { 725 $test_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg; 726 } else { # OrigSource contains only the filename 727 $test_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg; 728 } 729 730 # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename 731 $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)(.*)?(/tmp/)?.*?($ext</Metadata>)"; 732 if($5) { 733 $model_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg; 734 } else { # OrigSource contains only the filename 735 $model_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg; 736 } 737 } 738 739 } # finished special processing of doc.xml files 740 741 my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory 742 # &gdbdiff::print_string_to_file($model_contents, $savepath."model_docmets.xml"); 743 # &gdbdiff::print_string_to_file($test_contents, $savepath."test_docmets.xml"); 744 # if($strModel =~ m/(HASH0164.dir)/) { # list the HASH dirs for which you want the doc.xml file generated 735 745 # &gdbdiff::print_string_to_file($model_contents, $savepath."$1_model_doc.xml"); 736 746 # &gdbdiff::print_string_to_file($test_contents, $savepath."$1_test_doc.xml"); 737 747 # } 738 748 739 } # finished special processing of doc.xml files 749 740 750 741 751 # now can diff the normalised versions of the doc.xml/docmets.xml files:
Note: See TracChangeset for help on using the changeset viewer.