Changeset 28086
- Timestamp:
- 2013-08-19T20:42:40+12:00 (11 years ago)
- Location:
- other-projects/nightly-tasks/diffcol/trunk/diffcol
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/nightly-tasks/diffcol/trunk/diffcol/diffcol.pl
r28078 r28086 542 542 # so far, only doc.xml files need special Windows processing (db files' OS-sensitivity are handled in gdbdiff.pm) 543 543 # Returns true if the doc.xml contains windows style slashes in the gsdlsourcefilename meta field 544 sub isDoc XMLFileWindows544 sub isDocOrMETSXMLFileWindows 545 545 { 546 546 my ($file_contents) = @_; … … 553 553 # for doc.xml: 554 554 # <Metadata name="gsdlsourcefilename">import/html_files/cleves.html</Metadata> 555 if($file_contents =~ m@< Metadata name="gsdlsourcefilename">([^>]*)</Metadata>@m) {556 $gsdlsourcefilename = $ 1;555 if($file_contents =~ m@<(.*?:)?Metadata name="gsdlsourcefilename">([^>]*)</(.*?:)?Metadata>@m) { 556 $gsdlsourcefilename = $2; 557 557 if($gsdlsourcefilename =~ m/\\/) { # windows slashes detected. 558 558 return 1; … … 627 627 { 628 628 # allow for a namespace prefix to <Metadata> as happens in GreenstoneMETS docmets.xml files, e.g. <gsdl3:Metadata></gsdl3:Metadata> 629 my $ignore_line_re = "<(.*?:)?Metadata name=\"(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|ImageSize|FileSize )\">.*</(.*?:)?Metadata>\\s*\\n*";629 my $ignore_line_re = "<(.*?:)?Metadata name=\"(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|ImageSize|FileSize|ex.Composite.LightValue)\">.*</(.*?:)?Metadata>\\s*\\n*"; 630 630 631 631 my $strResult; … … 649 649 $model_contents =~ s/$ignore_line_re//g; 650 650 $test_contents =~ s/$ignore_line_re//g; 651 652 653 # doc.xml needs to additionally be normalised, before comparing a windows test with a linux model or vice-versa 654 if($strModel =~ m/doc\.xml$/) { 655 # equalise/normalise the two doc.xml files for OS differences, if there are any 656 my $testIsWin = &isDocXMLFileWindows($test_contents); 657 my $modelIsWin = &isDocXMLFileWindows($model_contents); 658 659 if($testIsWin != $modelIsWin) { # one of the 2 collections is built on windows, the other on linux, so need to make newlines constant 660 661 my $win_contents = $testIsWin ? \$test_contents : \$model_contents; 662 my $lin_contents = $testIsWin ? \$model_contents : \$test_contents; 663 664 # remove all carriage returns \r - introduced into doc.xml by multiread after pdf converted to html 665 $$win_contents =~ s@[\r]@@g; 666 667 # make all single windows slashes into single unix slashes 668 $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g; 669 # make windows \r newlines into constant \n newlines. Already handled when \r got replaced 670 #$$win_contents =~ s@\r\n@\n@mg; # #http://stackoverflow.com/questions/650743/in-perl-how-to-do-you-remove-m-from-a-file 671 672 #FOR MAC: old macs use CR carriage return (see http://www.perlmonks.org/?node_id=745018), so replace with \n?) 673 # $$win_contents =~ s@\r@\n@mg; 674 651 652 653 # equalise/normalise the two doc.xml/docmets.xml files for OS differences, if there are any 654 # before comparing a windows test with a linux model or vice-versa 655 my $testIsWin = &isDocOrMETSXMLFileWindows($test_contents); 656 my $modelIsWin = &isDocOrMETSXMLFileWindows($model_contents); 657 658 if($testIsWin != $modelIsWin) { # one of the 2 collections is built on windows, the other on linux, so need to make newlines constant 659 660 my $win_contents = $testIsWin ? \$test_contents : \$model_contents; 661 my $lin_contents = $testIsWin ? \$model_contents : \$test_contents; 662 663 # remove all carriage returns \r - introduced into doc.xml by multiread after pdf converted to html 664 $$win_contents =~ s@[\r]@@g; 665 666 # make all single windows slashes into single unix slashes 667 # the 1 char look-ahead requires a double pass, otherwise import\3\3.pdf will get replaced with import/3\3.pdf 668 $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g; 669 $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g; 670 671 # make windows \r newlines into constant \n newlines. Already handled when \r got replaced 672 #$$win_contents =~ s@\r\n@\n@mg; # #http://stackoverflow.com/questions/650743/in-perl-how-to-do-you-remove-m-from-a-file 673 674 #FOR MAC: old macs use CR carriage return (see http://www.perlmonks.org/?node_id=745018), so replace with \n?) 675 # $$win_contents =~ s@\r@\n@mg; 676 677 if($strModel =~ m/doc\.xml$/) { # processing particular to doc.xml 675 678 # remove solitary, stray carriage returns \r in the linux doc.xml, as occurs in the tudor collection owing to the source material 676 679 # containing solitary carriage returns instead of linefeed … … 682 685 # Doing so is okay, since we're not modifying the doc.xml in the model or test collections, just normalising them in-memory for comparison 683 686 $$lin_contents =~ s@([^\\])\\([^\\])@$1\/$2@g; 687 $$lin_contents =~ s@([^\\])\\([^\\])@$1\/$2@g; 684 688 685 689 # Advanced Beatles collection, … … 687 691 # while windows contains: IMG SRC=_httpextlink_&amp;rl=1&amp;href=http://\\"http://www.boskowan.com/ 688 692 # Normalising to windows version for doing a diff 689 $$lin_contents =~ s@href=http:///@href=http://@g; 690 } 691 692 693 # tmp dirs have subdirs with random numbers in name, remove randomly named subdir portion of path 694 # these tmpdirs are located inside the collection directory 695 $model_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g; 696 $test_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g; 697 698 # remove all absolute paths upto collect folder from <Metadata /> elements 699 $model_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g; 700 $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g; 701 702 # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox 703 # These tmpdirs are located inside the toplevel *greenstone* directory 704 (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g; 705 $gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'}; 706 my $tmpfile_regex = "<Metadata name=\"URL\">http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})</Metadata>"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long 707 708 if($test_contents =~ m@$tmpfile_regex@) { 709 # found a match, replace the tmp file name with "random", keeping the original file extension 710 # in <Metadata name="OrigSource|URL|UTF8URL|gsdlconvertedfilename"> 711 712 my ($old_tmp_filename, $ext) = ($1, $2); 713 my $new_tmp_filename = "random"; 714 715 ## The following does not work in the Multimedia collection, since there's a subfolder to tmp (the timestamp folder) which contains the output file. 716 #$tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?$old_tmp_filename($ext</Metadata>)"; 717 $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?.*?($ext</Metadata>)"; 718 if($5) { 719 $test_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg; 720 } else { # OrigSource contains only the filename 721 $test_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg; 722 } 723 724 # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename 725 $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)(.*)?(/tmp/)?.*?($ext</Metadata>)"; 726 if($5) { 727 $model_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg; 728 } else { # OrigSource contains only the filename 729 $model_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg; 730 } 731 } 732 733 # my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory 734 # if($strModel =~ m/(HASH010d.dir)/) { # list the HASH dirs for which you want the doc.xml file generated 693 $$lin_contents =~ s@href=http:///@href=http://@g; 694 } 695 } 696 697 # processing particular to doc.xml 698 if($strModel =~ m/doc\.xml$/) { 699 # tmp dirs have subdirs with random numbers in name, remove randomly named subdir portion of path 700 # these tmpdirs are located inside the collection directory 701 $model_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g; 702 $test_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g; 703 704 # remove all absolute paths upto collect folder from <Metadata /> elements 705 $model_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g; 706 $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g; 707 708 # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox 709 # These tmpdirs are located inside the toplevel *greenstone* directory 710 (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g; 711 $gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'}; 712 my $tmpfile_regex = "<Metadata name=\"URL\">http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})</Metadata>"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long 713 714 if($test_contents =~ m@$tmpfile_regex@) { 715 # found a match, replace the tmp file name with "random", keeping the original file extension 716 # in <Metadata name="OrigSource|URL|UTF8URL|gsdlconvertedfilename"> 717 718 my ($old_tmp_filename, $ext) = ($1, $2); 719 my $new_tmp_filename = "random"; 720 721 ## The following does not work in the Multimedia collection, since there's a subfolder to tmp (the timestamp folder) which contains the output file. 722 #$tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?$old_tmp_filename($ext</Metadata>)"; 723 $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?.*?($ext</Metadata>)"; 724 if($5) { 725 $test_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg; 726 } else { # OrigSource contains only the filename 727 $test_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg; 728 } 729 730 # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename 731 $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)(.*)?(/tmp/)?.*?($ext</Metadata>)"; 732 if($5) { 733 $model_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg; 734 } else { # OrigSource contains only the filename 735 $model_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg; 736 } 737 } 738 739 } # finished special processing of doc.xml files 740 741 my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory 742 # &gdbdiff::print_string_to_file($model_contents, $savepath."model_docmets.xml"); 743 # &gdbdiff::print_string_to_file($test_contents, $savepath."test_docmets.xml"); 744 # if($strModel =~ m/(HASH0164.dir)/) { # list the HASH dirs for which you want the doc.xml file generated 735 745 # &gdbdiff::print_string_to_file($model_contents, $savepath."$1_model_doc.xml"); 736 746 # &gdbdiff::print_string_to_file($test_contents, $savepath."$1_test_doc.xml"); 737 747 # } 738 748 739 } # finished special processing of doc.xml files 749 740 750 741 751 # now can diff the normalised versions of the doc.xml/docmets.xml files: -
other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm
r28071 r28086 66 66 # The total_numbytes field can vary depending on how many backslashes exist in the urls in the main body text, as each 67 67 # of these windows slashes get escaped with another backslash, and the resulting string is used as key into rel link db 68 my $ignore_line_re = "\n<(FileSize|lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|total_numbytes )>([^\n])*";68 my $ignore_line_re = "\n<(FileSize|lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|total_numbytes|ex.Composite.LightValue)>([^\n])*"; 69 69 $model_text =~ s/$ignore_line_re//g; 70 70 $test_text =~ s/$ignore_line_re//g; … … 81 81 my $modelIsWin = &isDBWindowsSensitive($dbname, $model_text); 82 82 83 if($testIsWin == $modelIsWin) { # both linux or both windows, do the basic test we did on linux machines: 83 if($testIsWin == $modelIsWin) { 84 # both linux or both windows, do the basic test we did on linux machines: 84 85 # ignore absolute path prefixes in modelcol and testcol (necessary for archiveinf-doc and -src.gdb files) 85 86 … … 120 121 121 122 # assoc-file and meta-file contain filepaths, ensure these are long windows file paths now (will later convert to linux slashes) 122 if($line =~ m@^<(assoc-file|meta-file)>(.*)(\s+)@s) { 123 if($line =~ m@^<(assoc-file|meta-file)>(.*)(\s+)@s) { 123 124 $line = $2; # may be a short file name 124 125 # perhaps test here if it is a shortfilename? should match /CAPS....~number(.ext)/ … … 135 136 if($dbname =~ m/$strColName/) { 136 137 my $tmp = ""; # rebuild windows file's set of lines after processing them one by one 137 for my $line (split /^/, $$win_text) { # split the string into newlines 138 138 for my $line (split /^/, $$win_text) { # split the string into newlines 139 139 140 # In the following regex, add any .gdb fieldnames that represent a path and so would contain double backslashes 140 141 # on Windows (to escape the single backlash of win filepaths). They will be turned into single-backslashes here, … … 143 144 # E.g. On windows, the Word-PDF collection(s) contains double backslashes in the ex.File.Directory field 144 145 # the MARC-Exploded collection contains double backslashes in the null_file entry field of the .gdb file 145 if($line =~ m@^<(ex.File.Directory|null_file)>(.*)@s) { 146 if($line =~ m@^<(ex.File.Directory|null_file)>(.*)@s) { 146 147 my ($fieldname, $escaped_path) = ($1, $2); 147 148 $escaped_path =~ s@\\\\@/@g; #(my $escaped_path = $2) =~ s@\\\\@\\@g; … … 282 283 return 1; 283 284 } 285 elsif ($db_contents =~ m@^(<ex.File.Directory>[a-zA-Z]:\\\\)@m) { # <ex.File.Directory>C:\\path\\path for OAI collection 286 return 1; 287 } 284 288 return 0; 285 289 }
Note:
See TracChangeset
for help on using the changeset viewer.