Index: other-projects/nightly-tasks/diffcol/trunk/diffcol/diffcol.pl
===================================================================
--- other-projects/nightly-tasks/diffcol/trunk/diffcol/diffcol.pl (revision 28085)
+++ other-projects/nightly-tasks/diffcol/trunk/diffcol/diffcol.pl (revision 28086)
@@ -542,5 +542,5 @@
# so far, only doc.xml files need special Windows processing (db files' OS-sensitivity are handled in gdbdiff.pm)
# Returns true if the doc.xml contains windows style slashes in the gsdlsourcefilename meta field
-sub isDocXMLFileWindows
+sub isDocOrMETSXMLFileWindows
{
my ($file_contents) = @_;
@@ -553,6 +553,6 @@
# for doc.xml:
# import/html_files/cleves.html
- if($file_contents =~ m@([^>]*)@m) {
- $gsdlsourcefilename = $1;
+ if($file_contents =~ m@<(.*?:)?Metadata name="gsdlsourcefilename">([^>]*)(.*?:)?Metadata>@m) {
+ $gsdlsourcefilename = $2;
if($gsdlsourcefilename =~ m/\\/) { # windows slashes detected.
return 1;
@@ -627,5 +627,5 @@
{
# allow for a namespace prefix to as happens in GreenstoneMETS docmets.xml files, e.g.
- my $ignore_line_re = "<(.*?:)?Metadata name=\"(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|ImageSize|FileSize)\">.*(.*?:)?Metadata>\\s*\\n*";
+ my $ignore_line_re = "<(.*?:)?Metadata name=\"(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|ImageSize|FileSize|ex.Composite.LightValue)\">.*(.*?:)?Metadata>\\s*\\n*";
my $strResult;
@@ -649,28 +649,31 @@
$model_contents =~ s/$ignore_line_re//g;
$test_contents =~ s/$ignore_line_re//g;
-
-
- # doc.xml needs to additionally be normalised, before comparing a windows test with a linux model or vice-versa
- if($strModel =~ m/doc\.xml$/) {
- # equalise/normalise the two doc.xml files for OS differences, if there are any
- my $testIsWin = &isDocXMLFileWindows($test_contents);
- my $modelIsWin = &isDocXMLFileWindows($model_contents);
-
- if($testIsWin != $modelIsWin) { # one of the 2 collections is built on windows, the other on linux, so need to make newlines constant
-
- my $win_contents = $testIsWin ? \$test_contents : \$model_contents;
- my $lin_contents = $testIsWin ? \$model_contents : \$test_contents;
-
- # remove all carriage returns \r - introduced into doc.xml by multiread after pdf converted to html
- $$win_contents =~ s@[\r]@@g;
-
- # make all single windows slashes into single unix slashes
- $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;
- # make windows \r newlines into constant \n newlines. Already handled when \r got replaced
- #$$win_contents =~ s@\r\n@\n@mg; # #http://stackoverflow.com/questions/650743/in-perl-how-to-do-you-remove-m-from-a-file
-
- #FOR MAC: old macs use CR carriage return (see http://www.perlmonks.org/?node_id=745018), so replace with \n?)
- # $$win_contents =~ s@\r@\n@mg;
-
+
+
+ # equalise/normalise the two doc.xml/docmets.xml files for OS differences, if there are any
+ # before comparing a windows test with a linux model or vice-versa
+ my $testIsWin = &isDocOrMETSXMLFileWindows($test_contents);
+ my $modelIsWin = &isDocOrMETSXMLFileWindows($model_contents);
+
+ if($testIsWin != $modelIsWin) { # one of the 2 collections is built on windows, the other on linux, so need to make newlines constant
+
+ my $win_contents = $testIsWin ? \$test_contents : \$model_contents;
+ my $lin_contents = $testIsWin ? \$model_contents : \$test_contents;
+
+ # remove all carriage returns \r - introduced into doc.xml by multiread after pdf converted to html
+ $$win_contents =~ s@[\r]@@g;
+
+ # make all single windows slashes into single unix slashes
+ # each match consumes the character following the backslash, so adjacent matches overlap and a second pass is required; otherwise import\3\3.pdf would become import/3\3.pdf
+ $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;
+ $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;
+
+ # make windows \r\n newlines into constant \n newlines. Already handled when every \r was removed
+ #$$win_contents =~ s@\r\n@\n@mg; # #http://stackoverflow.com/questions/650743/in-perl-how-to-do-you-remove-m-from-a-file
+
+ #FOR MAC: old macs use CR carriage return (see http://www.perlmonks.org/?node_id=745018), so replace with \n?)
+ # $$win_contents =~ s@\r@\n@mg;
+
+ if($strModel =~ m/doc\.xml$/) { # processing particular to doc.xml
# remove solitary, stray carriage returns \r in the linux doc.xml, as occurs in the tudor collection owing to the source material
# containing solitary carriage returns instead of linefeed
@@ -682,4 +685,5 @@
# Doing so is okay, since we're not modifying the doc.xml in the model or test collections, just normalising them in-memory for comparison
$$lin_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;
+ $$lin_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;
# Advanced Beatles collection,
@@ -687,55 +691,61 @@
# while windows contains: IMG SRC=_httpextlink_&rl=1&href=http://\\"http://www.boskowan.com/
# Normalising to windows version for doing a diff
- $$lin_contents =~ s@href=http:///@href=http://@g;
- }
-
-
- # tmp dirs have subdirs with random numbers in name, remove randomly named subdir portion of path
- # these tmpdirs are located inside the collection directory
- $model_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
- $test_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
-
- # remove all absolute paths upto collect folder from elements
- $model_contents =~ s@((http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;
- $test_contents =~ s@((http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;
-
- # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox
- # These tmpdirs are located inside the toplevel *greenstone* directory
- (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;
- $gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'};
- my $tmpfile_regex = "http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long
-
- if($test_contents =~ m@$tmpfile_regex@) {
- # found a match, replace the tmp file name with "random", keeping the original file extension
- # in
-
- my ($old_tmp_filename, $ext) = ($1, $2);
- my $new_tmp_filename = "random";
-
- ## The following does not work in the Multimedia collection, since there's a subfolder to tmp (the timestamp folder) which contains the output file.
- #$tmpfile_regex = "((http://)?)($gsdlhome_re)?(/tmp/)?$old_tmp_filename($ext)";
- $tmpfile_regex = "((http://)?)($gsdlhome_re)?(/tmp/)?.*?($ext)";
- if($5) {
- $test_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
- } else { # OrigSource contains only the filename
- $test_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
- }
-
- # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename
- $tmpfile_regex = "((http://)?)(.*)?(/tmp/)?.*?($ext)";
- if($5) {
- $model_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
- } else { # OrigSource contains only the filename
- $model_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
- }
- }
-
-# my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory
-# if($strModel =~ m/(HASH010d.dir)/) { # list the HASH dirs for which you want the doc.xml file generated
+ $$lin_contents =~ s@href=http:///@href=http://@g;
+ }
+ }
+
+ # processing particular to doc.xml
+ if($strModel =~ m/doc\.xml$/) {
+ # tmp dirs have subdirs with random numbers in name, remove randomly named subdir portion of path
+ # these tmpdirs are located inside the collection directory
+ $model_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
+ $test_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
+
+ # remove all absolute paths upto collect folder from elements
+ $model_contents =~ s@((http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;
+ $test_contents =~ s@((http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;
+
+ # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox
+ # These tmpdirs are located inside the toplevel *greenstone* directory
+ (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;
+ $gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'};
+ my $tmpfile_regex = "http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long
+
+ if($test_contents =~ m@$tmpfile_regex@) {
+ # found a match, replace the tmp file name with "random", keeping the original file extension
+ # in
+
+ my ($old_tmp_filename, $ext) = ($1, $2);
+ my $new_tmp_filename = "random";
+
+ ## The following does not work in the Multimedia collection, since there's a subfolder to tmp (the timestamp folder) which contains the output file.
+ #$tmpfile_regex = "((http://)?)($gsdlhome_re)?(/tmp/)?$old_tmp_filename($ext)";
+ $tmpfile_regex = "((http://)?)($gsdlhome_re)?(/tmp/)?.*?($ext)";
+ if($5) {
+ $test_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
+ } else { # OrigSource contains only the filename
+ $test_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
+ }
+
+ # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename
+ $tmpfile_regex = "((http://)?)(.*)?(/tmp/)?.*?($ext)";
+ if($5) {
+ $model_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
+ } else { # OrigSource contains only the filename
+ $model_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
+ }
+ }
+
+ } # finished special processing of doc.xml files
+
+ my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory
+# &gdbdiff::print_string_to_file($model_contents, $savepath."model_docmets.xml");
+# &gdbdiff::print_string_to_file($test_contents, $savepath."test_docmets.xml");
+# if($strModel =~ m/(HASH0164.dir)/) { # list the HASH dirs for which you want the doc.xml file generated
# &gdbdiff::print_string_to_file($model_contents, $savepath."$1_model_doc.xml");
# &gdbdiff::print_string_to_file($test_contents, $savepath."$1_test_doc.xml");
# }
- } # finished special processing of doc.xml files
+
# now can diff the normalised versions of the doc.xml/docmets.xml files:
Index: other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm
===================================================================
--- other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm (revision 28085)
+++ other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm (revision 28086)
@@ -66,5 +66,5 @@
# The total_numbytes field can vary depending on how many backslashes exist in the urls in the main body text, as each
# of these windows slashes get escaped with another backslash, and the resulting string is used as key into rel link db
- my $ignore_line_re = "\n<(FileSize|lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|total_numbytes)>([^\n])*";
+ my $ignore_line_re = "\n<(FileSize|lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|total_numbytes|ex.Composite.LightValue)>([^\n])*";
$model_text =~ s/$ignore_line_re//g;
$test_text =~ s/$ignore_line_re//g;
@@ -81,5 +81,6 @@
my $modelIsWin = &isDBWindowsSensitive($dbname, $model_text);
- if($testIsWin == $modelIsWin) { # both linux or both windows, do the basic test we did on linux machines:
+ if($testIsWin == $modelIsWin) {
+ # both linux or both windows, do the basic test we did on linux machines:
# ignore absolute path prefixes in modelcol and testcol (necessary for archiveinf-doc and -src.gdb files)
@@ -120,5 +121,5 @@
# assoc-file and meta-file contain filepaths, ensure these are long windows file paths now (will later convert to linux slashes)
- if($line =~ m@^<(assoc-file|meta-file)>(.*)(\s+)@s) {
+ if($line =~ m@^<(assoc-file|meta-file)>(.*)(\s+)@s) {
$line = $2; # may be a short file name
# perhaps test here if it is a shortfilename? should match /CAPS....~number(.ext)/
@@ -135,6 +136,6 @@
if($dbname =~ m/$strColName/) {
my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
- for my $line (split /^/, $$win_text) { # split the string into newlines
-
+ for my $line (split /^/, $$win_text) { # split the string into newlines
+
# In the following regex, add any .gdb fieldnames that represent a path and so would contain double backslashes
# on Windows (to escape the single backlash of win filepaths). They will be turned into single-backslashes here,
@@ -143,5 +144,5 @@
# E.g. On windows, the Word-PDF collection(s) contains double backslashes in the ex.File.Directory field
# the MARC-Exploded collection contains double backslashes in the null_file entry field of the .gdb file
- if($line =~ m@^<(ex.File.Directory|null_file)>(.*)@s) {
+ if($line =~ m@^<(ex.File.Directory|null_file)>(.*)@s) {
my ($fieldname, $escaped_path) = ($1, $2);
$escaped_path =~ s@\\\\@/@g; #(my $escaped_path = $2) =~ s@\\\\@\\@g;
@@ -282,4 +283,7 @@
return 1;
}
+ elsif ($db_contents =~ m@^([a-zA-Z]:\\\\)@m) { # C:\\path\\path for OAI collection
+ return 1;
+ }
return 0;
}