Context Navigation

← Previous Change
Next Change →

Changeset 28086 for other-projects/nightly-tasks

Timestamp:

2013-08-19T20:42:40+12:00 (11 years ago)

Author:

ak19

Message:

Bringing windows diffcol up to date for the latest tutorials. A new field (ex.Composite.LightValue), whose value can differ slightly, is now ignored when diffing. Need to test for windows gdb differently when the input collection is OAI. Backslashes in docmets need to be normalised.

Location:

other-projects/nightly-tasks/diffcol/trunk/diffcol

Files:

: 2 edited

diffcol.pl (modified) (6 diffs)
gdbdiff.pm (modified) (6 diffs)

Legend:

: Unmodified
: Added
: Removed

other-projects/nightly-tasks/diffcol/trunk/diffcol/diffcol.pl

-              r28078
+              r28086
 # so far, only doc.xml files need special Windows processing (the OS-sensitivity of db files is handled in gdbdiff.pm)
 # Returns true if the doc.xml contains windows style slashes in the gsdlsourcefilename meta field
 sub isDocXMLFileWindows
+sub isDocOrMETSXMLFileWindows
+{
     my ($file_contents) = @_;
 …
     # for doc.xml:
     #     <Metadata name="gsdlsourcefilename">import/html_files/cleves.html</Metadata>
     if($file_contents =~ m@<Metadata name="gsdlsourcefilename">([^>]*)</Metadata>@m) {
         $gsdlsourcefilename = $1;
+    if($file_contents =~ m@<(.*?:)?Metadata name="gsdlsourcefilename">([^>]*)</(.*?:)?Metadata>@m) {
+        $gsdlsourcefilename = $2;
         if($gsdlsourcefilename =~ m/\\/) { # windows slashes detected.
             return 1;
 …
+    {
         # allow for a namespace prefix to <Metadata> as happens in GreenstoneMETS docmets.xml files, e.g. <gsdl3:Metadata></gsdl3:Metadata>
         my $ignore_line_re = "<(.*?:)?Metadata name=\"(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|ImageSize|FileSize)\">.*</(.*?:)?Metadata>\\s*\\n*";
+        my $ignore_line_re = "<(.*?:)?Metadata name=\"(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|ImageSize|FileSize|ex.Composite.LightValue)\">.*</(.*?:)?Metadata>\\s*\\n*";
         my $strResult;
 …
         $model_contents =~ s/$ignore_line_re//g;
         $test_contents =~ s/$ignore_line_re//g;
+        # doc.xml needs to additionally be normalised, before comparing a windows test with a linux model or vice-versa
+        if($strModel =~ m/doc\.xml$/) {
+            # equalise/normalise the two doc.xml files for OS differences, if there are any
+            my $testIsWin = &isDocXMLFileWindows($test_contents);
+            my $modelIsWin = &isDocXMLFileWindows($model_contents);
+            if($testIsWin != $modelIsWin) { # one of the 2 collections is built on windows, the other on linux, so need to make newlines constant
+                my $win_contents = $testIsWin ? \$test_contents : \$model_contents;
+                my $lin_contents = $testIsWin ? \$model_contents : \$test_contents;
+                # remove all carriage returns \r - introduced into doc.xml by multiread after pdf converted to html
+                $$win_contents =~ s@[\r]@@g;
+                # make all single windows slashes into single unix slashes
+                $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;
+                # make windows \r newlines into constant \n newlines. Already handled when \r got replaced
+                #$$win_contents =~ s@\r\n@\n@mg; # #http://stackoverflow.com/questions/650743/in-perl-how-to-do-you-remove-m-from-a-file
+                #FOR MAC: old macs use CR carriage return (see http://www.perlmonks.org/?node_id=745018), so replace with \n?)
+                # $$win_contents =~ s@\r@\n@mg;
+        # equalise/normalise the two doc.xml/docmets.xml files for OS differences, if there are any
+        # before comparing a windows test with a linux model or vice-versa
+        my $testIsWin = &isDocOrMETSXMLFileWindows($test_contents);
+        my $modelIsWin = &isDocOrMETSXMLFileWindows($model_contents);
+        if($testIsWin != $modelIsWin) { # one of the 2 collections is built on windows, the other on linux, so need to make newlines constant
+            my $win_contents = $testIsWin ? \$test_contents : \$model_contents;
+            my $lin_contents = $testIsWin ? \$model_contents : \$test_contents;
+            # remove all carriage returns \r - introduced into doc.xml by multiread after pdf converted to html
+            $$win_contents =~ s@[\r]@@g;
+            # make all single windows slashes into single unix slashes
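+            # the pattern also consumes the character following each backslash (it is not a true look-ahead), so adjacent matches overlap
+            # and a second pass is required, otherwise import\3\3.pdf will get replaced with import/3\3.pdf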
+            $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;
+            $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;
+            # make windows \r newlines into constant \n newlines. Already handled when \r got replaced
+            #$$win_contents =~ s@\r\n@\n@mg; # #http://stackoverflow.com/questions/650743/in-perl-how-to-do-you-remove-m-from-a-file
+            #FOR MAC: old macs use CR carriage return (see http://www.perlmonks.org/?node_id=745018), so replace with \n?)
+            # $$win_contents =~ s@\r@\n@mg;
+            if($strModel =~ m/doc\.xml$/) { # processing particular to doc.xml
                 # remove solitary, stray carriage returns \r in the linux doc.xml, as occurs in the tudor collection owing to the source material
                 # containing solitary carriage returns instead of linefeed
 …
                 # Doing so is okay, since we're not modifying the doc.xml in the model or test collections, just normalising them in-memory for comparison
                 $$lin_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;
+                $$lin_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;
                 # Advanced Beatles collection,
 …
                 # while windows contains: IMG SRC=_httpextlink_&amp;amp;rl=1&amp;amp;href=http://\\&quot;http://www.boskowan.com/
                 # Normalising to windows version for doing a diff
+                $$lin_contents =~ s@href=http:///@href=http://@g;
+            }
+        # tmp dirs have subdirs with random numbers in name, remove randomly named subdir portion of path
+        # these tmpdirs are located inside the collection directory
+        $model_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
+        $test_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
+        # remove all absolute paths up to the collect folder from <Metadata /> elements
+        $model_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;
+        $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;
+        # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox
+        # These tmpdirs are located inside the toplevel *greenstone* directory
+        (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;
+        $gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'};
+        my $tmpfile_regex = "<Metadata name=\"URL\">http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})</Metadata>"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long
+        if($test_contents =~ m@$tmpfile_regex@) {
+            # found a match, replace the tmp file name with "random", keeping the original file extension
+            # in <Metadata name="OrigSource|URL|UTF8URL|gsdlconvertedfilename">
+            my ($old_tmp_filename, $ext) = ($1, $2);
+            my $new_tmp_filename = "random";
+            ## The following does not work in the Multimedia collection, since there's a subfolder to tmp (the timestamp folder) which contains the output file.
+            #$tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?$old_tmp_filename($ext</Metadata>)";
+            $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?.*?($ext</Metadata>)";
+            if($5) {
+                $test_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
+            } else { # OrigSource contains only the filename
+                $test_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
+            }
+            # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename
+            $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)(.*)?(/tmp/)?.*?($ext</Metadata>)";
+            if($5) {
+                $model_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
+            } else { # OrigSource contains only the filename
+                $model_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
+            }
+        }
+#       my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory
+#       if($strModel =~ m/(HASH010d.dir)/) { # list the HASH dirs for which you want the doc.xml file generated
+                $$lin_contents =~ s@href=http:///@href=http://@g;
+            }
+        }
+        # processing particular to doc.xml
+        if($strModel =~ m/doc\.xml$/) {
+            # tmp dirs have subdirs with random numbers in name, remove randomly named subdir portion of path
+            # these tmpdirs are located inside the collection directory
+            $model_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
+            $test_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
+            # remove all absolute paths up to the collect folder from <Metadata /> elements
+            $model_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;
+            $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;
+            # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox
+            # These tmpdirs are located inside the toplevel *greenstone* directory
+            (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;
+            $gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'};
+            my $tmpfile_regex = "<Metadata name=\"URL\">http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})</Metadata>"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long
+            if($test_contents =~ m@$tmpfile_regex@) {
+                # found a match, replace the tmp file name with "random", keeping the original file extension
+                # in <Metadata name="OrigSource|URL|UTF8URL|gsdlconvertedfilename">
+                my ($old_tmp_filename, $ext) = ($1, $2);
+                my $new_tmp_filename = "random";
+                ## The following does not work in the Multimedia collection, since there's a subfolder to tmp (the timestamp folder) which contains the output file.
+                #$tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?$old_tmp_filename($ext</Metadata>)";
+                $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?.*?($ext</Metadata>)";
+                if($5) {
+                    $test_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
+                } else { # OrigSource contains only the filename
+                    $test_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
+                }
+                # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename
+                $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)(.*)?(/tmp/)?.*?($ext</Metadata>)";
+                if($5) {
+                    $model_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
+                } else { # OrigSource contains only the filename
+                    $model_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
+                }
+            }
+        } # finished special processing of doc.xml files
+        my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory
+#       &gdbdiff::print_string_to_file($model_contents, $savepath."model_docmets.xml");
+#       &gdbdiff::print_string_to_file($test_contents, $savepath."test_docmets.xml");
+#       if($strModel =~ m/(HASH0164.dir)/) { # list the HASH dirs for which you want the doc.xml file generated
 #       &gdbdiff::print_string_to_file($model_contents, $savepath."$1_model_doc.xml");
 #       &gdbdiff::print_string_to_file($test_contents, $savepath."$1_test_doc.xml");
 #       }
+        } # finished special processing of doc.xml files
         # now can diff the normalised versions of the doc.xml/docmets.xml files:

other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm

-              r28071
+              r28086
     # The total_numbytes field can vary depending on how many backslashes exist in the urls in the main body text, as each
     # of these windows slashes get escaped with another backslash, and the resulting string is used as key into rel link db
     my $ignore_line_re = "\n<(FileSize|lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|total_numbytes)>([^\n])*";
+    my $ignore_line_re = "\n<(FileSize|lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|total_numbytes|ex.Composite.LightValue)>([^\n])*";
     $model_text =~ s/$ignore_line_re//g;
     $test_text =~ s/$ignore_line_re//g;
 …
     my $modelIsWin = &isDBWindowsSensitive($dbname, $model_text);
+    if($testIsWin == $modelIsWin) { # both linux or both windows, do the basic test we did on linux machines:
+    if($testIsWin == $modelIsWin) {
+    # both linux or both windows, do the basic test we did on linux machines:
             # ignore absolute path prefixes in modelcol and testcol (necessary for archiveinf-doc and -src.gdb files)
 …
                 # assoc-file and meta-file contain filepaths, ensure these are long windows file paths now (will later convert to linux slashes)
                 if($line =~ m@^<(assoc-file|meta-file)>(.*)(\s+)@s) {
+                if($line =~ m@^<(assoc-file|meta-file)>(.*)(\s+)@s) {
                     $line = $2; # may be a short file name
                     # perhaps test here if it is a shortfilename? should match /CAPS....~number(.ext)/
 …
         if($dbname =~ m/$strColName/) {
             my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
             for my $line (split /^/, $$win_text) { # split the string into newlines
+            for my $line (split /^/, $$win_text) { # split the string into newlines
                 # In the following regex, add any .gdb fieldnames that represent a path and so would contain double backslashes
                 # on Windows (to escape the single backslash of win filepaths). They will be turned into single-backslashes here,
 …
                 # E.g. On windows, the Word-PDF collection(s) contains double backslashes in the ex.File.Directory field
                 # the MARC-Exploded collection contains double backslashes in the null_file entry field of the .gdb file
                 if($line =~ m@^<(ex.File.Directory|null_file)>(.*)@s) {
+                if($line =~ m@^<(ex.File.Directory|null_file)>(.*)@s) {
                     my ($fieldname, $escaped_path) = ($1, $2);
                     $escaped_path =~ s@\\\\@/@g; #(my $escaped_path = $2) =~ s@\\\\@\\@g;
 …
             return 1;
+        }
+        elsif ($db_contents =~ m@^(<ex.File.Directory>[a-zA-Z]:\\\\)@m) { # <ex.File.Directory>C:\\path\\path for OAI collection
+            return 1;
+        }
         return 0;
+    }

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 28086 for other-projects/nightly-tasks

Legend:

other-projects/nightly-tasks/diffcol/trunk/diffcol/diffcol.pl

other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm

Download in other formats: