Show
Ignore:
Timestamp:
19.08.2013 20:42:40 (6 years ago)
Author:
ak19
Message:

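Bringing the Windows diffcol up to date for the latest tutorials. A new metadata field, ex.Composite.LightValue, can have a slightly different value between builds, so it is now ignored when diffing. Detecting whether a gdb file is Windows-sensitive needs to be tested differently when the input collection is OAI. Backslashes in docmets.xml files need to be normalised.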

Location:
other-projects/nightly-tasks/diffcol/trunk/diffcol
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • other-projects/nightly-tasks/diffcol/trunk/diffcol/diffcol.pl

    r28078 r28086  
    542542# so far, only doc.xml files need special Windows processing (db files' OS-sensitivity are handled in gdbdiff.pm) 
    543543# Returns true if the doc.xml contains windows style slashes in the gsdlsourcefilename meta field 
    544 sub isDocXMLFileWindows 
     544sub isDocOrMETSXMLFileWindows 
    545545{ 
    546546    my ($file_contents) = @_; 
     
    553553    # for doc.xml: 
    554554    #     <Metadata name="gsdlsourcefilename">import/html_files/cleves.html</Metadata> 
    555     if($file_contents =~ m@<Metadata name="gsdlsourcefilename">([^>]*)</Metadata>@m) { 
    556         $gsdlsourcefilename = $1; 
     555    if($file_contents =~ m@<(.*?:)?Metadata name="gsdlsourcefilename">([^>]*)</(.*?:)?Metadata>@m) { 
     556        $gsdlsourcefilename = $2; 
    557557        if($gsdlsourcefilename =~ m/\\/) { # windows slashes detected. 
    558558            return 1; 
     
    627627    { 
    628628        # allow for a namespace prefix to <Metadata> as happens in GreenstoneMETS docmets.xml files, e.g. <gsdl3:Metadata></gsdl3:Metadata> 
    629         my $ignore_line_re = "<(.*?:)?Metadata name=\"(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|ImageSize|FileSize)\">.*</(.*?:)?Metadata>\\s*\\n*"; 
     629        my $ignore_line_re = "<(.*?:)?Metadata name=\"(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|ImageSize|FileSize|ex.Composite.LightValue)\">.*</(.*?:)?Metadata>\\s*\\n*"; 
    630630         
    631631        my $strResult; 
     
    649649        $model_contents =~ s/$ignore_line_re//g; 
    650650        $test_contents =~ s/$ignore_line_re//g; 
    651          
    652  
    653         # doc.xml needs to additionally be normalised, before comparing a windows test with a linux model or vice-versa 
    654         if($strModel =~ m/doc\.xml$/) { 
    655             # equalise/normalise the two doc.xml files for OS differences, if there are any 
    656             my $testIsWin = &isDocXMLFileWindows($test_contents); 
    657             my $modelIsWin = &isDocXMLFileWindows($model_contents); 
    658              
    659             if($testIsWin != $modelIsWin) { # one of the 2 collections is built on windows, the other on linux, so need to make newlines constant 
    660              
    661                 my $win_contents = $testIsWin ? \$test_contents : \$model_contents; 
    662                 my $lin_contents = $testIsWin ? \$model_contents : \$test_contents; 
    663                  
    664                 # remove all carriage returns \r - introduced into doc.xml by multiread after pdf converted to html 
    665                 $$win_contents =~ s@[\r]@@g; 
    666              
    667                 # make all single windows slashes into single unix slashes 
    668                 $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g; 
    669                 # make windows \r newlines into constant \n newlines. Already handled when \r got replaced 
    670                 #$$win_contents =~ s@\r\n@\n@mg; # #http://stackoverflow.com/questions/650743/in-perl-how-to-do-you-remove-m-from-a-file 
    671                  
    672                 #FOR MAC: old macs use CR carriage return (see http://www.perlmonks.org/?node_id=745018), so replace with \n?) 
    673                 # $$win_contents =~ s@\r@\n@mg; 
    674                  
     651 
     652 
     653        # equalise/normalise the two doc.xml/docmets.xml files for OS differences, if there are any 
     654        # before comparing a windows test with a linux model or vice-versa 
     655        my $testIsWin = &isDocOrMETSXMLFileWindows($test_contents); 
     656        my $modelIsWin = &isDocOrMETSXMLFileWindows($model_contents); 
     657         
     658        if($testIsWin != $modelIsWin) { # one of the 2 collections is built on windows, the other on linux, so need to make newlines constant 
     659         
     660            my $win_contents = $testIsWin ? \$test_contents : \$model_contents; 
     661            my $lin_contents = $testIsWin ? \$model_contents : \$test_contents; 
     662             
     663            # remove all carriage returns \r - introduced into doc.xml by multiread after pdf converted to html 
     664            $$win_contents =~ s@[\r]@@g;             
     665         
     666            # make all single windows slashes into single unix slashes 
     667            # the 1 char look-ahead requires a double pass, otherwise import\3\3.pdf will get replaced with import/3\3.pdf 
     668            $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g; 
     669            $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;               
     670             
     671            # make windows \r newlines into constant \n newlines. Already handled when \r got replaced 
     672            #$$win_contents =~ s@\r\n@\n@mg; # #http://stackoverflow.com/questions/650743/in-perl-how-to-do-you-remove-m-from-a-file 
     673             
     674            #FOR MAC: old macs use CR carriage return (see http://www.perlmonks.org/?node_id=745018), so replace with \n?) 
     675            # $$win_contents =~ s@\r@\n@mg; 
     676             
     677            if($strModel =~ m/doc\.xml$/) { # processing particular to doc.xml 
    675678                # remove solitary, stray carriage returns \r in the linux doc.xml, as occurs in the tudor collection owing to the source material 
    676679                # containing solitary carriage returns instead of linefeed 
     
    682685                # Doing so is okay, since we're not modifying the doc.xml in the model or test collections, just normalising them in-memory for comparison 
    683686                $$lin_contents =~ s@([^\\])\\([^\\])@$1\/$2@g; 
     687                $$lin_contents =~ s@([^\\])\\([^\\])@$1\/$2@g; 
    684688                 
    685689                # Advanced Beatles collection,  
     
    687691                # while windows contains: IMG SRC=_httpextlink_&amp;amp;rl=1&amp;amp;href=http://\\&quot;http://www.boskowan.com/ 
    688692                # Normalising to windows version for doing a diff 
    689                 $$lin_contents =~ s@href=http:///@href=http://@g; 
    690             } 
    691              
    692          
    693         # tmp dirs have subdirs with random numbers in name, remove randomly named subdir portion of path 
    694         # these tmpdirs are located inside the collection directory 
    695         $model_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;  
    696         $test_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g; 
    697          
    698         # remove all absolute paths upto collect folder from <Metadata /> elements 
    699         $model_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g; 
    700         $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;         
    701          
    702         # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox 
    703         # These tmpdirs are located inside the toplevel *greenstone* directory 
    704         (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g; 
    705         $gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'}; 
    706         my $tmpfile_regex = "<Metadata name=\"URL\">http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})</Metadata>"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long   
    707          
    708         if($test_contents =~ m@$tmpfile_regex@) {            
    709             # found a match, replace the tmp file name with "random", keeping the original file extension  
    710             # in <Metadata name="OrigSource|URL|UTF8URL|gsdlconvertedfilename"> 
    711          
    712             my ($old_tmp_filename, $ext) = ($1, $2);             
    713             my $new_tmp_filename = "random";             
    714              
    715             ## The following does not work in the Multimedia collection, since there's a subfolder to tmp (the timestamp folder) which contains the output file. 
    716             #$tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?$old_tmp_filename($ext</Metadata>)"; 
    717             $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?.*?($ext</Metadata>)"; 
    718             if($5) {  
    719                 $test_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg; 
    720             } else { # OrigSource contains only the filename 
    721                 $test_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg; 
    722             } 
    723              
    724             # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename            
    725             $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)(.*)?(/tmp/)?.*?($ext</Metadata>)"; 
    726             if($5) {  
    727                 $model_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg; 
    728             } else { # OrigSource contains only the filename 
    729                 $model_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg; 
    730             } 
    731         } 
    732          
    733 #       my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory        
    734 #       if($strModel =~ m/(HASH010d.dir)/) { # list the HASH dirs for which you want the doc.xml file generated 
     693                $$lin_contents =~ s@href=http:///@href=http://@g;                
     694            }    
     695        } 
     696         
     697        # processing particular to doc.xml   
     698        if($strModel =~ m/doc\.xml$/) { 
     699            # tmp dirs have subdirs with random numbers in name, remove randomly named subdir portion of path 
     700            # these tmpdirs are located inside the collection directory 
     701            $model_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;  
     702            $test_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g; 
     703             
     704            # remove all absolute paths upto collect folder from <Metadata /> elements 
     705            $model_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g; 
     706            $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;         
     707             
     708            # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox 
     709            # These tmpdirs are located inside the toplevel *greenstone* directory 
     710            (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g; 
     711            $gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'}; 
     712            my $tmpfile_regex = "<Metadata name=\"URL\">http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})</Metadata>"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long   
     713             
     714            if($test_contents =~ m@$tmpfile_regex@) {            
     715                # found a match, replace the tmp file name with "random", keeping the original file extension  
     716                # in <Metadata name="OrigSource|URL|UTF8URL|gsdlconvertedfilename"> 
     717             
     718                my ($old_tmp_filename, $ext) = ($1, $2);             
     719                my $new_tmp_filename = "random";             
     720                 
     721                ## The following does not work in the Multimedia collection, since there's a subfolder to tmp (the timestamp folder) which contains the output file. 
     722                #$tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?$old_tmp_filename($ext</Metadata>)"; 
     723                $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?.*?($ext</Metadata>)"; 
     724                if($5) {  
     725                    $test_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg; 
     726                } else { # OrigSource contains only the filename 
     727                    $test_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg; 
     728                } 
     729                 
     730                # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename            
     731                $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)(.*)?(/tmp/)?.*?($ext</Metadata>)"; 
     732                if($5) {  
     733                    $model_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg; 
     734                } else { # OrigSource contains only the filename 
     735                    $model_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg; 
     736                } 
     737            } 
     738 
     739        } # finished special processing of doc.xml files 
     740         
     741        my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory        
     742#       &gdbdiff::print_string_to_file($model_contents, $savepath."model_docmets.xml"); 
     743#       &gdbdiff::print_string_to_file($test_contents, $savepath."test_docmets.xml"); 
     744#       if($strModel =~ m/(HASH0164.dir)/) { # list the HASH dirs for which you want the doc.xml file generated 
    735745#       &gdbdiff::print_string_to_file($model_contents, $savepath."$1_model_doc.xml"); 
    736746#       &gdbdiff::print_string_to_file($test_contents, $savepath."$1_test_doc.xml"); 
    737747#       } 
    738748         
    739         } # finished special processing of doc.xml files 
     749 
    740750         
    741751        # now can diff the normalised versions of the doc.xml/docmets.xml files: 
  • other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm

    r28071 r28086  
    6666    # The total_numbytes field can vary depending on how many backslashes exist in the urls in the main body text, as each 
    6767    # of these windows slashes get escaped with another backslash, and the resulting string is used as key into rel link db 
    68     my $ignore_line_re = "\n<(FileSize|lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|total_numbytes)>([^\n])*"; 
     68    my $ignore_line_re = "\n<(FileSize|lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|total_numbytes|ex.Composite.LightValue)>([^\n])*"; 
    6969    $model_text =~ s/$ignore_line_re//g; 
    7070    $test_text =~ s/$ignore_line_re//g; 
     
    8181    my $modelIsWin = &isDBWindowsSensitive($dbname, $model_text); 
    8282     
    83     if($testIsWin == $modelIsWin) { # both linux or both windows, do the basic test we did on linux machines: 
     83    if($testIsWin == $modelIsWin) {      
     84    # both linux or both windows, do the basic test we did on linux machines: 
    8485            # ignore absolute path prefixes in modelcol and testcol (necessary for archiveinf-doc and -src.gdb files) 
    8586 
     
    120121                 
    121122                # assoc-file and meta-file contain filepaths, ensure these are long windows file paths now (will later convert to linux slashes)     
    122                 if($line =~ m@^<(assoc-file|meta-file)>(.*)(\s+)@s) {  
     123                if($line =~ m@^<(assoc-file|meta-file)>(.*)(\s+)@s) { 
    123124                    $line = $2; # may be a short file name 
    124125                    # perhaps test here if it is a shortfilename? should match /CAPS....~number(.ext)/ 
     
    135136        if($dbname =~ m/$strColName/) { 
    136137            my $tmp = ""; # rebuild windows file's set of lines after processing them one by one 
    137             for my $line (split /^/, $$win_text) { # split the string into newlines 
    138              
     138            for my $line (split /^/, $$win_text) { # split the string into newlines          
     139 
    139140                # In the following regex, add any .gdb fieldnames that represent a path and so would contain double backslashes  
    140141                # on Windows (to escape the single backlash of win filepaths). They will be turned into single-backslashes here,  
     
    143144                # E.g. On windows, the Word-PDF collection(s) contains double backslashes in the ex.File.Directory field 
    144145                # the MARC-Exploded collection contains double backslashes in the null_file entry field of the .gdb file                 
    145                 if($line =~ m@^<(ex.File.Directory|null_file)>(.*)@s) {                  
     146                if($line =~ m@^<(ex.File.Directory|null_file)>(.*)@s) { 
    146147                    my ($fieldname, $escaped_path) = ($1, $2); 
    147148                    $escaped_path =~ s@\\\\@/@g; #(my $escaped_path = $2) =~ s@\\\\@\\@g; 
     
    282283            return 1; 
    283284        } 
     285        elsif ($db_contents =~ m@^(<ex.File.Directory>[a-zA-Z]:\\\\)@m) { # <ex.File.Directory>C:\\path\\path for OAI collection 
     286            return 1; 
     287        } 
    284288        return 0;        
    285289    }