Ignore:
Timestamp:
2013-08-19T20:42:40+12:00 (11 years ago)
Author:
ak19
Message:

Bringing the Windows diffcol up to date for the latest tutorials. A new metadata field, ex.Composite.LightValue, whose value can be slightly different, is now added to the ignore patterns. The test for a Windows gdb needs to be done differently when the input collection is OAI. Backslashes in docmets.xml files need to be normalised.

Location:
other-projects/nightly-tasks/diffcol/trunk/diffcol
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • other-projects/nightly-tasks/diffcol/trunk/diffcol/diffcol.pl

    r28078 r28086  
    542542# so far, only doc.xml files need special Windows processing (db files' OS-sensitivity are handled in gdbdiff.pm)
    543543# Returns true if the doc.xml contains windows style slashes in the gsdlsourcefilename meta field
    544 sub isDocXMLFileWindows
     544sub isDocOrMETSXMLFileWindows
    545545{
    546546    my ($file_contents) = @_;
     
    553553    # for doc.xml:
    554554    #     <Metadata name="gsdlsourcefilename">import/html_files/cleves.html</Metadata>
    555     if($file_contents =~ m@<Metadata name="gsdlsourcefilename">([^>]*)</Metadata>@m) {
    556         $gsdlsourcefilename = $1;
     555    if($file_contents =~ m@<(.*?:)?Metadata name="gsdlsourcefilename">([^>]*)</(.*?:)?Metadata>@m) {
     556        $gsdlsourcefilename = $2;
    557557        if($gsdlsourcefilename =~ m/\\/) { # windows slashes detected.
    558558            return 1;
     
    627627    {
    628628        # allow for a namespace prefix to <Metadata> as happens in GreenstoneMETS docmets.xml files, e.g. <gsdl3:Metadata></gsdl3:Metadata>
    629         my $ignore_line_re = "<(.*?:)?Metadata name=\"(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|ImageSize|FileSize)\">.*</(.*?:)?Metadata>\\s*\\n*";
     629        my $ignore_line_re = "<(.*?:)?Metadata name=\"(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|ImageSize|FileSize|ex.Composite.LightValue)\">.*</(.*?:)?Metadata>\\s*\\n*";
    630630       
    631631        my $strResult;
     
    649649        $model_contents =~ s/$ignore_line_re//g;
    650650        $test_contents =~ s/$ignore_line_re//g;
    651        
    652 
    653         # doc.xml needs to additionally be normalised, before comparing a windows test with a linux model or vice-versa
    654         if($strModel =~ m/doc\.xml$/) {
    655             # equalise/normalise the two doc.xml files for OS differences, if there are any
    656             my $testIsWin = &isDocXMLFileWindows($test_contents);
    657             my $modelIsWin = &isDocXMLFileWindows($model_contents);
    658            
    659             if($testIsWin != $modelIsWin) { # one of the 2 collections is built on windows, the other on linux, so need to make newlines constant
    660            
    661                 my $win_contents = $testIsWin ? \$test_contents : \$model_contents;
    662                 my $lin_contents = $testIsWin ? \$model_contents : \$test_contents;
    663                
    664                 # remove all carriage returns \r - introduced into doc.xml by multiread after pdf converted to html
    665                 $$win_contents =~ s@[\r]@@g;
    666            
    667                 # make all single windows slashes into single unix slashes
    668                 $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;
    669                 # make windows \r newlines into constant \n newlines. Already handled when \r got replaced
    670                 #$$win_contents =~ s@\r\n@\n@mg; # #http://stackoverflow.com/questions/650743/in-perl-how-to-do-you-remove-m-from-a-file
    671                
    672                 #FOR MAC: old macs use CR carriage return (see http://www.perlmonks.org/?node_id=745018), so replace with \n?)
    673                 # $$win_contents =~ s@\r@\n@mg;
    674                
     651
     652
     653        # equalise/normalise the two doc.xml/docmets.xml files for OS differences, if there are any
     654        # before comparing a windows test with a linux model or vice-versa
     655        my $testIsWin = &isDocOrMETSXMLFileWindows($test_contents);
     656        my $modelIsWin = &isDocOrMETSXMLFileWindows($model_contents);
     657       
     658        if($testIsWin != $modelIsWin) { # one of the 2 collections is built on windows, the other on linux, so need to make newlines constant
     659       
     660            my $win_contents = $testIsWin ? \$test_contents : \$model_contents;
     661            my $lin_contents = $testIsWin ? \$model_contents : \$test_contents;
     662           
     663            # remove all carriage returns \r - introduced into doc.xml by multiread after pdf converted to html
     664            $$win_contents =~ s@[\r]@@g;           
     665       
     666            # make all single windows slashes into single unix slashes
     667            # the 1 char look-ahead requires a double pass, otherwise import\3\3.pdf will get replaced with import/3\3.pdf
     668            $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;
     669            $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;             
     670           
     671            # make windows \r newlines into constant \n newlines. Already handled when \r got replaced
     672            #$$win_contents =~ s@\r\n@\n@mg; # #http://stackoverflow.com/questions/650743/in-perl-how-to-do-you-remove-m-from-a-file
     673           
     674            #FOR MAC: old macs use CR carriage return (see http://www.perlmonks.org/?node_id=745018), so replace with \n?)
     675            # $$win_contents =~ s@\r@\n@mg;
     676           
     677            if($strModel =~ m/doc\.xml$/) { # processing particular to doc.xml
    675678                # remove solitary, stray carriage returns \r in the linux doc.xml, as occurs in the tudor collection owing to the source material
    676679                # containing solitary carriage returns instead of linefeed
     
    682685                # Doing so is okay, since we're not modifying the doc.xml in the model or test collections, just normalising them in-memory for comparison
    683686                $$lin_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;
     687                $$lin_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;
    684688               
    685689                # Advanced Beatles collection,
     
    687691                # while windows contains: IMG SRC=_httpextlink_&amp;amp;rl=1&amp;amp;href=http://\\&quot;http://www.boskowan.com/
    688692                # Normalising to windows version for doing a diff
    689                 $$lin_contents =~ s@href=http:///@href=http://@g;
    690             }
    691            
    692        
    693         # tmp dirs have subdirs with random numbers in name, remove randomly named subdir portion of path
    694         # these tmpdirs are located inside the collection directory
    695         $model_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
    696         $test_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
    697        
    698         # remove all absolute paths upto collect folder from <Metadata /> elements
    699         $model_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;
    700         $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;       
    701        
    702         # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox
    703         # These tmpdirs are located inside the toplevel *greenstone* directory
    704         (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;
    705         $gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'};
    706         my $tmpfile_regex = "<Metadata name=\"URL\">http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})</Metadata>"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long 
    707        
    708         if($test_contents =~ m@$tmpfile_regex@) {           
    709             # found a match, replace the tmp file name with "random", keeping the original file extension
    710             # in <Metadata name="OrigSource|URL|UTF8URL|gsdlconvertedfilename">
    711        
    712             my ($old_tmp_filename, $ext) = ($1, $2);           
    713             my $new_tmp_filename = "random";           
    714            
    715             ## The following does not work in the Multimedia collection, since there's a subfolder to tmp (the timestamp folder) which contains the output file.
    716             #$tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?$old_tmp_filename($ext</Metadata>)";
    717             $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?.*?($ext</Metadata>)";
    718             if($5) {
    719                 $test_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
    720             } else { # OrigSource contains only the filename
    721                 $test_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
    722             }
    723            
    724             # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename           
    725             $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)(.*)?(/tmp/)?.*?($ext</Metadata>)";
    726             if($5) {
    727                 $model_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
    728             } else { # OrigSource contains only the filename
    729                 $model_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
    730             }
    731         }
    732        
    733 #       my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory       
    734 #       if($strModel =~ m/(HASH010d.dir)/) { # list the HASH dirs for which you want the doc.xml file generated
     693                $$lin_contents =~ s@href=http:///@href=http://@g;               
     694            }   
     695        }
     696       
     697        # processing particular to doc.xml 
     698        if($strModel =~ m/doc\.xml$/) {
     699            # tmp dirs have subdirs with random numbers in name, remove randomly named subdir portion of path
     700            # these tmpdirs are located inside the collection directory
     701            $model_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
     702            $test_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
     703           
     704            # remove all absolute paths upto collect folder from <Metadata /> elements
     705            $model_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;
     706            $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;       
     707           
     708            # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox
     709            # These tmpdirs are located inside the toplevel *greenstone* directory
     710            (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;
     711            $gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'};
     712            my $tmpfile_regex = "<Metadata name=\"URL\">http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})</Metadata>"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long 
     713           
     714            if($test_contents =~ m@$tmpfile_regex@) {           
     715                # found a match, replace the tmp file name with "random", keeping the original file extension
     716                # in <Metadata name="OrigSource|URL|UTF8URL|gsdlconvertedfilename">
     717           
     718                my ($old_tmp_filename, $ext) = ($1, $2);           
     719                my $new_tmp_filename = "random";           
     720               
     721                ## The following does not work in the Multimedia collection, since there's a subfolder to tmp (the timestamp folder) which contains the output file.
     722                #$tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?$old_tmp_filename($ext</Metadata>)";
     723                $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?.*?($ext</Metadata>)";
     724                if($5) {
     725                    $test_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
     726                } else { # OrigSource contains only the filename
     727                    $test_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
     728                }
     729               
     730                # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename           
     731                $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)(.*)?(/tmp/)?.*?($ext</Metadata>)";
     732                if($5) {
     733                    $model_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
     734                } else { # OrigSource contains only the filename
     735                    $model_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
     736                }
     737            }
     738
     739        } # finished special processing of doc.xml files
     740       
     741        my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory       
     742#       &gdbdiff::print_string_to_file($model_contents, $savepath."model_docmets.xml");
     743#       &gdbdiff::print_string_to_file($test_contents, $savepath."test_docmets.xml");
     744#       if($strModel =~ m/(HASH0164.dir)/) { # list the HASH dirs for which you want the doc.xml file generated
    735745#       &gdbdiff::print_string_to_file($model_contents, $savepath."$1_model_doc.xml");
    736746#       &gdbdiff::print_string_to_file($test_contents, $savepath."$1_test_doc.xml");
    737747#       }
    738748       
    739         } # finished special processing of doc.xml files
     749
    740750       
    741751        # now can diff the normalised versions of the doc.xml/docmets.xml files:
  • other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm

    r28071 r28086  
    6666    # The total_numbytes field can vary depending on how many backslashes exist in the urls in the main body text, as each
    6767    # of these windows slashes get escaped with another backslash, and the resulting string is used as key into rel link db
    68     my $ignore_line_re = "\n<(FileSize|lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|total_numbytes)>([^\n])*";
     68    my $ignore_line_re = "\n<(FileSize|lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|total_numbytes|ex.Composite.LightValue)>([^\n])*";
    6969    $model_text =~ s/$ignore_line_re//g;
    7070    $test_text =~ s/$ignore_line_re//g;
     
    8181    my $modelIsWin = &isDBWindowsSensitive($dbname, $model_text);
    8282   
    83     if($testIsWin == $modelIsWin) { # both linux or both windows, do the basic test we did on linux machines:
     83    if($testIsWin == $modelIsWin) {     
     84    # both linux or both windows, do the basic test we did on linux machines:
    8485            # ignore absolute path prefixes in modelcol and testcol (necessary for archiveinf-doc and -src.gdb files)
    8586
     
    120121               
    121122                # assoc-file and meta-file contain filepaths, ensure these are long windows file paths now (will later convert to linux slashes)   
    122                 if($line =~ m@^<(assoc-file|meta-file)>(.*)(\s+)@s) { 
     123                if($line =~ m@^<(assoc-file|meta-file)>(.*)(\s+)@s) {
    123124                    $line = $2; # may be a short file name
    124125                    # perhaps test here if it is a shortfilename? should match /CAPS....~number(.ext)/
     
    135136        if($dbname =~ m/$strColName/) {
    136137            my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
    137             for my $line (split /^/, $$win_text) { # split the string into newlines
    138            
     138            for my $line (split /^/, $$win_text) { # split the string into newlines         
     139
    139140                # In the following regex, add any .gdb fieldnames that represent a path and so would contain double backslashes
    140141                # on Windows (to escape the single backlash of win filepaths). They will be turned into single-backslashes here,
     
    143144                # E.g. On windows, the Word-PDF collection(s) contains double backslashes in the ex.File.Directory field
    144145                # the MARC-Exploded collection contains double backslashes in the null_file entry field of the .gdb file               
    145                 if($line =~ m@^<(ex.File.Directory|null_file)>(.*)@s) {                 
     146                if($line =~ m@^<(ex.File.Directory|null_file)>(.*)@s) {
    146147                    my ($fieldname, $escaped_path) = ($1, $2);
    147148                    $escaped_path =~ s@\\\\@/@g; #(my $escaped_path = $2) =~ s@\\\\@\\@g;
     
    282283            return 1;
    283284        }
     285        elsif ($db_contents =~ m@^(<ex.File.Directory>[a-zA-Z]:\\\\)@m) { # <ex.File.Directory>C:\\path\\path for OAI collection
     286            return 1;
     287        }
    284288        return 0;       
    285289    }   
Note: See TracChangeset for help on using the changeset viewer.