Ignore:
Timestamp:
2013-08-19T20:42:40+12:00 (11 years ago)
Author:
ak19
Message:

Bringing the Windows diffcol up to date for the latest tutorials. There is a new metadata field (ex.Composite.LightValue) whose value can be slightly different between builds, so it is now ignored when diffing. Need to test for Windows gdb differently when the input collection is OAI. Backslashes in docmets.xml files need to be normalised.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/nightly-tasks/diffcol/trunk/diffcol/diffcol.pl

    r28078 r28086  
    542542# so far, only doc.xml files need special Windows processing (db files' OS-sensitivity are handled in gdbdiff.pm)
    543543# Returns true if the doc.xml contains windows style slashes in the gsdlsourcefilename meta field
    544 sub isDocXMLFileWindows
     544sub isDocOrMETSXMLFileWindows
    545545{
    546546    my ($file_contents) = @_;
     
    553553    # for doc.xml:
    554554    #     <Metadata name="gsdlsourcefilename">import/html_files/cleves.html</Metadata>
    555     if($file_contents =~ m@<Metadata name="gsdlsourcefilename">([^>]*)</Metadata>@m) {
    556         $gsdlsourcefilename = $1;
     555    if($file_contents =~ m@<(.*?:)?Metadata name="gsdlsourcefilename">([^>]*)</(.*?:)?Metadata>@m) {
     556        $gsdlsourcefilename = $2;
    557557        if($gsdlsourcefilename =~ m/\\/) { # windows slashes detected.
    558558            return 1;
     
    627627    {
    628628        # allow for a namespace prefix to <Metadata> as happens in GreenstoneMETS docmets.xml files, e.g. <gsdl3:Metadata></gsdl3:Metadata>
    629         my $ignore_line_re = "<(.*?:)?Metadata name=\"(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|ImageSize|FileSize)\">.*</(.*?:)?Metadata>\\s*\\n*";
     629        my $ignore_line_re = "<(.*?:)?Metadata name=\"(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|ImageSize|FileSize|ex.Composite.LightValue)\">.*</(.*?:)?Metadata>\\s*\\n*";
    630630       
    631631        my $strResult;
     
    649649        $model_contents =~ s/$ignore_line_re//g;
    650650        $test_contents =~ s/$ignore_line_re//g;
    651        
    652 
    653         # doc.xml needs to additionally be normalised, before comparing a windows test with a linux model or vice-versa
    654         if($strModel =~ m/doc\.xml$/) {
    655             # equalise/normalise the two doc.xml files for OS differences, if there are any
    656             my $testIsWin = &isDocXMLFileWindows($test_contents);
    657             my $modelIsWin = &isDocXMLFileWindows($model_contents);
    658            
    659             if($testIsWin != $modelIsWin) { # one of the 2 collections is built on windows, the other on linux, so need to make newlines constant
    660            
    661                 my $win_contents = $testIsWin ? \$test_contents : \$model_contents;
    662                 my $lin_contents = $testIsWin ? \$model_contents : \$test_contents;
    663                
    664                 # remove all carriage returns \r - introduced into doc.xml by multiread after pdf converted to html
    665                 $$win_contents =~ s@[\r]@@g;
    666            
    667                 # make all single windows slashes into single unix slashes
    668                 $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;
    669                 # make windows \r newlines into constant \n newlines. Already handled when \r got replaced
    670                 #$$win_contents =~ s@\r\n@\n@mg; # #http://stackoverflow.com/questions/650743/in-perl-how-to-do-you-remove-m-from-a-file
    671                
    672                 #FOR MAC: old macs use CR carriage return (see http://www.perlmonks.org/?node_id=745018), so replace with \n?)
    673                 # $$win_contents =~ s@\r@\n@mg;
    674                
     651
     652
     653        # equalise/normalise the two doc.xml/docmets.xml files for OS differences, if there are any
     654        # before comparing a windows test with a linux model or vice-versa
     655        my $testIsWin = &isDocOrMETSXMLFileWindows($test_contents);
     656        my $modelIsWin = &isDocOrMETSXMLFileWindows($model_contents);
     657       
     658        if($testIsWin != $modelIsWin) { # one of the 2 collections is built on windows, the other on linux, so need to make newlines constant
     659       
     660            my $win_contents = $testIsWin ? \$test_contents : \$model_contents;
     661            my $lin_contents = $testIsWin ? \$model_contents : \$test_contents;
     662           
     663            # remove all carriage returns \r - introduced into doc.xml by multiread after pdf converted to html
     664            $$win_contents =~ s@[\r]@@g;           
     665       
     666            # make all single windows slashes into single unix slashes
     667            # the 1 char look-ahead requires a double pass, otherwise import\3\3.pdf will get replaced with import/3\3.pdf
     668            $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;
     669            $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;             
     670           
     671            # make windows \r newlines into constant \n newlines. Already handled when \r got replaced
     672            #$$win_contents =~ s@\r\n@\n@mg; # #http://stackoverflow.com/questions/650743/in-perl-how-to-do-you-remove-m-from-a-file
     673           
     674            #FOR MAC: old macs use CR carriage return (see http://www.perlmonks.org/?node_id=745018), so replace with \n?)
     675            # $$win_contents =~ s@\r@\n@mg;
     676           
     677            if($strModel =~ m/doc\.xml$/) { # processing particular to doc.xml
    675678                # remove solitary, stray carriage returns \r in the linux doc.xml, as occurs in the tudor collection owing to the source material
    676679                # containing solitary carriage returns instead of linefeed
     
    682685                # Doing so is okay, since we're not modifying the doc.xml in the model or test collections, just normalising them in-memory for comparison
    683686                $$lin_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;
     687                $$lin_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;
    684688               
    685689                # Advanced Beatles collection,
     
    687691                # while windows contains: IMG SRC=_httpextlink_&amp;amp;rl=1&amp;amp;href=http://\\&quot;http://www.boskowan.com/
    688692                # Normalising to windows version for doing a diff
    689                 $$lin_contents =~ s@href=http:///@href=http://@g;
    690             }
    691            
    692        
    693         # tmp dirs have subdirs with random numbers in name, remove randomly named subdir portion of path
    694         # these tmpdirs are located inside the collection directory
    695         $model_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
    696         $test_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
    697        
    698         # remove all absolute paths upto collect folder from <Metadata /> elements
    699         $model_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;
    700         $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;       
    701        
    702         # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox
    703         # These tmpdirs are located inside the toplevel *greenstone* directory
    704         (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;
    705         $gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'};
    706         my $tmpfile_regex = "<Metadata name=\"URL\">http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})</Metadata>"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long 
    707        
    708         if($test_contents =~ m@$tmpfile_regex@) {           
    709             # found a match, replace the tmp file name with "random", keeping the original file extension
    710             # in <Metadata name="OrigSource|URL|UTF8URL|gsdlconvertedfilename">
    711        
    712             my ($old_tmp_filename, $ext) = ($1, $2);           
    713             my $new_tmp_filename = "random";           
    714            
    715             ## The following does not work in the Multimedia collection, since there's a subfolder to tmp (the timestamp folder) which contains the output file.
    716             #$tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?$old_tmp_filename($ext</Metadata>)";
    717             $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?.*?($ext</Metadata>)";
    718             if($5) {
    719                 $test_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
    720             } else { # OrigSource contains only the filename
    721                 $test_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
    722             }
    723            
    724             # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename           
    725             $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)(.*)?(/tmp/)?.*?($ext</Metadata>)";
    726             if($5) {
    727                 $model_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
    728             } else { # OrigSource contains only the filename
    729                 $model_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
    730             }
    731         }
    732        
    733 #       my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory       
    734 #       if($strModel =~ m/(HASH010d.dir)/) { # list the HASH dirs for which you want the doc.xml file generated
     693                $$lin_contents =~ s@href=http:///@href=http://@g;               
     694            }   
     695        }
     696       
     697        # processing particular to doc.xml 
     698        if($strModel =~ m/doc\.xml$/) {
     699            # tmp dirs have subdirs with random numbers in name, remove randomly named subdir portion of path
     700            # these tmpdirs are located inside the collection directory
     701            $model_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
     702            $test_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
     703           
     704            # remove all absolute paths upto collect folder from <Metadata /> elements
     705            $model_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;
     706            $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;       
     707           
     708            # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox
     709            # These tmpdirs are located inside the toplevel *greenstone* directory
     710            (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;
     711            $gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'};
     712            my $tmpfile_regex = "<Metadata name=\"URL\">http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})</Metadata>"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long 
     713           
     714            if($test_contents =~ m@$tmpfile_regex@) {           
     715                # found a match, replace the tmp file name with "random", keeping the original file extension
     716                # in <Metadata name="OrigSource|URL|UTF8URL|gsdlconvertedfilename">
     717           
     718                my ($old_tmp_filename, $ext) = ($1, $2);           
     719                my $new_tmp_filename = "random";           
     720               
     721                ## The following does not work in the Multimedia collection, since there's a subfolder to tmp (the timestamp folder) which contains the output file.
     722                #$tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?$old_tmp_filename($ext</Metadata>)";
     723                $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?.*?($ext</Metadata>)";
     724                if($5) {
     725                    $test_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
     726                } else { # OrigSource contains only the filename
     727                    $test_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
     728                }
     729               
     730                # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename           
     731                $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)(.*)?(/tmp/)?.*?($ext</Metadata>)";
     732                if($5) {
     733                    $model_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
     734                } else { # OrigSource contains only the filename
     735                    $model_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
     736                }
     737            }
     738
     739        } # finished special processing of doc.xml files
     740       
     741        my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory       
     742#       &gdbdiff::print_string_to_file($model_contents, $savepath."model_docmets.xml");
     743#       &gdbdiff::print_string_to_file($test_contents, $savepath."test_docmets.xml");
     744#       if($strModel =~ m/(HASH0164.dir)/) { # list the HASH dirs for which you want the doc.xml file generated
    735745#       &gdbdiff::print_string_to_file($model_contents, $savepath."$1_model_doc.xml");
    736746#       &gdbdiff::print_string_to_file($test_contents, $savepath."$1_test_doc.xml");
    737747#       }
    738748       
    739         } # finished special processing of doc.xml files
     749
    740750       
    741751        # now can diff the normalised versions of the doc.xml/docmets.xml files:
Note: See TracChangeset for help on using the changeset viewer.