Ignore:
Timestamp:
2013-07-03T21:37:18+12:00 (11 years ago)
Author:
ak19
Message:

The Basic Word-PDF collection now produces the same number of diff errors under diffcol on Windows as on Linux and Mac. This needed a lot of special processing for Windows: removing the carriage returns introduced into doc.xml when a multiread is done on the HTML version of a PDF document after it has been converted to HTML. (Similarly, the Windows carriage returns introduced into the ex.Title metadata for pdf01.pdf converted to HTML had to be removed; this was handled in HTMLPlugin.) Further special tags need either to be ignored, if they are timestamps, or specially handled, if they are file paths. It is not clear whether the encoding setting in multiread or the locale is introducing the carriage returns, but this is dealt with at the point of diffcol since it is not a 'problem' in Greenstone, just an inconsistency across operating systems. One diffcol error still remains for this collection on all three operating systems: one Word document has a different word-wrap length on the machine where the model collection was built compared to the wrap length on the other machines. This may be a setting of wvware, or else of LibreOffice/StarOffice, if these are used.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/nightly-tasks/diffcol/trunk/diffcol/diffcol.pl

    r27730 r27743  
    591591                my $strNewTest = &FileUtils::filenameConcatenate($strTest,$strEachFile);
    592592                # now additionally ignoring the earliestDatestamp file and the index/idx/*.idh binary file when diffing file
    593                 if(!($strEachFile eq "log" || $strEachFile eq "earliestDatestamp" || $strEachFile =~ m/\.cfg$/g || $strEachFile =~ m/\.((g|j|l|b)db|idh|i.*|wa|td|tsd|ti|t|tl|w|jpe?g|gif|png)$/g))
     593                if(!($strEachFile eq "log" || $strEachFile eq "earliestDatestamp" || $strEachFile =~ m/\.cfg$/g || $strEachFile =~ m/\.((g|j|l|b)db|idh|i.*|wa|td|tsd|ti|t|tl|w|jpe?g|gif|png|wmf)$/g)) # wmf = windows meta file
    594594                {
    595595                    push(@Errors,TestEach($strNewModel,$strNewTest,$intLevel,$strColName));
     
    611611                my $strNewModel = &FileUtils::filenameConcatenate($strModel,$strEachFile);
    612612                my $strNewTest = &FileUtils::filenameConcatenate($strTest,$strEachFile);
    613                 if(!($strEachFile eq "log" || $strEachFile eq "earliestDatestamp" || $strEachFile =~ m/\.cfg$/g || $strEachFile =~ m/\.((g|j|l|b)db|idh|i.*|wa|td|tsd|ti|t|tl|w|jpe?g|gif|png)$/g))
     613                if(!($strEachFile eq "log" || $strEachFile eq "earliestDatestamp" || $strEachFile =~ m/\.cfg$/g || $strEachFile =~ m/\.((g|j|l|b)db|idh|i.*|wa|td|tsd|ti|t|tl|w|jpe?g|gif|png|wmf)$/g))
    614614                {
    615615                    push(@Errors,TestEach($strNewModel,$strNewTest,$intLevel,$strColName));
     
    645645        $test_contents =~ s/$ignore_line_re//g;
    646646
     647
     648            # equalise/normalise the two doc.xml files for OS differences, if there are any
     649            my $testIsWin = &isDocXMLFileWindows($test_contents);
     650            my $modelIsWin = &isDocXMLFileWindows($model_contents);
     651           
     652            if($testIsWin != $modelIsWin) { # one of the 2 collections is built on windows, the other on linux, so need to make newlines constant
     653           
     654                my $win_contents = $testIsWin ? \$test_contents : \$model_contents;
     655           
     656                # remove all carriage returns \r - introduced into doc.xml by multiread after pdf converted to html
     657                $$win_contents =~ s@[\r]@@g;
     658           
     659                # make all single windows slashes into single unix slashes
     660                $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;
     661                # make windows \r newlines into constant \n newlines. Already handled when \r got replaced
     662                #$$win_contents =~ s@\r\n@\n@mg; # #http://stackoverflow.com/questions/650743/in-perl-how-to-do-you-remove-m-from-a-file
     663               
     664                #FOR MAC: old macs use CR carriage return (see http://www.perlmonks.org/?node_id=745018), so replace with \n?)
     665                # $$win_contents =~ s@\r@\n@mg;
     666            }
     667           
     668       
     669
    647670        # tmp dirs have subdirs with random numbers in name, remove randomly named subdir portion of path
    648671        $model_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
     
    651674        # remove all absolute paths upto collect folder from <Metadata /> elements
    652675        $model_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;
    653         $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;
    654 
    655 
    656             # equalise/normalise the two doc.xml files for OS differences, if there are any
    657             my $testIsWin = &isDocXMLFileWindows($test_contents);
    658             my $modelIsWin = &isDocXMLFileWindows($model_contents);
    659            
    660             if($testIsWin != $modelIsWin) { # one of the 2 collections is built on windows, the other on linux, so need to make newlines constant
    661            
    662                 my $win_contents = $testIsWin ? \$test_contents : \$model_contents;
    663            
    664                 # make all windows slashes into unix slashes
    665                 $$win_contents =~ s@\\@\/@g;
    666                 # make windows \r newlines into constant \n newlines       
    667                 $$win_contents =~ s@\r\n@\n@mg; # #http://stackoverflow.com/questions/650743/in-perl-how-to-do-you-remove-m-from-a-file
    668                
    669                 #FOR MAC: old macs use CR carriage return (see http://www.perlmonks.org/?node_id=745018), so replace with \n?)
    670                 # $$win_contents =~ s@\r@\n@mg;
    671             }
     676        $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;   
     677       
     678#       my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory       
     679#       &gdbdiff::print_string_to_file($model_contents, $savepath."model_doc.xml");
     680#       &gdbdiff::print_string_to_file($test_contents, $savepath."test_doc.xml");
    672681       
    673682        $strResult = diff \$model_contents, \$test_contents, { STYLE => "OldStyle" };
Note: See TracChangeset for help on using the changeset viewer.