Changeset 27743

Show
Ignore:
Timestamp:
03.07.2013 21:37:18 (6 years ago)
Author:
ak19
Message:

The Basic Word-PDF collection now produces the same number of diffing errors under diffcol on Windows as on Linux and Mac. This required a lot of special processing for Windows: carriage returns have to be removed from doc.xml, where they are introduced by a multiread on the HTML version of a PDF document after it has been converted to HTML. (Similarly, the Windows carriage returns introduced into the ex.Title metadata for pdf01.pdf after conversion to HTML had to be removed; this was handled in HTMLPlugin.) Further special tags need either to be ignored, if they are timestamps, or to be specially handled, if they are file paths. I am not sure whether it is the encoding setting in multiread or the locale that introduces the carriage returns, but I am dealing with this at the point of diffcol, since it is not a 'problem' in Greenstone, just an inconsistency across operating systems. There is still one diffcol error remaining for this collection on all 3 operating systems: one Word document has a different word-wrap length on the machine where the model collection was built compared to the wrap length on the other machines. This may be a setting of wvware, or else of LibreOffice/StarOffice, if these are used.

Location:
other-projects/nightly-tasks/diffcol/trunk
Files:
3 modified

Legend:

Unmodified
Added
Removed
  • other-projects/nightly-tasks/diffcol/trunk/diffcol/diffcol.pl

    r27730 r27743  
    591591                my $strNewTest = &FileUtils::filenameConcatenate($strTest,$strEachFile); 
    592592                # now additionally ignoring the earliestDatestamp file and the index/idx/*.idh binary file when diffing file  
    593                 if(!($strEachFile eq "log" || $strEachFile eq "earliestDatestamp" || $strEachFile =~ m/\.cfg$/g || $strEachFile =~ m/\.((g|j|l|b)db|idh|i.*|wa|td|tsd|ti|t|tl|w|jpe?g|gif|png)$/g)) 
     593                if(!($strEachFile eq "log" || $strEachFile eq "earliestDatestamp" || $strEachFile =~ m/\.cfg$/g || $strEachFile =~ m/\.((g|j|l|b)db|idh|i.*|wa|td|tsd|ti|t|tl|w|jpe?g|gif|png|wmf)$/g)) # wmf = windows meta file 
    594594                { 
    595595                    push(@Errors,TestEach($strNewModel,$strNewTest,$intLevel,$strColName)); 
     
    611611                my $strNewModel = &FileUtils::filenameConcatenate($strModel,$strEachFile); 
    612612                my $strNewTest = &FileUtils::filenameConcatenate($strTest,$strEachFile); 
    613                 if(!($strEachFile eq "log" || $strEachFile eq "earliestDatestamp" || $strEachFile =~ m/\.cfg$/g || $strEachFile =~ m/\.((g|j|l|b)db|idh|i.*|wa|td|tsd|ti|t|tl|w|jpe?g|gif|png)$/g)) 
     613                if(!($strEachFile eq "log" || $strEachFile eq "earliestDatestamp" || $strEachFile =~ m/\.cfg$/g || $strEachFile =~ m/\.((g|j|l|b)db|idh|i.*|wa|td|tsd|ti|t|tl|w|jpe?g|gif|png|wmf)$/g)) 
    614614                { 
    615615                    push(@Errors,TestEach($strNewModel,$strNewTest,$intLevel,$strColName)); 
     
    645645        $test_contents =~ s/$ignore_line_re//g; 
    646646 
     647 
     648            # equalise/normalise the two doc.xml files for OS differences, if there are any 
     649            my $testIsWin = &isDocXMLFileWindows($test_contents); 
     650            my $modelIsWin = &isDocXMLFileWindows($model_contents); 
     651             
     652            if($testIsWin != $modelIsWin) { # one of the 2 collections is built on windows, the other on linux, so need to make newlines constant 
     653             
     654                my $win_contents = $testIsWin ? \$test_contents : \$model_contents; 
     655             
     656                # remove all carriage returns \r - introduced into doc.xml by multiread after pdf converted to html 
     657                $$win_contents =~ s@[\r]@@g; 
     658             
     659                # make all single windows slashes into single unix slashes 
     660                $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g; 
     661                # make windows \r newlines into constant \n newlines. Already handled when \r got replaced 
     662                #$$win_contents =~ s@\r\n@\n@mg; # #http://stackoverflow.com/questions/650743/in-perl-how-to-do-you-remove-m-from-a-file 
     663                 
     664                #FOR MAC: old macs use CR carriage return (see http://www.perlmonks.org/?node_id=745018), so replace with \n?) 
     665                # $$win_contents =~ s@\r@\n@mg; 
     666            } 
     667             
     668         
     669 
    647670        # tmp dirs have subdirs with random numbers in name, remove randomly named subdir portion of path 
    648671        $model_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;  
     
    651674        # remove all absolute paths upto collect folder from <Metadata /> elements 
    652675        $model_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g; 
    653         $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g; 
    654  
    655  
    656             # equalise/normalise the two doc.xml files for OS differences, if there are any 
    657             my $testIsWin = &isDocXMLFileWindows($test_contents); 
    658             my $modelIsWin = &isDocXMLFileWindows($model_contents); 
    659              
    660             if($testIsWin != $modelIsWin) { # one of the 2 collections is built on windows, the other on linux, so need to make newlines constant 
    661              
    662                 my $win_contents = $testIsWin ? \$test_contents : \$model_contents; 
    663              
    664                 # make all windows slashes into unix slashes 
    665                 $$win_contents =~ s@\\@\/@g; 
    666                 # make windows \r newlines into constant \n newlines         
    667                 $$win_contents =~ s@\r\n@\n@mg; # #http://stackoverflow.com/questions/650743/in-perl-how-to-do-you-remove-m-from-a-file 
    668                  
    669                 #FOR MAC: old macs use CR carriage return (see http://www.perlmonks.org/?node_id=745018), so replace with \n?) 
    670                 # $$win_contents =~ s@\r@\n@mg; 
    671             } 
     676        $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;     
     677         
     678#       my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory        
     679#       &gdbdiff::print_string_to_file($model_contents, $savepath."model_doc.xml"); 
     680#       &gdbdiff::print_string_to_file($test_contents, $savepath."test_doc.xml"); 
    672681         
    673682        $strResult = diff \$model_contents, \$test_contents, { STYLE => "OldStyle" }; 
  • other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm

    r27730 r27743  
    5959    my $test_text = readin_gdb($test_cmd); 
    6060 
    61 #   my $savepath = &getcwd."/../"; # TASK_HOME env does not exist at this stage, but it's one level up from current directory 
     61#   my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory 
    6262#   print_string_to_file($test_text, $savepath.$dbname."_test.out");     
    6363#   print_string_to_file($model_text, $savepath.$dbname."_model.out"); 
     
    9292            # Better regex is of the form /BEGIN((?:(?!BEGIN).)*)END/, see http://docstore.mik.ua/orelly/perl/cookbook/ch06_16.htm 
    9393 
    94             $model_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;           
    95             $test_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;                        
     94            $model_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg; 
     95            $test_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg; 
     96            #$model_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;          
     97            #$test_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;                       
    9698    } 
    9799     
     
    116118            for my $line (split /^/, $$win_text) { # split the string into newlines 
    117119                 
    118                 if($line =~ m@^<assoc-file>(.*)(\s+)@s) { 
    119                     $line = $1; # may be a short file name 
     120                # assoc-file and meta-file contain filepaths, ensure these are long windows file paths now (will later convert to linux slashes)     
     121                if($line =~ m@^<(assoc-file|meta-file)>(.*)(\s+)@s) {  
     122                    $line = $2; # may be a short file name 
    120123                    # perhaps test here if it is a shortfilename? should match /CAPS....~number(.ext)/ 
    121124                 
    122                     $line = "<assoc-file>".&Win32::GetLongPathName($line)."$2"; # make it a long file name and prefix assoc-file to it again                     
     125                    $line = "<$1>".&Win32::GetLongPathName($line)."$3"; # make it a long file name and prefix assoc-file/meta-file tagname to it again                   
    123126                } 
    124127                $tmp .= $line; 
     
    127130        } 
    128131         
    129         # slashes in windows text need to be turned into linux style slashes 
    130         $$win_text =~ s@\\@/@g; 
     132         
     133        # index gdb file 
     134        if($dbname =~ m/$strColName/) { 
     135            my $tmp = ""; # rebuild windows file's set of lines after processing them one by one 
     136            for my $line (split /^/, $$win_text) { # split the string into newlines 
     137                 
     138                if($line =~ m@^<ex.File.Directory>(.*)@s) { # word-pdf collection contains double windows backslashes 
     139                    (my $escaped_path = $1) =~ s@\\\\@\\@g;              
     140                    $line = "<ex.File.Directory>$escaped_path"; 
     141                }  
     142                elsif($line =~ m@^<Title>(.*)@s) { 
     143#                   print STDERR "***** TITLE: |$1|\n"; 
     144                 
     145                    # word-pdf collection: Title of ps files contain new lines at end when 
     146                    # GreenstoneXMLPlugin::xml_end_tag() writes the Title back out after utf8 decode 
     147                    # if($metadata_name eq "Title") { $metadata_value =~ s/[\n\r]*$//; } 
     148                 
     149                    (my $title = $1) =~ s@(\r|\n|\\n)*$@@; # get rid of trailing newlines/carriage returns 
     150                    $line = "<Title>$title\n"; # add single newline                  
     151                } 
     152                $tmp .= $line; 
     153            } 
     154            $$win_text = $tmp;           
     155        } 
     156         
     157         
     158        # slashes in windows metadata text need to be turned into linux style slashes 
     159        $$win_text =~ s@\\@/@g; #$$win_text =~ s@\\([^n|r|\|"])@/$1@g; # filepath something\rtf remains something\rtf 
    131160         
    132161        # cut down absolute paths to files to just collect/colname/.../file, same as before 
    133         $$lin_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg; 
    134         $$win_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg; 
     162        $$lin_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg; # $$lin_text =~ s@^([^\\\/]*(//)?).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg; 
     163        $$win_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;        
    135164         
    136165        # for the windows text, need to further get rid of the driveletter after [ or <meta> 
     
    166195    my ($dbtailname, $db_contents) = @_; # db filename without suffix 
    167196     
    168     if($dbtailname !~ m/archiveinf/) { # only archiveinf-doc and archive-inf source need special Windows processing, not col.gdb 
    169         return 0; 
    170     } 
     197    #if($dbtailname !~ m/archiveinf/) { # only archiveinf-doc and archive-inf source need special Windows processing, not col.gdb 
     198    #   return 0; 
     199    #} 
    171200    return ($db_contents =~ m/\\/) ? 1 : 0; # windows slashes detected. Better test would be: [Something\something] OR <tag>something\something 
    172201    # for doc.xml: 
  • other-projects/nightly-tasks/diffcol/trunk/task.pl

    r27725 r27743  
    417417    for my $collection (readdir $collect_handle) { 
    418418    next if ($collection eq "." || $collection eq ".."); 
    419     ##next if ($collection ne "Small-HTML"); ## TEMPORARY, FOR TESTING THIS SCRIPT 
     419#   next if ($collection ne "Word-PDF-Basic"); ## TEMPORARY, FOR TESTING THIS SCRIPT 
    420420 
    421421    #escape the filename (in case of space)