Changeset 27743 for other-projects


Ignore:
Timestamp:
07/03/13 21:37:18 (8 years ago)
Author:
ak19
Message:

Basic Word-PDF collection now has the same number of diffing errors on Windows upon diffcol as on Linux and Mac. Needed to do a lot of special processing for windows: to remove carriage returns introduced into doc.xml when doing a multiread on the html version of a pdf doc after it has been converted to html. (And similarly, needed to get rid of windows carriage returns introduced into ex.Title meta for pdf01.pdf converted to HTML. This was handled in HTMLPlugin). Further special tags need either to be ignored, if they're time stamps, or specially handled if they're filepaths. Not sure if it's the encoding setting in multiread or maybe the locale that is introducing the carriage returns, but am dealing with this at the point of diffcol since it's not a 'problem' in Greenstone, just an inconsistency across OS-es. There's still one diffcol error remaining for this collection on all 3 OS: one word document has a different word wrap length on the machine where the model col was built compared to the wrap length on the other machines. This may be a setting to wvware or else libreoffice/staroffice, if these are used.

Location:
other-projects/nightly-tasks/diffcol/trunk
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • other-projects/nightly-tasks/diffcol/trunk/diffcol/diffcol.pl

    r27730 r27743  
    591591                my $strNewTest = &FileUtils::filenameConcatenate($strTest,$strEachFile);
    592592                # now additionally ignoring the earliestDatestamp file and the index/idx/*.idh binary file when diffing file
    593                 if(!($strEachFile eq "log" || $strEachFile eq "earliestDatestamp" || $strEachFile =~ m/\.cfg$/g || $strEachFile =~ m/\.((g|j|l|b)db|idh|i.*|wa|td|tsd|ti|t|tl|w|jpe?g|gif|png)$/g))
     593                if(!($strEachFile eq "log" || $strEachFile eq "earliestDatestamp" || $strEachFile =~ m/\.cfg$/g || $strEachFile =~ m/\.((g|j|l|b)db|idh|i.*|wa|td|tsd|ti|t|tl|w|jpe?g|gif|png|wmf)$/g)) # wmf = windows meta file
    594594                {
    595595                    push(@Errors,TestEach($strNewModel,$strNewTest,$intLevel,$strColName));
     
    611611                my $strNewModel = &FileUtils::filenameConcatenate($strModel,$strEachFile);
    612612                my $strNewTest = &FileUtils::filenameConcatenate($strTest,$strEachFile);
    613                 if(!($strEachFile eq "log" || $strEachFile eq "earliestDatestamp" || $strEachFile =~ m/\.cfg$/g || $strEachFile =~ m/\.((g|j|l|b)db|idh|i.*|wa|td|tsd|ti|t|tl|w|jpe?g|gif|png)$/g))
     613                if(!($strEachFile eq "log" || $strEachFile eq "earliestDatestamp" || $strEachFile =~ m/\.cfg$/g || $strEachFile =~ m/\.((g|j|l|b)db|idh|i.*|wa|td|tsd|ti|t|tl|w|jpe?g|gif|png|wmf)$/g))
    614614                {
    615615                    push(@Errors,TestEach($strNewModel,$strNewTest,$intLevel,$strColName));
     
    645645        $test_contents =~ s/$ignore_line_re//g;
    646646
     647
     648            # equalise/normalise the two doc.xml files for OS differences, if there are any
     649            my $testIsWin = &isDocXMLFileWindows($test_contents);
     650            my $modelIsWin = &isDocXMLFileWindows($model_contents);
     651           
     652            if($testIsWin != $modelIsWin) { # one of the 2 collections is built on windows, the other on linux, so need to make newlines constant
     653           
     654                my $win_contents = $testIsWin ? \$test_contents : \$model_contents;
     655           
     656                # remove all carriage returns \r - introduced into doc.xml by multiread after pdf converted to html
     657                $$win_contents =~ s@[\r]@@g;
     658           
     659                # make all single windows slashes into single unix slashes
     660                $$win_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;
     661                # make windows \r newlines into constant \n newlines. Already handled when \r got replaced
     662                #$$win_contents =~ s@\r\n@\n@mg; # #http://stackoverflow.com/questions/650743/in-perl-how-to-do-you-remove-m-from-a-file
     663               
     664                #FOR MAC: old macs use CR carriage return (see http://www.perlmonks.org/?node_id=745018), so replace with \n?)
     665                # $$win_contents =~ s@\r@\n@mg;
     666            }
     667           
     668       
     669
    647670        # tmp dirs have subdirs with random numbers in name, remove randomly named subdir portion of path
    648671        $model_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
     
    651674        # remove all absolute paths upto collect folder from <Metadata /> elements
    652675        $model_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;
    653         $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;
    654 
    655 
    656             # equalise/normalise the two doc.xml files for OS differences, if there are any
    657             my $testIsWin = &isDocXMLFileWindows($test_contents);
    658             my $modelIsWin = &isDocXMLFileWindows($model_contents);
    659            
    660             if($testIsWin != $modelIsWin) { # one of the 2 collections is built on windows, the other on linux, so need to make newlines constant
    661            
    662                 my $win_contents = $testIsWin ? \$test_contents : \$model_contents;
    663            
    664                 # make all windows slashes into unix slashes
    665                 $$win_contents =~ s@\\@\/@g;
    666                 # make windows \r newlines into constant \n newlines       
    667                 $$win_contents =~ s@\r\n@\n@mg; # #http://stackoverflow.com/questions/650743/in-perl-how-to-do-you-remove-m-from-a-file
    668                
    669                 #FOR MAC: old macs use CR carriage return (see http://www.perlmonks.org/?node_id=745018), so replace with \n?)
    670                 # $$win_contents =~ s@\r@\n@mg;
    671             }
     676        $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;   
     677       
     678#       my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory       
     679#       &gdbdiff::print_string_to_file($model_contents, $savepath."model_doc.xml");
     680#       &gdbdiff::print_string_to_file($test_contents, $savepath."test_doc.xml");
    672681       
    673682        $strResult = diff \$model_contents, \$test_contents, { STYLE => "OldStyle" };
  • other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm

    r27730 r27743  
    5959    my $test_text = readin_gdb($test_cmd);
    6060
    61 #   my $savepath = &getcwd."/../"; # TASK_HOME env does not exist at this stage, but it's one level up from current directory
     61#   my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory
    6262#   print_string_to_file($test_text, $savepath.$dbname."_test.out");   
    6363#   print_string_to_file($model_text, $savepath.$dbname."_model.out");
     
    9292            # Better regex is of the form /BEGIN((?:(?!BEGIN).)*)END/, see http://docstore.mik.ua/orelly/perl/cookbook/ch06_16.htm
    9393
    94             $model_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;         
    95             $test_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;                       
     94            $model_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
     95            $test_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
     96            #$model_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;         
     97            #$test_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;                     
    9698    }
    9799   
     
    116118            for my $line (split /^/, $$win_text) { # split the string into newlines
    117119               
    118                 if($line =~ m@^<assoc-file>(.*)(\s+)@s) {
    119                     $line = $1; # may be a short file name
     120                # assoc-file and meta-file contain filepaths, ensure these are long windows file paths now (will later convert to linux slashes)   
     121                if($line =~ m@^<(assoc-file|meta-file)>(.*)(\s+)@s) {
     122                    $line = $2; # may be a short file name
    120123                    # perhaps test here if it is a shortfilename? should match /CAPS....~number(.ext)/
    121124               
    122                     $line = "<assoc-file>".&Win32::GetLongPathName($line)."$2"; # make it a long file name and prefix assoc-file to it again                   
     125                    $line = "<$1>".&Win32::GetLongPathName($line)."$3"; # make it a long file name and prefix assoc-file/meta-file tagname to it again                 
    123126                }
    124127                $tmp .= $line;
     
    127130        }
    128131       
    129         # slashes in windows text need to be turned into linux style slashes
    130         $$win_text =~ s@\\@/@g;
     132       
     133        # index gdb file
     134        if($dbname =~ m/$strColName/) {
     135            my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
     136            for my $line (split /^/, $$win_text) { # split the string into newlines
     137               
     138                if($line =~ m@^<ex.File.Directory>(.*)@s) { # word-pdf collection contains double windows backslashes
     139                    (my $escaped_path = $1) =~ s@\\\\@\\@g;             
     140                    $line = "<ex.File.Directory>$escaped_path";
     141                }
     142                elsif($line =~ m@^<Title>(.*)@s) {
     143#                   print STDERR "***** TITLE: |$1|\n";
     144               
     145                    # word-pdf collection: Title of ps files contain new lines at end when
     146                    # GreenstoneXMLPlugin::xml_end_tag() writes the Title back out after utf8 decode
     147                    # if($metadata_name eq "Title") { $metadata_value =~ s/[\n\r]*$//; }
     148               
     149                    (my $title = $1) =~ s@(\r|\n|\\n)*$@@; # get rid of trailing newlines/carriage returns
     150                    $line = "<Title>$title\n"; # add single newline                 
     151                }
     152                $tmp .= $line;
     153            }
     154            $$win_text = $tmp;         
     155        }
     156       
     157       
     158        # slashes in windows metadata text need to be turned into linux style slashes
     159        $$win_text =~ s@\\@/@g; #$$win_text =~ s@\\([^n|r|\|"])@/$1@g; # filepath something\rtf remains something\rtf
    131160       
    132161        # cut down absolute paths to files to just collect/colname/.../file, same as before
    133         $$lin_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;
    134         $$win_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;
     162        $$lin_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg; # $$lin_text =~ s@^([^\\\/]*(//)?).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
     163        $$win_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;       
    135164       
    136165        # for the windows text, need to further get rid of the driveletter after [ or <meta>
     
    166195    my ($dbtailname, $db_contents) = @_; # db filename without suffix
    167196   
    168     if($dbtailname !~ m/archiveinf/) { # only archiveinf-doc and archive-inf source need special Windows processing, not col.gdb
    169         return 0;
    170     }
     197    #if($dbtailname !~ m/archiveinf/) { # only archiveinf-doc and archive-inf source need special Windows processing, not col.gdb
     198    #   return 0;
     199    #}
    171200    return ($db_contents =~ m/\\/) ? 1 : 0; # windows slashes detected. Better test would be: [Something\something] OR <tag>something\something
    172201    # for doc.xml:
  • other-projects/nightly-tasks/diffcol/trunk/task.pl

    r27725 r27743  
    417417    for my $collection (readdir $collect_handle) {
    418418    next if ($collection eq "." || $collection eq "..");
    419     ##next if ($collection ne "Small-HTML"); ## TEMPORARY, FOR TESTING THIS SCRIPT
     419#   next if ($collection ne "Word-PDF-Basic"); ## TEMPORARY, FOR TESTING THIS SCRIPT
    420420
    421421    #escape the filename (in case of space)
Note: See TracChangeset for help on using the changeset viewer.