Ignore:
Timestamp:
2013-07-03T21:37:18+12:00 (11 years ago)
Author:
ak19
Message:

The Basic Word-PDF collection now has the same number of diffing errors on Windows upon diffcol as on Linux and Mac. This needed a lot of special processing for Windows: removing the carriage returns introduced into doc.xml when doing a multiread on the HTML version of a PDF doc after it has been converted to HTML. (Similarly, the Windows carriage returns introduced into the ex.Title meta for pdf01.pdf converted to HTML needed to be removed; this was handled in HTMLPlugin.) Furthermore, some special tags need either to be ignored, if they're timestamps, or specially handled, if they're filepaths. Not sure whether it's the encoding setting in multiread or maybe the locale that is introducing the carriage returns, but am dealing with this at the point of diffcol since it's not a 'problem' in Greenstone, just an inconsistency across OSes. There's still one diffcol error remaining for this collection on all 3 OSes: one Word document has a different word-wrap length on the machine where the model col was built compared to the wrap length on the other machines. This may be a setting of wvWare or else of LibreOffice/StarOffice, if these are used.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm

    r27730 r27743  
    5959    my $test_text = readin_gdb($test_cmd);
    6060
    61 #   my $savepath = &getcwd."/../"; # TASK_HOME env does not exist at this stage, but it's one level up from current directory
     61#   my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory
    6262#   print_string_to_file($test_text, $savepath.$dbname."_test.out");   
    6363#   print_string_to_file($model_text, $savepath.$dbname."_model.out");
     
    9292            # Better regex is of the form /BEGIN((?:(?!BEGIN).)*)END/, see http://docstore.mik.ua/orelly/perl/cookbook/ch06_16.htm
    9393
    94             $model_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;         
    95             $test_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;                       
     94            $model_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
     95            $test_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
     96            #$model_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;         
     97            #$test_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;                     
    9698    }
    9799   
     
    116118            for my $line (split /^/, $$win_text) { # split the string into newlines
    117119               
    118                 if($line =~ m@^<assoc-file>(.*)(\s+)@s) {
    119                     $line = $1; # may be a short file name
     120                # assoc-file and meta-file contain filepaths, ensure these are long windows file paths now (will later convert to linux slashes)   
     121                if($line =~ m@^<(assoc-file|meta-file)>(.*)(\s+)@s) {
     122                    $line = $2; # may be a short file name
    120123                    # perhaps test here if it is a shortfilename? should match /CAPS....~number(.ext)/
    121124               
    122                     $line = "<assoc-file>".&Win32::GetLongPathName($line)."$2"; # make it a long file name and prefix assoc-file to it again                   
     125                    $line = "<$1>".&Win32::GetLongPathName($line)."$3"; # make it a long file name and prefix assoc-file/meta-file tagname to it again                 
    123126                }
    124127                $tmp .= $line;
     
    127130        }
    128131       
    129         # slashes in windows text need to be turned into linux style slashes
    130         $$win_text =~ s@\\@/@g;
     132       
     133        # index gdb file
     134        if($dbname =~ m/$strColName/) {
     135            my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
     136            for my $line (split /^/, $$win_text) { # split the string into newlines
     137               
     138                if($line =~ m@^<ex.File.Directory>(.*)@s) { # word-pdf collection contains double windows backslashes
     139                    (my $escaped_path = $1) =~ s@\\\\@\\@g;             
     140                    $line = "<ex.File.Directory>$escaped_path";
     141                }
     142                elsif($line =~ m@^<Title>(.*)@s) {
     143#                   print STDERR "***** TITLE: |$1|\n";
     144               
     145                    # word-pdf collection: Title of ps files contain new lines at end when
     146                    # GreenstoneXMLPlugin::xml_end_tag() writes the Title back out after utf8 decode
     147                    # if($metadata_name eq "Title") { $metadata_value =~ s/[\n\r]*$//; }
     148               
     149                    (my $title = $1) =~ s@(\r|\n|\\n)*$@@; # get rid of trailing newlines/carriage returns
     150                    $line = "<Title>$title\n"; # add single newline                 
     151                }
     152                $tmp .= $line;
     153            }
     154            $$win_text = $tmp;         
     155        }
     156       
     157       
     158        # slashes in windows metadata text need to be turned into linux style slashes
     159        $$win_text =~ s@\\@/@g; #$$win_text =~ s@\\([^n|r|\|"])@/$1@g; # filepath something\rtf remains something\rtf
    131160       
    132161        # cut down absolute paths to files to just collect/colname/.../file, same as before
    133         $$lin_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;
    134         $$win_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;
     162        $$lin_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg; # $$lin_text =~ s@^([^\\\/]*(//)?).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
     163        $$win_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;       
    135164       
    136165        # for the windows text, need to further get rid of the driveletter after [ or <meta>
     
    166195    my ($dbtailname, $db_contents) = @_; # db filename without suffix
    167196   
    168     if($dbtailname !~ m/archiveinf/) { # only archiveinf-doc and archive-inf source need special Windows processing, not col.gdb
    169         return 0;
    170     }
     197    #if($dbtailname !~ m/archiveinf/) { # only archiveinf-doc and archive-inf source need special Windows processing, not col.gdb
     198    #   return 0;
     199    #}
    171200    return ($db_contents =~ m/\\/) ? 1 : 0; # windows slashes detected. Better test would be: [Something\something] OR <tag>something\something
    172201    # for doc.xml:
Note: See TracChangeset for help on using the changeset viewer.