Ignore:
Timestamp:
2013-08-19T20:42:40+12:00 (11 years ago)
Author:
ak19
Message:

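Bringing windows diffcol up to date for the latest tutorials. A new field (ex.Composite.LightValue), whose value can be slightly different, is now added to the list of fields ignored in both the model and test text. Windows gdb detection needs an additional test when the input collection is OAI: a drive-letter path in the ex.File.Directory field now also marks the gdb as a Windows one. Backslashes in docmets need to be normalised.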

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm

    r28071 r28086  
    6666    # The total_numbytes field can vary depending on how many backslashes exist in the urls in the main body text, as each
    6767    # of these windows slashes gets escaped with another backslash, and the resulting string is used as the key into rel link db
    68     my $ignore_line_re = "\n<(FileSize|lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|total_numbytes)>([^\n])*";
     68    my $ignore_line_re = "\n<(FileSize|lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|total_numbytes|ex.Composite.LightValue)>([^\n])*";
    6969    $model_text =~ s/$ignore_line_re//g;
    7070    $test_text =~ s/$ignore_line_re//g;
     
    8181    my $modelIsWin = &isDBWindowsSensitive($dbname, $model_text);
    8282   
    83     if($testIsWin == $modelIsWin) { # both linux or both windows, do the basic test we did on linux machines:
     83    if($testIsWin == $modelIsWin) {     
     84    # both linux or both windows, do the basic test we did on linux machines:
    8485            # ignore absolute path prefixes in modelcol and testcol (necessary for archiveinf-doc and -src.gdb files)
    8586
     
    120121               
    121122                # assoc-file and meta-file contain filepaths, ensure these are long windows file paths now (will later convert to linux slashes)   
    122                 if($line =~ m@^<(assoc-file|meta-file)>(.*)(\s+)@s) { 
     123                if($line =~ m@^<(assoc-file|meta-file)>(.*)(\s+)@s) {
    123124                    $line = $2; # may be a short file name
    124125                    # perhaps test here if it is a shortfilename? should match /CAPS....~number(.ext)/
     
    135136        if($dbname =~ m/$strColName/) {
    136137            my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
    137             for my $line (split /^/, $$win_text) { # split the string into newlines
    138            
     138            for my $line (split /^/, $$win_text) { # split the string into newlines         
     139
    139140                # In the following regex, add any .gdb fieldnames that represent a path and so would contain double backslashes
    140141                # on Windows (to escape the single backslash of win filepaths). They will be turned into single-backslashes here,
     
    143144                # E.g. On windows, the Word-PDF collection(s) contains double backslashes in the ex.File.Directory field
    144145                # the MARC-Exploded collection contains double backslashes in the null_file entry field of the .gdb file               
    145                 if($line =~ m@^<(ex.File.Directory|null_file)>(.*)@s) {                 
     146                if($line =~ m@^<(ex.File.Directory|null_file)>(.*)@s) {
    146147                    my ($fieldname, $escaped_path) = ($1, $2);
    147148                    $escaped_path =~ s@\\\\@/@g; #(my $escaped_path = $2) =~ s@\\\\@\\@g;
     
    282283            return 1;
    283284        }
     285        elsif ($db_contents =~ m@^(<ex.File.Directory>[a-zA-Z]:\\\\)@m) { # <ex.File.Directory>C:\\path\\path for OAI collection
     286            return 1;
     287        }
    284288        return 0;       
    285289    }   
Note: See TracChangeset for help on using the changeset viewer.