Ignore:
Timestamp:
08/09/13 22:46:32 (7 years ago)
Author:
ak19
Message:

Better detection of whether a gdb file is a windows-generated one or not. The weakness of the earlier test was only discovered with the Multimedia tutorial collection.

Location:
other-projects/nightly-tasks/diffcol/trunk/diffcol
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • other-projects/nightly-tasks/diffcol/trunk/diffcol/diffcol.pl

    r28008 r28019  
    676676                # Doing so is okay, since we're not modifying the doc.xml in the model or test collections, just normalising them in-memory for comparison
    677677                $$lin_contents =~ s@([^\\])\\([^\\])@$1\/$2@g;
     678               
     679                # Advanced Beatles collection,
     680                # linux version contains: IMG SRC=_httpextlink_&rl=1&href=http:///\\"http://www.boskowan.com/ (extra / slash)
     681                # while windows contains: IMG SRC=_httpextlink_&rl=1&href=http://\\"http://www.boskowan.com/
     682                # Normalising to windows version for doing a diff
     683                $$lin_contents =~ s@href=http:///@href=http://@g;
    678684            }
    679685           
  • other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm

    r28005 r28019  
    143143                # E.g. On windows, the Word-PDF collection(s) contains double backslashes in the ex.File.Directory field
    144144                # the MARC-Exploded collection contains double backslashes in the null_file entry field of the .gdb file               
    145                 if($line =~ m@^<(ex.File.Directory|null_file)>(.*)@s) {
     145                if($line =~ m@^<(ex.File.Directory|null_file)>(.*)@s) {                 
    146146                    my ($fieldname, $escaped_path) = ($1, $2);
    147                     $escaped_path =~ s@\\\\@\\@g; #(my $escaped_path = $2) =~ s@\\\\@\\@g;
     147                    $escaped_path =~ s@\\\\@/@g; #(my $escaped_path = $2) =~ s@\\\\@\\@g;
    148148                    $line = "<$fieldname>$escaped_path";
    149149                }
     
    160160                $tmp .= $line;
    161161            }
    162             $$win_text = $tmp;         
    163         }
    164        
    165        
    166         # slashes in windows metadata text need to be turned into linux style slashes
    167         $$win_text =~ s@\\@/@g; #$$win_text =~ s@\\([^n|r|\|"])@/$1@g; # filepath something\rtf remains something\rtf
     162            $$win_text = $tmp;
     163       
     164            # slashes in windows metadata text need to be turned into linux style slashes.
     165            # index\col.gdb uses double backslashes, and single for \n,\t
     166            #$$win_text =~ s@\\\\@/@g;
     167        }
     168        else { # archiveinf gdb file
     169       
     170            # slashes in windows metadata text need to be turned into linux style slashes.
     171            # In the two archiveinf gdb files, filepaths may use single backslashes
     172            $$win_text =~ s@\\@/@g; #$$win_text =~ s@\\([^n|r|\|"])@/$1@g; # filepath something\rtf remains something\rtf
     173        }
    168174       
    169175        # cut down absolute paths to files to just collect/colname/.../file, same as before
     
    244250    my ($dbtailname, $db_contents) = @_; # db filename without suffix
    245251   
    246     #if($dbtailname !~ m/archiveinf/) { # only archiveinf-doc and archive-inf source need special Windows processing, not col.gdb
    247     #   return 0;
    248     #}
    249     return ($db_contents =~ m/\\/) ? 1 : 0; # windows slashes detected. Better test would be: [Something\something] OR <tag>something\something
    250     # for doc.xml:
    251     #     <Metadata name="gsdlsourcefilename">import/html_files/cleves.html</Metadata>
     252#   return ($db_contents =~ m/\\/) ? 1 : 0; # windows slashes detected. Better test would be: [Something\something] OR <tag>something\something
     253   
     254    if($dbtailname =~ m/^archiveinf-doc/) {
     255        return ($db_contents =~ m@<src-file>[a-zA-Z]:\\@) ? 1 : 0; # <src-file>C:\path
     256    }
     257    elsif($dbtailname =~ m/^archiveinf-src/) { # [C:\path]
     258        return ($db_contents =~ m@\[[a-zA-Z]:\\@) ? 1 : 0; # [C:\path]
     259    }
     260    else { # index/col.gdb file
     261        if ($db_contents =~ m@<URL>http://[a-zA-Z]:/@) { # <URL>http://C:/path
     262            return 1;
     263        }       
     264        elsif ($db_contents =~ m@^(<URL>http://[a-zA-Z]:/)|(<null_file>[^\\]*\\)@m) { # <URL>http://C:/path OR <null_file>CMSwp-all.00000001\\00000035.nul         
     265            return 1;
     266        }
     267        return 0;       
     268    }   
    252269}
    253270
Note: See TracChangeset for help on using the changeset viewer.