Show
Ignore:
Timestamp:
09.08.2013 22:46:32 (6 years ago)
Author:
ak19
Message:

Better detection of whether a gdb file is a Windows-generated one or not. The weakness of the earlier test was only discovered with the Multimedia tutorial collection.

Location:
other-projects/nightly-tasks/diffcol/trunk/diffcol
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • other-projects/nightly-tasks/diffcol/trunk/diffcol/diffcol.pl

    r28008 r28019  
    676676                # Doing so is okay, since we're not modifying the doc.xml in the model or test collections, just normalising them in-memory for comparison 
    677677                $$lin_contents =~ s@([^\\])\\([^\\])@$1\/$2@g; 
     678                 
     679                # Advanced Beatles collection,  
     680                # linux version contains: IMG SRC=_httpextlink_&rl=1&href=http:///\\"http://www.boskowan.com/ (extra / slash) 
     681                # while windows contains: IMG SRC=_httpextlink_&rl=1&href=http://\\"http://www.boskowan.com/ 
     682                # Normalising to windows version for doing a diff 
     683                $$lin_contents =~ s@href=http:///@href=http://@g; 
    678684            } 
    679685             
  • other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm

    r28005 r28019  
    143143                # E.g. On windows, the Word-PDF collection(s) contains double backslashes in the ex.File.Directory field 
    144144                # the MARC-Exploded collection contains double backslashes in the null_file entry field of the .gdb file                 
    145                 if($line =~ m@^<(ex.File.Directory|null_file)>(.*)@s) { 
     145                if($line =~ m@^<(ex.File.Directory|null_file)>(.*)@s) {                  
    146146                    my ($fieldname, $escaped_path) = ($1, $2); 
    147                     $escaped_path =~ s@\\\\@\\@g; #(my $escaped_path = $2) =~ s@\\\\@\\@g; 
     147                    $escaped_path =~ s@\\\\@/@g; #(my $escaped_path = $2) =~ s@\\\\@\\@g; 
    148148                    $line = "<$fieldname>$escaped_path"; 
    149149                }  
     
    160160                $tmp .= $line; 
    161161            } 
    162             $$win_text = $tmp;           
    163         } 
    164          
    165          
    166         # slashes in windows metadata text need to be turned into linux style slashes 
    167         $$win_text =~ s@\\@/@g; #$$win_text =~ s@\\([^n|r|\|"])@/$1@g; # filepath something\rtf remains something\rtf 
     162            $$win_text = $tmp; 
     163         
     164            # slashes in windows metadata text need to be turned into linux style slashes.  
     165            # index\col.gdb uses double backslashes, and single for \n,\t 
     166            #$$win_text =~ s@\\\\@/@g; 
     167        } 
     168        else { # archiveinf gdb file 
     169         
     170            # slashes in windows metadata text need to be turned into linux style slashes.  
     171            # In the two archiveinf gdb files, filepaths may use single backslashes 
     172            $$win_text =~ s@\\@/@g; #$$win_text =~ s@\\([^n|r|\|"])@/$1@g; # filepath something\rtf remains something\rtf 
     173        } 
    168174         
    169175        # cut down absolute paths to files to just collect/colname/.../file, same as before 
     
    244250    my ($dbtailname, $db_contents) = @_; # db filename without suffix 
    245251     
    246     #if($dbtailname !~ m/archiveinf/) { # only archiveinf-doc and archive-inf source need special Windows processing, not col.gdb 
    247     #   return 0; 
    248     #} 
    249     return ($db_contents =~ m/\\/) ? 1 : 0; # windows slashes detected. Better test would be: [Something\something] OR <tag>something\something 
    250     # for doc.xml: 
    251     #     <Metadata name="gsdlsourcefilename">import/html_files/cleves.html</Metadata> 
     252#   return ($db_contents =~ m/\\/) ? 1 : 0; # windows slashes detected. Better test would be: [Something\something] OR <tag>something\something  
     253     
     254    if($dbtailname =~ m/^archiveinf-doc/) { 
     255        return ($db_contents =~ m@<src-file>[a-zA-Z]:\\@) ? 1 : 0; # <src-file>C:\path 
     256    }  
     257    elsif($dbtailname =~ m/^archiveinf-src/) { # [C:\path] 
     258        return ($db_contents =~ m@\[[a-zA-Z]:\\@) ? 1 : 0; # [C:\path] 
     259    }  
     260    else { # index/col.gdb file 
     261        if ($db_contents =~ m@<URL>http://[a-zA-Z]:/@) { # <URL>http://C:/path 
     262            return 1; 
     263        }        
     264        elsif ($db_contents =~ m@^(<URL>http://[a-zA-Z]:/)|(<null_file>[^\\]*\\)@m) { # <URL>http://C:/path OR <null_file>CMSwp-all.00000001\\00000035.nul           
     265            return 1; 
     266        } 
     267        return 0;        
     268    }    
    252269} 
    253270