[21711] | 1 | package gdbdiff;
|
---|
| 2 |
|
---|
| 3 | BEGIN {
|
---|
| 4 | die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
|
---|
| 5 | die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
|
---|
| 6 | unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
|
---|
| 7 | unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
|
---|
| 8 | }
|
---|
| 9 |
|
---|
| 10 | use util;
|
---|
| 11 | use diffutil;
|
---|
| 12 | use Text::Diff;
|
---|
[27701] | 13 | use Cwd;
|
---|
[21711] | 14 |
|
---|
[27696] | 15 | if ($^O =~ m/mswin/i) {
|
---|
| 16 | require Win32; # for working out Windows Long Filenames from Win 8.3 short filenames
|
---|
| 17 | }
|
---|
[27695] | 18 |
|
---|
# Runs a command and returns everything it writes to its standard output
# as a single string.
#
#   $cmd - the command line to run. It is handed to open() as one string,
#          so shell syntax such as "2>&1" in the command keeps working.
#
# Returns the captured output ("" if the command printed nothing).
# Dies if the pipe to the command cannot be opened.
sub readin_gdb
{
    my ($cmd) = @_;

    # Lexical handle + 3-arg open: no package-global PIN handle to clobber,
    # and nothing in $cmd can be mistaken for an open() mode.
    open(my $pipe_in, '-|', $cmd)
        or die "Unable to open pipe to $cmd: $!\n";

    # Slurp the whole output in one read.
    my $text_content = do { local $/; <$pipe_in> };
    $text_content = "" unless defined $text_content;

    # The close status (i.e. the command's exit status) is deliberately not
    # checked: whatever the command printed is what the caller receives.
    close($pipe_in);

    return $text_content;
}
|
---|
| 35 |
|
---|
# Dumps a database file to text by running "db2txt -sort" on it and returns
# that text (stderr is folded into the result via 2>&1).
#
# The output is sorted so that the test and model collection databases are
# normalised before comparison; the -sort option of db2txt was added
# specifically to support diffcol.
#
#   $db_file - path of the database file to dump
#
# For a file ending in .jdb a "NOT YET IMPLEMENTED" notice is printed to
# STDERR; db2txt is still the command that gets run.
sub read_db
{
    my ($db_file) = @_;

    if ($db_file =~ m/\.jdb$/) {
        print STDERR "NOT YET IMPLEMENTED\n";
        # a jdb2txt based command would go here:
        #   "jdb2txt -sort $db_file 2>&1"
    }

    return readin_gdb("db2txt -sort $db_file 2>&1");
}
|
---|
| 50 |
|
---|
# Round-trips database text through a database file: the text is piped into
# "txt2db <file>" and the resulting file is read back with read_db().
#
#   $db_text     - text form of the database
#   $db_filename - database file that txt2db is asked to write
#
# Returns whatever read_db() returns for $db_filename.
# Dies if the pipe to txt2db cannot be opened; warns (but carries on) if the
# pipe does not close cleanly.
#
# Piping into a command from perl, see
# http://stackoverflow.com/questions/1909262/how-can-i-pipe-input-into-a-java-command-from-perl
sub text_to_db_to_text
{
    my ($db_text, $db_filename) = @_;

    # Previously the open was unchecked, so a failure here went unnoticed
    # and read_db() was run on a stale or missing file.
    open(my $pipe_out, '|-', "txt2db $db_filename")
        or die "Unable to open pipe to txt2db $db_filename: $!\n";

    print $pipe_out $db_text;

    # close() on a pipe waits for the command and is false when the command
    # failed; report it but still attempt to read the database back.
    close($pipe_out)
        or warn "txt2db $db_filename did not finish cleanly: "
            . ($! ? $! : "exit status $?") . "\n";

    return read_db($db_filename);
}
|
---|
| 62 |
|
---|
# For debugging: writes a string (e.g. the text contents of a db) to a file,
# replacing any previous contents of that file.
#
#   $text    - the text to write
#   $outfile - path of the file to (over)write
#
# Dies if the file cannot be opened for writing or cannot be closed.
sub print_string_to_file
{
    my ($text, $outfile) = @_;

    # 3-arg open: $outfile is used literally as the file name, whatever
    # characters or surrounding whitespace it contains.
    open(my $fout, '>', $outfile) or die "ERROR failed to write to $outfile: $!\n";
    print $fout $text;

    # Buffered write errors only surface at close, so check it.
    close($fout) or die "ERROR failed to write to $outfile: $!\n";
}
|
---|
[21711] | 72 |
|
---|
# Helper for test_gdb: expands a Windows 8.3 short path to its long form
# when that is possible. The path is returned unchanged if
# Win32::GetLongPathName cannot be called (the Win32 module is only loaded
# when running on Windows) or if it returns nothing.
sub _long_win_path
{
    my ($path) = @_;

    local $@;
    my $long_path = eval { Win32::GetLongPathName($path) };

    return (defined $long_path && $long_path ne "") ? $long_path : $path;
}

# Compares the text dump of a test collection's database file against the
# text dump of the model collection's database file, after removing
# differences that are expected (volatile fields, absolute path prefixes,
# random tmp file names, and Windows vs Linux path conventions).
#
#   $full_modeldb - path to the model collection's database file
#   $full_testdb  - path to the test collection's database file
#   $strColName   - collection name (the folder name under collect/)
#   $test_os      - "windows", "compute", or anything else (non-windows).
#                   "compute" means: work it out from the db contents using
#                   isDBWindowsSensitive()
#   $model_os     - same as $test_os, but for the model collection
#   $strTestCol   - full path to the test collection
#   $strModelCol  - full path to the model collection
#
# Returns "" when no differences remain, otherwise a string starting with
# "Difference Report: Differences found in the Database file: ".
sub test_gdb
{
    my ($full_modeldb, $full_testdb, $strColName, $test_os, $model_os, $strTestCol, $strModelCol) = @_;

    # fileparse() is called by its full name below, so make sure the module
    # is loaded rather than relying on some other module having loaded it.
    require File::Basename;

    # Tail name of the db file, without its suffix
    my ($dbname) = File::Basename::fileparse($full_testdb, "\\.[^\\.]+\$");

    my $model_text = read_db($full_modeldb);
    my $test_text  = read_db($full_testdb);

    # my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory
    # print_string_to_file($test_text, $savepath.$dbname."_test.out1");
    # print_string_to_file($model_text, $savepath.$dbname."_model.out1");

    # Filter out the fields that can be ignored in the two database files.
    # The total_numbytes field can vary depending on how many backslashes
    # exist in the urls in the main body text, as each of these windows
    # slashes gets escaped with another backslash, and the resulting string
    # is used as key into the rel link db.
    my $ignore_line_re = "\n<(FileSize|lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|total_numbytes|ex.Composite.LightValue)>([^\n])*";
    $model_text =~ s/$ignore_line_re//g;
    $test_text  =~ s/$ignore_line_re//g;

    # tmp dirs have subdirs with random numbers in their name: remove the
    # subdir. These tmpdirs are located inside the collection directory.
    $model_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
    $test_text  =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;

    # Entries are of the form [Entry] or <Entry>. To do a sensible diff, the
    # prefix up to the collect/colname folder has to be removed from any
    # (absolute) path that occurs in an Entry, since the model collection
    # could have been built anywhere.
    # E.g. [/full/path/collect/colname/import/file.ext] becomes [collect/colname/import/file.ext]
    # The collection name is quoted so that it is matched literally.
    my $collect_prefix_re = qr{^([^\\/]*(//)*).*(\\|/)(collect(\\|/)\Q$strColName\E)(.*)$}m;

    # If the OS doesn't match and one of them is windows, extra work needs
    # to be done to bring the db files of the test and model collection to
    # an even base for comparison.
    my $testIsWin  = ($test_os ne "compute")  ? ($test_os eq "windows")  : isDBWindowsSensitive($dbname, $test_text);
    my $modelIsWin = ($model_os ne "compute") ? ($model_os eq "windows") : isDBWindowsSensitive($dbname, $model_text);

    if($testIsWin == $modelIsWin) {
        # Both linux or both windows: only crop the absolute path prefixes
        # (necessary for the archiveinf-doc and -src gdb files).
        $model_text =~ s/$collect_prefix_re/$1$4$6/g;
        $test_text  =~ s/$collect_prefix_re/$1$4$6/g;
    }
    else {
        # One of the collections was built on windows: handle slashes and
        # other differences between the two.

        # $win_text/$lin_text are references to whichever of the two texts
        # is the windows one and the linux one; $collection_path is the
        # full path to the windows collection.
        my ($win_text, $lin_text, $collection_path);
        if($testIsWin) {
            $collection_path = $strTestCol;
            $win_text = \$test_text;
            $lin_text = \$model_text;
        } else {
            $collection_path = $strModelCol;
            $win_text = \$model_text;
            $lin_text = \$test_text;
        }

        if($dbname =~ m/archiveinf-doc/) {

            # Rebuild the windows text line by line, converting short (8.3)
            # filenames to long ones, see
            # http://www.mombu.com/programming/perl/t-convert-dos-83-filenames-to-win32-long-filenames-using-perl-525448.html
            my $rebuilt = "";
            for my $line (split /^/, $$win_text) {

                # assoc-file, meta-file and src-file contain filepaths:
                # ensure these are long windows file paths (they are
                # converted to linux slashes further down)
                if($line =~ m@^<(assoc-file|meta-file|src-file)>(.*)(\s+)@s) {
                    my ($field, $value, $suffix) = ($1, $2, $3);

                    if($value !~ m/^\@/) {
                        # an absolute path, not one using a "relative"
                        # @...@ placeholder string
                        $value = _long_win_path($value);
                    }
                    else {
                        # Replace the @THISCOLLECTPATH@ placeholder with the
                        # absolute path, expand to the long filename, then
                        # put the placeholder back in its original place.
                        # The path is quoted so that characters such as
                        # "(" or "." in it are matched literally.
                        $value =~ s/\@THISCOLLECTPATH\@/$collection_path/;
                        $value = _long_win_path($value);
                        $value =~ s/^\Q$collection_path\E/\@THISCOLLECTPATH\@/;
                    }
                    $line = "<$field>".$value.$suffix;
                }
                $rebuilt .= $line;
            }
            $$win_text = $rebuilt;
        }

        if($dbname =~ m/\Q$strColName\E/) {
            # index gdb file
            my $rebuilt = "";
            for my $line (split /^/, $$win_text) {

                # In the following regex, add any .gdb fieldnames that
                # represent a path and so would contain double backslashes
                # on Windows (to escape the single backslash of win
                # filepaths). They are turned into forward slashes here.
                # E.g. on windows the Word-PDF collection(s) contain double
                # backslashes in the ex.File.Directory field, and the
                # MARC-Exploded collection contains them in the null_file
                # entry field of the .gdb file.
                if($line =~ m@^<(ex.File.Directory|null_file)>(.*)@s) {
                    my ($fieldname, $escaped_path) = ($1, $2);
                    $escaped_path =~ s@\\\\@/@g;
                    $line = "<$fieldname>$escaped_path";
                }
                elsif($line =~ m@^<Title>(.*)@s) {
                    # word-pdf collection: Titles of ps files contain
                    # newlines at the end; get rid of trailing
                    # newlines/carriage returns and add a single newline.
                    (my $title = $1) =~ s@(\r|\n|\\n)*$@@;
                    $line = "<Title>$title\n";
                }
                $rebuilt .= $line;
            }
            $$win_text = $rebuilt;
        }
        else {
            # archiveinf gdb file: filepaths may use single backslashes,
            # turn them all into linux style slashes
            $$win_text =~ s@\\@/@g;
        }

        # cut down absolute paths to files to just collect/colname/.../file, same as before
        $$lin_text =~ s/$collect_prefix_re/$1$4$6/g;
        $$win_text =~ s/$collect_prefix_re/$1$4$6/g;

        # for the windows text, further get rid of the driveletter after [ or <meta>
        $$win_text =~ s@^(\[|<[^>]*>)[a-zA-Z]:collect@$1collect@mg;
    }

    # The following block of code deals with tmp (html) source files
    # generated when using PDFBox. These tmpdirs are located inside the
    # toplevel *greenstone* directory.
    (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;
    # NOTE(review): $$ENV{'GSDLHOME'} reads a key through the *scalar* $ENV
    # used as a hash reference, not the %ENV hash. Unless $ENV is set
    # somewhere outside this file, that is always false, so $gsdlhome_re is
    # always ".*" here. Left unchanged because it is not clear from this
    # file that matching on the real GSDLHOME value would be correct.
    $gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'};

    # NOTE(review): the patterns below are built in double-quoted strings,
    # where "\." is just ".", so the intended literal dot before the 3 or 4
    # character file extension is in fact "any character". Left unchanged
    # because the normalised output depends on it.
    my $tmpfile_regex = "<URL>http://$gsdlhome_re/tmp/([^\.]*?)(\..{3,4})"; # $gsdlhome/tmp/randomfilename.html
    if($test_text =~ m@$tmpfile_regex@g) {
        # Found a match: replace the tmp file name with "random" in
        # <OrigSource|URL|UTF8URL|gsdlconvertedfilename>.
        # Unlike doc.xml (one per document), index/col.gdb contains the
        # random filenames of all docs in the collection, so this has to be
        # done for every occurrence.
        my $new_tmp_filename = "random";

        # NOTE(review): $5 is tested before the substitution that would set
        # it; it still refers to the previous successful match.
        $tmpfile_regex = "(<(URL|UTF8URL|gsdlconvertedfilename|OrigSource)>(http://)?)($gsdlhome_re)?(/tmp/)?.*?(\..{3,4})";
        if($5) {
            $test_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
        } else { # OrigSource contains only the filename
            $test_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
        }

        # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename
        $tmpfile_regex = "(<(URL|UTF8URL|gsdlconvertedfilename|OrigSource)>(http://)?)(.*)?(/tmp/)?.*?(\..{3,4})";
        if($5) {
            $model_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
        } else { # OrigSource contains only the filename
            $model_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
        }

        # index/col.gdb also has entries for the random tmp file names in
        # the form [http://research/ak19/GS2bin_5July2013/tmp/F639.html];
        # these need to be equalised too. When there are multiple
        # intermediate files, their random tmp filenames are not guaranteed
        # to be generated in the same order between model and test
        # collection, so the entry key is rewritten to include its
        # <section> line.
        $test_text  =~ s@\[http://[^\n]*?/tmp/.*?(\..{3,4})\]\n<section>([^\n]*?)\n@[tmp/random$1\n<section>$2]\n@sg;
        $model_text =~ s@\[http://[^\n]*?/tmp/.*?(\..{3,4})\]\n<section>([^\n]*?)\n@[tmp/random$1\n<section>$2]\n@sg;

        # Re-sort the keys now that the absolute paths to tmp locations have
        # been removed, so that the tmp files come out in the same order in
        # both model and test collections.
        $model_text = text_to_db_to_text($model_text, "model.gdb");
        $test_text  = text_to_db_to_text($test_text, "test.gdb");
    }

    # print_string_to_file($test_text, $savepath.$dbname."_test.out");
    # print_string_to_file($model_text, $savepath.$dbname."_model.out");

    my $report_type = "OldStyle"; # Can not change this type.
    my $diff_gdb = diff(\$model_text, \$test_text, { STYLE => $report_type });

    # leaving the ignore regex as it used to be in the following, in case it helps with single line comparisons
    $diff_gdb = diffutil::GenerateOutput($diff_gdb, "^<(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ImageSize)>.*");

    return "" if $diff_gdb eq "";
    return "Difference Report: Differences found in the Database file: \n$diff_gdb";
}
|
---|
| 292 |
|
---|
# Decides whether a database's text contents look like they were generated
# on windows in a way that matters when diffing that database.
#
#   $dbtailname  - db filename without suffix; names starting with
#                  "archiveinf-doc" or "archiveinf-src" select the checks
#                  for those files, any other name is treated as the
#                  index/col.gdb file
#   $db_contents - the text contents of the database
#
# Returns 1 if one of the windows markers for that kind of db is found in
# the contents, 0 otherwise.
sub isDBWindowsSensitive
{
    my ($dbtailname, $db_contents) = @_;

    # A cruder test would be to look for any backslash in the contents;
    # instead, look for markers specific to each kind of db file.
    my @windows_markers;

    if($dbtailname =~ m/^archiveinf-doc/) {
        @windows_markers = (
            qr@<src-file>[a-zA-Z]:\\@,   # <src-file>C:\path
        );
    }
    elsif($dbtailname =~ m/^archiveinf-src/) {
        @windows_markers = (
            qr@\[[a-zA-Z]:\\@,           # [C:\path]
        );
    }
    else { # index/col.gdb file
        @windows_markers = (
            qr@<URL>http://[a-zA-Z]:/@,                                  # <URL>http://C:/path
            qr@^(<URL>http://[a-zA-Z]:/)|(<null_file>[^\\]*\\)@m,        # <URL>http://C:/path OR <null_file>CMSwp-all.00000001\\00000035.nul
            qr@^(<ex.File.Directory>[a-zA-Z]:\\\\)@m,                    # <ex.File.Directory>C:\\path\\path for OAI collection
        );
    }

    for my $marker (@windows_markers) {
        return 1 if $db_contents =~ $marker;
    }
    return 0;
}
|
---|
| 320 |
|
---|
[21711] | 321 | 1;
|
---|