source: other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm@ 36807

Last change on this file since 36807 was 36807, checked in by anupama, 19 months ago

We used to run diffcol as a nightly task only for GS2. Commit 36655 was the first stage of getting diffcol to work for GS3, but skipped a lot of important code branches (like comparing the index\text\j/gdb files) in order to fix up the easier parts of the code. Now that I think the remainder of the diffcol scripts have been got to work with diffcol for GS3, where the index\text\flatdb files are compared and diffcol works for them, I can commit the important changes as well as commented out debugging statements made to the diffcol scripts that get the full diffcol code to work for GS3 diffcol. I will recommit again after removing the debugging statements. And I still need to do a full local diffcol run again, as well as testing if diffcol still works after locally undoing my sort field changes to some GS3 model cols (the recent commits to Tudor, Word-PDF, Images-GPS and Multimedia collections) to see if Dr Bainbridge's PERL_HASH_SEED env var addition fixes all of those collections diffcol failures, making the extra sorting redundant. In that case, I will recommit those model collections after updating their col configurations to not do the extra sorting.

File size: 16.3 KB
RevLine 
[21711]1package gdbdiff;
2
3BEGIN {
4 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
5 die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
6 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
7 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
8}
9
10use util;
11use diffutil;
12use Text::Diff;
[27701]13use Cwd;
[21711]14
[27696]15if ($^O =~ m/mswin/i) {
16 require Win32; # for working out Windows Long Filenames from Win 8.3 short filenames
17}
[27695]18
[21711]19sub readin_gdb
20{
21 my ($cmd) = @_;
22
23 open(PIN,"$cmd|")
24 || die "Unable to open pipe to $cmd: $!\n";
25
26 my $text_content = "";
27
28 while (defined (my $line = <PIN>)) {
29 $text_content .= $line;
30 }
31
32 close(PIN);
33 return $text_content;
34}
35
[28661]36sub read_db
37{
38 # need to sort text output of both test and model col database files, to normalise them for the comparison
39 # the -sort option to db2txt was added specifically to support diffcol
40 my($db_file) = @_;
41 my $db_cmd = "db2txt -sort $db_file 2>&1";
42 if($db_file =~ m/\.jdb$/) {
[34955]43 my $runperl = ($ENV{'GSDLOS'} =~ m/windows/) ? "perl -S" : "";
44 $db_cmd = "$runperl jdb2txt.pl -sort $db_file";
[28661]45 }
46
47 my $db_text = readin_gdb($db_cmd);
48 return $db_text;
49}
50
51sub text_to_db_to_text
52{
53 my($db_text, $db_filename) = @_;
54
55 # http://stackoverflow.com/questions/1909262/how-can-i-pipe-input-into-a-java-command-from-perl
56
[34955]57 if($db_filename =~ m/\.jdb$/) {
58 my $runperl = ($ENV{'GSDLOS'} =~ m/windows/) ? "perl -S" : "";
59 open PIPE, "| $runperl txt2jdb.pl $db_filename";
60 print PIPE "$db_text";
61 close(PIPE);
62 } else {
63 open PIPE, "| txt2db $db_filename";
64 print PIPE "$db_text";
65 close(PIPE);
66 }
[28661]67 return &read_db("$db_filename");
68}
69
[27695]70# for debugging. Prints txt contents of db to file
71sub print_string_to_file
72{
[28661]73 my ($text, $outfile) = @_;
[27695]74
[28661]75 open(FOUT, ">$outfile") or die "ERROR failed to write to $outfile: $!\n";
[27695]76 print FOUT $text;
77 close(FOUT);
78}
[21711]79
80sub test_gdb
81{
[29495]82 my ($full_modeldb, $full_testdb, $strColName, $test_os, $model_os, $strTestCol, $strModelCol, $debugging) = @_;
[21711]83
84 # print "Now is testing database\n";
[27695]85 my ($dbname, $dirname, $suffix)= &File::Basename::fileparse($full_testdb, "\\.[^\\.]+\$");
86
[28661]87 my $model_text = read_db($full_modeldb);
88 my $test_text = read_db($full_testdb);
[21711]89
[29495]90 my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory
91 if($debugging) {
92 print_string_to_file($test_text, $savepath.$dbname."_test.out1");
93 print_string_to_file($model_text, $savepath.$dbname."_model.out1");
94 }
[27604]95
96 # filter out the fields that can be ignored in the two database files
[27701]97 # The total_numbytes field can vary depending on how many backslashes exist in the urls in the main body text, as each
98 # of these windows slashes get escaped with another backslash, and the resulting string is used as key into rel link db
[35231]99 my $ignore_line_re = "\n<(FileSize|lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|ex.File.FileAccessDate|ex.File.FileInodeChangeDate|ex.File.FileCreateDate|total_numbytes|ex.Composite.LightValue)>([^\n])*";
[27604]100 $model_text =~ s/$ignore_line_re//g;
101 $test_text =~ s/$ignore_line_re//g;
102
[27730]103 # tmp dirs have subdirs with random numbers in name, remove subdir
[27766]104 # these tmpdirs are located inside the collection directory
[27730]105 $model_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
106 $test_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
[27701]107
[36807]108
109#print STDERR "@@@@ DEBUGGING: $debugging\n";
110#print STDERR "******** full_modeldb: $full_modeldb\n$model_text\n\n";
111#print STDERR "******** full_testdb: $full_testdb\n$test_text\n\n";
112
[27695]113 # if the OS doesn't match and one of them is windows, extra work needs to be done to bring the db files
114 # in test and model collection to an even base for comparison
115
[28172]116 my $testIsWin = ($test_os ne "compute") ? ($test_os eq "windows") : &isDBWindowsSensitive($dbname, $test_text);
117 my $modelIsWin = ($model_os ne "compute") ? ($model_os eq "windows") : &isDBWindowsSensitive($dbname, $model_text);
[27695]118
[28086]119 if($testIsWin == $modelIsWin) {
120 # both linux or both windows, do the basic test we did on linux machines:
[27695]121 # ignore absolute path prefixes in modelcol and testcol (necessary for archiveinf-doc and -src.gdb files)
[27604]122
[27695]123 # Remember the original model col on SVN could have been built anywhere,
124 # and in the gdb files, absolute paths are stored to the collection location.
125 # Crop these paths to the collect/<colname> point.
126
127 # Entries are of the form [Entry] or <Entry>. In order to do a sensible diff,
128 # need to remove the prefix to the collect/colname folder in any (absolute) path that occurs in Entry
129 # E.g. [/full/path/collect/colname/import/file.ext] should become [collect/colname/import/file.ext]
130 # Better regex is of the form /BEGIN((?:(?!BEGIN).)*)END/, see http://docstore.mik.ua/orelly/perl/cookbook/ch06_16.htm
[27604]131
[27743]132 $model_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
133 $test_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
134 #$model_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;
135 #$test_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;
[27695]136 }
137
138 else { # one of the collections was built on windows
139 # handling slashes and other differences between a model coll built on one OS (e.g. linux)
140 # and a test collection built and diffed on another OS (windows)
141
142 my ($win_text, $lin_text); # references
[28238]143 my $collection_path = $strTestCol; # full path to a windows collection
144
[27695]145 if($testIsWin) {
[28238]146 $collection_path = $strTestCol; # test collection path is windows
[27695]147 $win_text = \$test_text;
148 $lin_text = \$model_text;
149 } else {
[28238]150 $collection_path = $strModelCol; # model collection path is windows
[27695]151 $win_text = \$model_text;
152 $lin_text = \$test_text;
153 }
154
155 if($dbname =~ m/archiveinf-doc/) {
[28238]156
157 (my $collection_path_re = $collection_path) =~ s@\\@\\\\@g;
158
[27695]159 my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
160
161 # convert short filenames to long perl:
162 # http://www.mombu.com/programming/perl/t-convert-dos-83-filenames-to-win32-long-filenames-using-perl-525448.html
163 for my $line (split /^/, $$win_text) { # split the string into newlines
164
[27743]165 # assoc-file and meta-file contain filepaths, ensure these are long windows file paths now (will later convert to linux slashes)
[28224]166 if($line =~ m@^<(assoc-file|meta-file|src-file)>(.*)(\s+)@s) {
[28238]167 my ($field, $value, $suffix) = ($1, $2, $3);
168 $line = $value; # it may be a short file name
[28224]169
[28238]170 if($line !~ m/^\@/) { # if the path doesn't use a "relative" @GSPATH@ placeholder string, but is an absolute path instead
171 # perhaps test here if it is a shortfilename? should match /CAPS....~number(.ext)/
172 $line = "<$field>".&Win32::GetLongPathName($line)."$suffix"; # make it a long file name and prefix assoc-file/meta-file tagname to it again
173 }
174 else { # if $line contains @THISCOLLECTIONPATH@, still need to deal with DOS filenames suffixes:
175 # replace placeholder with absolute path and expand to long filename, then insert placeholder in its original place again
176 $line =~ s/\@THISCOLLECTPATH\@/$collection_path/;
177 $line = &Win32::GetLongPathName($line);
178 $line =~ s/^$collection_path_re/\@THISCOLLECTPATH\@/;
179 $line = "<$field>".$line."$suffix";
[28224]180 }
[27695]181 }
182 $tmp .= $line;
183 }
184 $$win_text = $tmp;
185 }
186
187
[27743]188 # index gdb file
189 if($dbname =~ m/$strColName/) {
190 my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
[28086]191 for my $line (split /^/, $$win_text) { # split the string into newlines
192
[28005]193 # In the following regex, add any .gdb fieldnames that represent a path and so would contain double backslashes
194 # on Windows (to escape the single backlash of win filepaths). They will be turned into single-backslashes here,
195 # and converted into single forward slashes futher below when the txt version of the win gdb file is normalised
196 # to compare it with the linux version.
197 # E.g. On windows, the Word-PDF collection(s) contains double backslashes in the ex.File.Directory field
198 # the MARC-Exploded collection contains double backslashes in the null_file entry field of the .gdb file
[28086]199 if($line =~ m@^<(ex.File.Directory|null_file)>(.*)@s) {
[28005]200 my ($fieldname, $escaped_path) = ($1, $2);
[28019]201 $escaped_path =~ s@\\\\@/@g; #(my $escaped_path = $2) =~ s@\\\\@\\@g;
[28005]202 $line = "<$fieldname>$escaped_path";
[27743]203 }
204 elsif($line =~ m@^<Title>(.*)@s) {
205# print STDERR "***** TITLE: |$1|\n";
206
207 # word-pdf collection: Title of ps files contain new lines at end when
208 # GreenstoneXMLPlugin::xml_end_tag() writes the Title back out after utf8 decode
209 # if($metadata_name eq "Title") { $metadata_value =~ s/[\n\r]*$//; }
210
211 (my $title = $1) =~ s@(\r|\n|\\n)*$@@; # get rid of trailing newlines/carriage returns
212 $line = "<Title>$title\n"; # add single newline
213 }
214 $tmp .= $line;
215 }
[28019]216 $$win_text = $tmp;
217
218 # slashes in windows metadata text need to be turned into linux style slashes.
219 # index\col.gdb uses double backslashes, and single for \n,\t
220 #$$win_text =~ s@\\\\@/@g;
[27743]221 }
[28019]222 else { # archiveinf gdb file
[27743]223
[28019]224 # slashes in windows metadata text need to be turned into linux style slashes.
225 # In the two archivesinf gdb files, filepaths may use single backslashes
226 $$win_text =~ s@\\@/@g; #$$win_text =~ s@\\([^n|r|\|"])@/$1@g; # filepath something\rtf remains something\rtf
227 }
[27743]228
[27695]229 # cut down absolute paths to files to just collect/colname/.../file, same as before
[27743]230 $$lin_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg; # $$lin_text =~ s@^([^\\\/]*(//)?).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
231 $$win_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
[27695]232
233 # for the windows text, need to further get rid of the driveletter after [ or <meta>
[27701]234 $$win_text =~ s@^(\[|<[^>]*>)[a-zA-Z]:collect@$1collect@mg;
[27695]235
236 } # end of equalising differences between a windows collection's db file and linux coll's db file
[27766]237
[36807]238
239 # Windows or linux: if index is a flat db file, then ensure the docIDs listed in <contains> field of
240 # both test and model flat db file are alphabetically sorted. So to the numbers in <mdoffset> field.
241 # Despite PERL_PERTURB_KEYS envvar being set to 0 on both machine when generating model collections
242 # and when test collections were generated on test machine, still collections like Images-GPS and some
243 # other colls list items in <contains> and <mdoffset> in different orders. So reordering alphabetically.
244 #if($dbname =~ m/$strColName/) {
245 # regex modifiers mge: multi-line, global (replace as many as match), e allows function call in substitution
246 ##$model_text =~ s@^<contains>(.*)@sort_contains_field($1, "MODEL", $debugging)@mge;
247 ##$test_text =~ s@^<contains>(.*)@sort_contains_field($1, "TEST", $debugging)@mge;
248 # $model_text =~ s@^<(contains|mdoffset)>(.*)@sort_field_value($1, $2, "MODEL", $debugging)@mge;
249 # $test_text =~ s@^<(contains|mdoffset)>(.*)@sort_field_value($1, $2, "TEST", $debugging)@mge;
250 #}
251
[27766]252 # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox
253 # These tmpdirs are located inside the toplevel *greenstone* directory
254 (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;
[27767]255 $gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'};
[28067]256 my $tmpfile_regex = "<URL>http://$gsdlhome_re/tmp/([^\.]*?)(\..{3,4})"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long
[27766]257 if($test_text =~ m@$tmpfile_regex@g) {
258 # found a match, replace the tmp file name with "random", keeping the original file extension
259 # in <OrigSource|URL|UTF8URL|gsdlconvertedfilename>
260
261 # This code is slightly different from doc.xml because each document has its own doc.xml, so this needs to be done
262 # only once for doc.xml, but multiple times in index/col.gdb since it contains the random filenames of all docs in the col
263 #my ($old_tmp_filename, $ext) = ($1, $2);
264
265 my $new_tmp_filename = "random";
266
267
[28067]268 $tmpfile_regex = "(<(URL|UTF8URL|gsdlconvertedfilename|OrigSource)>(http://)?)($gsdlhome_re)?(/tmp/)?.*?(\..{3,4})";
[27766]269 if($5) {
270 $test_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
271 } else { # OrigSource contains only the filename
272 $test_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
273 }
274
275 # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename
[28067]276 $tmpfile_regex = "(<(URL|UTF8URL|gsdlconvertedfilename|OrigSource)>(http://)?)(.*)?(/tmp/)?.*?(\..{3,4})";
[27766]277 if($5) {
278 $model_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
279 } else { # OrigSource contains only the filename
280 $model_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
281 }
282
283 # index/col.gdb also has entries for the random tmp file names in the form: [http://research/ak19/GS2bin_5July2013/tmp/F639.html]
[27767]284 # need to equalise these also. Sadly, when there are multiple intermediate files, their random tmp filenames are not
285 # guaranteed to be generated in the same (alphabetical/numerical) order between model and test collection, so the
286 # HASH OIDs, although all of them accounted for, appear in a different order. So we have to remove the Hash OIDs.
287 #$test_text =~ s@\[http://.*/tmp/.*(\..{3,4})\]@tmp/random$1@mg; # HASH OIDs can appear in different order
288 #$model_text =~ s@\[http://.*/tmp/.*(\..{3,4})\]@tmp/random$1@mg;
[28067]289
[28071]290 $test_text =~ s@\[http://[^\n]*?/tmp/.*?(\..{3,4})\]\n<section>([^\n]*?)\n@[tmp/random$1\n<section>$2]\n@sg;
291 $model_text =~ s@\[http://[^\n]*?/tmp/.*?(\..{3,4})\]\n<section>([^\n]*?)\n@[tmp/random$1\n<section>$2]\n@sg;
292
293 # need to re- sort the keys, now that the absolute paths to tmp locations has been removed
294 # so that we get the tmp files in the same order in both model and test collections
295
[28661]296 $model_text = text_to_db_to_text($model_text, "model.gdb");
297 $test_text = text_to_db_to_text($test_text, "test.gdb");
[27766]298 }
[27604]299
[29495]300 # now can go back to using $model_text and $test_text
301
302 if($debugging) {
303 print_string_to_file($test_text, $savepath.$dbname."_test.out");
304 print_string_to_file($model_text, $savepath.$dbname."_model.out");
305 }
[34955]306
[21711]307 my $report_type = "OldStyle"; # Can not change this type.
308 my $diff_gdb = diff \$model_text, \$test_text, { STYLE => $report_type };
309
[27604]310 # leaving the ignore regex as it used to be in the following, in case it helps with single line comparisons
[27725]311 $diff_gdb = &diffutil::GenerateOutput($diff_gdb,"^<(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ImageSize)>.*");
[21711]312
313 if($diff_gdb eq "")
314 {
315 return "";
316 }
317 else
318 {
319 return "Difference Report: Differences found in the Database file: \n$diff_gdb";
320 }
321 # Call diff?
322}
323
[36807]324# Unused, but may come in handy when debugging again: regex substitution helper function
325sub sort_field_value {
326 my($fieldname, $fieldvalue, $displayStr, $debugging) = @_;
327
328 print STDERR "\n$displayStr BEFORE sort: <$fieldname>$fieldvalue\n" if($debugging);
329
330 $fieldvalue =~ s@(\r|\n|\\n)*$@@; # get rid of trailing newlines/carriage returns
331 my @values_list = split(';', $fieldvalue);
332 @values_list = sort @values_list;
333 $fieldvalue = "<$fieldname>".join(';', @values_list). "\n";
334
335 print STDERR "$displayStr AFTER sort: $fieldvalue\n" if($debugging);
336
337 return $fieldvalue;
338}
339
340
[27695]341# returns true if the contents are windows AND it matters for the diffing on the db that it's windows
342# For col.gdb it does not seem to matter so far, if it is generated on a windows machine and to be compared to a linux-generated col.gdb
343sub isDBWindowsSensitive
344{
345 my ($dbtailname, $db_contents) = @_; # db filename without suffix
346
[28019]347# return ($db_contents =~ m/\\/) ? 1 : 0; # windows slashes detected. Better test would be: [Something\something] OR <tag>something\something
348
349 if($dbtailname =~ m/^archiveinf-doc/) {
350 return ($db_contents =~ m@<src-file>[a-zA-Z]:\\@) ? 1 : 0; # <src-file>C:\path
351 }
352 elsif($dbtailname =~ m/^archiveinf-src/) { # <src-file>C:\path
353 return ($db_contents =~ m@\[[a-zA-Z]:\\@) ? 1 : 0; # [C:\path]
354 }
355 else { # index/col.gdb file
356 if ($db_contents =~ m@<URL>http://[a-zA-Z]:/@) { # <URL>http://C:/path
357 return 1;
358 }
359 elsif ($db_contents =~ m@^(<URL>http://[a-zA-Z]:/)|(<null_file>[^\\]*\\)@m) { # <URL>http://C:/path OR <null_file>CMSwp-all.00000001\\00000035.nul
360 return 1;
361 }
[28086]362 elsif ($db_contents =~ m@^(<ex.File.Directory>[a-zA-Z]:\\\\)@m) { # <ex.File.Directory>C:\\path\\path for OAI collection
363 return 1;
364 }
[28019]365 return 0;
366 }
[27695]367}
368
[21711]3691;
Note: See TracBrowser for help on using the repository browser.