source: other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm@ 28172

Last change on this file since 28172 was 28172, checked in by ak19, 11 years ago

task.pl now takes a model_os param as argument to the run_test command and passes in the test_OS and model_OS variables to diffcol.pl. These specify on what Operating System the model collections were built and what the current OS is, on which the test collections will be built when the tests are run and diffed. This will provide a more foolproof way for diffcol to work out on what OS the model collection was generated and what the current OS is (or what OS task.pl chooses to specify as the current OS for test collections).

File size: 13.1 KB
Line 
1package gdbdiff;
2
3BEGIN {
4 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
5 die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
6 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
7 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
8}
9
10use util;
11use diffutil;
12use Text::Diff;
13use Cwd;
14
15if ($^O =~ m/mswin/i) {
16 require Win32; # for working out Windows Long Filenames from Win 8.3 short filenames
17}
18
19sub readin_gdb
20{
21 my ($cmd) = @_;
22
23 open(PIN,"$cmd|")
24 || die "Unable to open pipe to $cmd: $!\n";
25
26 my $text_content = "";
27
28 while (defined (my $line = <PIN>)) {
29 $text_content .= $line;
30 }
31
32 close(PIN);
33 return $text_content;
34}
35
36# for debugging. Prints txt contents of db to file
37sub print_string_to_file
38{
39 my ($text, $outfile) = @_;
40
41 open(FOUT, ">$outfile") or die "ERROR failed to write to $outfile: $!\n";
42 print FOUT $text;
43 close(FOUT);
44}
45
46sub test_gdb
47{
48 my ($full_modeldb, $full_testdb, $strColName, $test_os, $model_os) = @_;
49
50 # print "Now is testing database\n";
51 my ($dbname, $dirname, $suffix)= &File::Basename::fileparse($full_testdb, "\\.[^\\.]+\$");
52
53 # need to sort text output of both test and model col database files, to normalise them for the comparison
54 # the -sort option to db2txt was added specifically to support diffcol
55 my $model_cmd = "db2txt -sort $full_modeldb 2>&1";
56 my $test_cmd = "db2txt -sort $full_testdb 2>&1";
57
58 my $model_text = readin_gdb($model_cmd);
59 my $test_text = readin_gdb($test_cmd);
60
61# my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory
62# print_string_to_file($test_text, $savepath.$dbname."_test.out1");
63# print_string_to_file($model_text, $savepath.$dbname."_model.out1");
64
65 # filter out the fields that can be ignored in the two database files
66 # The total_numbytes field can vary depending on how many backslashes exist in the urls in the main body text, as each
67 # of these windows slashes get escaped with another backslash, and the resulting string is used as key into rel link db
68 my $ignore_line_re = "\n<(FileSize|lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|total_numbytes|ex.Composite.LightValue)>([^\n])*";
69 $model_text =~ s/$ignore_line_re//g;
70 $test_text =~ s/$ignore_line_re//g;
71
72 # tmp dirs have subdirs with random numbers in name, remove subdir
73 # these tmpdirs are located inside the collection directory
74 $model_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
75 $test_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
76
77 # if the OS doesn't match and one of them is windows, extra work needs to be done to bring the db files
78 # in test and model collection to an even base for comparison
79
80 my $testIsWin = ($test_os ne "compute") ? ($test_os eq "windows") : &isDBWindowsSensitive($dbname, $test_text);
81 my $modelIsWin = ($model_os ne "compute") ? ($model_os eq "windows") : &isDBWindowsSensitive($dbname, $model_text);
82
83 if($testIsWin == $modelIsWin) {
84 # both linux or both windows, do the basic test we did on linux machines:
85 # ignore absolute path prefixes in modelcol and testcol (necessary for archiveinf-doc and -src.gdb files)
86
87 # Remember the original model col on SVN could have been built anywhere,
88 # and in the gdb files, absolute paths are stored to the collection location.
89 # Crop these paths to the collect/<colname> point.
90
91 # Entries are of the form [Entry] or <Entry>. In order to do a sensible diff,
92 # need to remove the prefix to the collect/colname folder in any (absolute) path that occurs in Entry
93 # E.g. [/full/path/collect/colname/import/file.ext] should become [collect/colname/import/file.ext]
94 # Better regex is of the form /BEGIN((?:(?!BEGIN).)*)END/, see http://docstore.mik.ua/orelly/perl/cookbook/ch06_16.htm
95
96 $model_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
97 $test_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
98 #$model_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;
99 #$test_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;
100 }
101
102 else { # one of the collections was built on windows
103 # handling slashes and other differences between a model coll built on one OS (e.g. linux)
104 # and a test collection built and diffed on another OS (windows)
105
106 my ($win_text, $lin_text); # references
107 if($testIsWin) {
108 $win_text = \$test_text;
109 $lin_text = \$model_text;
110 } else {
111 $win_text = \$model_text;
112 $lin_text = \$test_text;
113 }
114
115 if($dbname =~ m/archiveinf-doc/) {
116 my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
117
118 # convert short filenames to long perl:
119 # http://www.mombu.com/programming/perl/t-convert-dos-83-filenames-to-win32-long-filenames-using-perl-525448.html
120 for my $line (split /^/, $$win_text) { # split the string into newlines
121
122 # assoc-file and meta-file contain filepaths, ensure these are long windows file paths now (will later convert to linux slashes)
123 if($line =~ m@^<(assoc-file|meta-file)>(.*)(\s+)@s) {
124 $line = $2; # may be a short file name
125 # perhaps test here if it is a shortfilename? should match /CAPS....~number(.ext)/
126
127 $line = "<$1>".&Win32::GetLongPathName($line)."$3"; # make it a long file name and prefix assoc-file/meta-file tagname to it again
128 }
129 $tmp .= $line;
130 }
131 $$win_text = $tmp;
132 }
133
134
135 # index gdb file
136 if($dbname =~ m/$strColName/) {
137 my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
138 for my $line (split /^/, $$win_text) { # split the string into newlines
139
140 # In the following regex, add any .gdb fieldnames that represent a path and so would contain double backslashes
141 # on Windows (to escape the single backlash of win filepaths). They will be turned into single-backslashes here,
142 # and converted into single forward slashes futher below when the txt version of the win gdb file is normalised
143 # to compare it with the linux version.
144 # E.g. On windows, the Word-PDF collection(s) contains double backslashes in the ex.File.Directory field
145 # the MARC-Exploded collection contains double backslashes in the null_file entry field of the .gdb file
146 if($line =~ m@^<(ex.File.Directory|null_file)>(.*)@s) {
147 my ($fieldname, $escaped_path) = ($1, $2);
148 $escaped_path =~ s@\\\\@/@g; #(my $escaped_path = $2) =~ s@\\\\@\\@g;
149 $line = "<$fieldname>$escaped_path";
150 }
151 elsif($line =~ m@^<Title>(.*)@s) {
152# print STDERR "***** TITLE: |$1|\n";
153
154 # word-pdf collection: Title of ps files contain new lines at end when
155 # GreenstoneXMLPlugin::xml_end_tag() writes the Title back out after utf8 decode
156 # if($metadata_name eq "Title") { $metadata_value =~ s/[\n\r]*$//; }
157
158 (my $title = $1) =~ s@(\r|\n|\\n)*$@@; # get rid of trailing newlines/carriage returns
159 $line = "<Title>$title\n"; # add single newline
160 }
161 $tmp .= $line;
162 }
163 $$win_text = $tmp;
164
165 # slashes in windows metadata text need to be turned into linux style slashes.
166 # index\col.gdb uses double backslashes, and single for \n,\t
167 #$$win_text =~ s@\\\\@/@g;
168 }
169 else { # archiveinf gdb file
170
171 # slashes in windows metadata text need to be turned into linux style slashes.
172 # In the two archivesinf gdb files, filepaths may use single backslashes
173 $$win_text =~ s@\\@/@g; #$$win_text =~ s@\\([^n|r|\|"])@/$1@g; # filepath something\rtf remains something\rtf
174 }
175
176 # cut down absolute paths to files to just collect/colname/.../file, same as before
177 $$lin_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg; # $$lin_text =~ s@^([^\\\/]*(//)?).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
178 $$win_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
179
180 # for the windows text, need to further get rid of the driveletter after [ or <meta>
181 $$win_text =~ s@^(\[|<[^>]*>)[a-zA-Z]:collect@$1collect@mg;
182
183 } # end of equalising differences between a windows collection's db file and linux coll's db file
184
185 # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox
186 # These tmpdirs are located inside the toplevel *greenstone* directory
187 (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;
188 $gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'};
189 my $tmpfile_regex = "<URL>http://$gsdlhome_re/tmp/([^\.]*?)(\..{3,4})"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long
190 if($test_text =~ m@$tmpfile_regex@g) {
191 # found a match, replace the tmp file name with "random", keeping the original file extension
192 # in <OrigSource|URL|UTF8URL|gsdlconvertedfilename>
193
194 # This code is slightly different from doc.xml because each document has its own doc.xml, so this needs to be done
195 # only once for doc.xml, but multiple times in index/col.gdb since it contains the random filenames of all docs in the col
196 #my ($old_tmp_filename, $ext) = ($1, $2);
197
198 my $new_tmp_filename = "random";
199
200
201 $tmpfile_regex = "(<(URL|UTF8URL|gsdlconvertedfilename|OrigSource)>(http://)?)($gsdlhome_re)?(/tmp/)?.*?(\..{3,4})";
202 if($5) {
203 $test_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
204 } else { # OrigSource contains only the filename
205 $test_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
206 }
207
208 # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename
209 $tmpfile_regex = "(<(URL|UTF8URL|gsdlconvertedfilename|OrigSource)>(http://)?)(.*)?(/tmp/)?.*?(\..{3,4})";
210 if($5) {
211 $model_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
212 } else { # OrigSource contains only the filename
213 $model_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
214 }
215
216 # index/col.gdb also has entries for the random tmp file names in the form: [http://research/ak19/GS2bin_5July2013/tmp/F639.html]
217 # need to equalise these also. Sadly, when there are multiple intermediate files, their random tmp filenames are not
218 # guaranteed to be generated in the same (alphabetical/numerical) order between model and test collection, so the
219 # HASH OIDs, although all of them accounted for, appear in a different order. So we have to remove the Hash OIDs.
220 #$test_text =~ s@\[http://.*/tmp/.*(\..{3,4})\]@tmp/random$1@mg; # HASH OIDs can appear in different order
221 #$model_text =~ s@\[http://.*/tmp/.*(\..{3,4})\]@tmp/random$1@mg;
222
223 $test_text =~ s@\[http://[^\n]*?/tmp/.*?(\..{3,4})\]\n<section>([^\n]*?)\n@[tmp/random$1\n<section>$2]\n@sg;
224 $model_text =~ s@\[http://[^\n]*?/tmp/.*?(\..{3,4})\]\n<section>([^\n]*?)\n@[tmp/random$1\n<section>$2]\n@sg;
225
226 # need to re- sort the keys, now that the absolute paths to tmp locations has been removed
227 # so that we get the tmp files in the same order in both model and test collections
228
229 # http://stackoverflow.com/questions/1909262/how-can-i-pipe-input-into-a-java-command-from-perl
230 open PIPE, "| txt2db model.gdb";
231 print PIPE "$model_text";
232 close(PIPE);
233 open PIPE, "| txt2db test.gdb";
234 print PIPE "$test_text";
235 close(PIPE);
236
237 $model_cmd = " db2txt -sort model.gdb 2>&1";
238 $test_cmd = "db2txt -sort test.gdb 2>&1";
239 $model_text = readin_gdb($model_cmd);
240 $test_text = readin_gdb($test_cmd);
241 }
242
243 # now can go back to using $model_text and $test_text
244# print_string_to_file($test_text, $savepath.$dbname."_test.out");
245# print_string_to_file($model_text, $savepath.$dbname."_model.out");
246
247 my $report_type = "OldStyle"; # Can not change this type.
248 my $diff_gdb = diff \$model_text, \$test_text, { STYLE => $report_type };
249
250 # leaving the ignore regex as it used to be in the following, in case it helps with single line comparisons
251 $diff_gdb = &diffutil::GenerateOutput($diff_gdb,"^<(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ImageSize)>.*");
252
253 if($diff_gdb eq "")
254 {
255 return "";
256 }
257 else
258 {
259 return "Difference Report: Differences found in the Database file: \n$diff_gdb";
260 }
261 # Call diff?
262}
263
264# returns true if the contents are windows AND it matters for the diffing on the db that it's windows
265# For col.gdb it does not seem to matter so far, if it is generated on a windows machine and to be compared to a linux-generated col.gdb
266sub isDBWindowsSensitive
267{
268 my ($dbtailname, $db_contents) = @_; # db filename without suffix
269
270# return ($db_contents =~ m/\\/) ? 1 : 0; # windows slashes detected. Better test would be: [Something\something] OR <tag>something\something
271
272 if($dbtailname =~ m/^archiveinf-doc/) {
273 return ($db_contents =~ m@<src-file>[a-zA-Z]:\\@) ? 1 : 0; # <src-file>C:\path
274 }
275 elsif($dbtailname =~ m/^archiveinf-src/) { # <src-file>C:\path
276 return ($db_contents =~ m@\[[a-zA-Z]:\\@) ? 1 : 0; # [C:\path]
277 }
278 else { # index/col.gdb file
279 if ($db_contents =~ m@<URL>http://[a-zA-Z]:/@) { # <URL>http://C:/path
280 return 1;
281 }
282 elsif ($db_contents =~ m@^(<URL>http://[a-zA-Z]:/)|(<null_file>[^\\]*\\)@m) { # <URL>http://C:/path OR <null_file>CMSwp-all.00000001\\00000035.nul
283 return 1;
284 }
285 elsif ($db_contents =~ m@^(<ex.File.Directory>[a-zA-Z]:\\\\)@m) { # <ex.File.Directory>C:\\path\\path for OAI collection
286 return 1;
287 }
288 return 0;
289 }
290}
291
2921;
Note: See TracBrowser for help on using the repository browser.