source: other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm@ 28019

Last change on this file since 28019 was 28019, checked in by ak19, 11 years ago

Better detection of whether a gdb file is a windows-generated one or not. The weakness of the earlier test was only discovered with the Multimedia tutorial collection.

File size: 12.2 KB
Line 
1package gdbdiff;
2
3BEGIN {
4 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
5 die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
6 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
7 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
8}
9
10use util;
11use diffutil;
12use Text::Diff;
13use Cwd;
14
15if ($^O =~ m/mswin/i) {
16 require Win32; # for working out Windows Long Filenames from Win 8.3 short filenames
17}
18
# Runs a command through a pipe and returns everything it printed.
#
# Parameters:
#   $cmd - the complete command line to run (test_gdb passes "db2txt -sort <file> 2>&1",
#          so the command's stderr ends up in the returned text as well).
# Returns: the command's output as a single string ("" when it printed nothing).
# Dies when the pipe to the command cannot be opened.
sub readin_gdb
{
    my ($cmd) = @_;

    # A lexical handle keeps this sub from clobbering a package-global PIN handle,
    # and the explicit '-|' mode means the open mode is never derived from $cmd itself.
    open(my $pipe, '-|', $cmd)
        or die "Unable to open pipe to $cmd: $!\n";

    # Slurp the whole output in one read instead of appending line by line.
    my $text_content = do { local $/; <$pipe> };
    $text_content = "" unless defined $text_content;

    # The exit status of the command is deliberately not checked (as before):
    # any error text the command printed is already part of $text_content.
    close($pipe);
    return $text_content;
}
35
# For debugging: writes a string (e.g. the txt contents of a db) to a file.
#
# Parameters:
#   $text    - the text to write
#   $outfile - path of the file to create or overwrite
# Dies when the file cannot be opened, or when the data cannot be flushed on close.
sub print_string_to_file
{
    my ($text, $outfile) = @_;

    # Three-arg open: the mode is fixed to '>' and $outfile is used as a plain filename,
    # whatever characters it starts or ends with.
    open(my $fout, '>', $outfile) or die "ERROR failed to write to $outfile: $!\n";
    print {$fout} $text;

    # Buffered write errors (e.g. disk full) only surface at close, so check it.
    close($fout) or die "ERROR failed to write to $outfile: $!\n";
}
45
# Compares the text dump of a model collection's database file with that of a test collection
# and reports the differences.
#
# Both files are dumped with "db2txt -sort", fields listed in $ignore_line_re are stripped,
# tmp subdirectories and absolute path prefixes are cropped, Windows/Linux differences are
# evened out when only one side looks Windows-generated, and the two texts are then diffed
# with Text::Diff and post-processed by diffutil::GenerateOutput.
#
# Parameters:
#   $full_modeldb - path to the model collection's database file
#   $full_testdb  - path to the test collection's database file; its basename (suffix removed)
#                   decides which db-specific handling is applied
#   $strColName   - the collection name; it is interpolated as-is into several regexes below
# Returns: "" when the processed diff is empty, otherwise a string starting with
#          "Difference Report: Differences found in the Database file: ".
46sub test_gdb
47{
48 my ($full_modeldb, $full_testdb,$strColName) = @_;
49
50 # print "Now is testing database\n";
 # NOTE(review): File::Basename is not loaded anywhere in this file; presumably util or
 # diffutil loads it - confirm.
51 my ($dbname, $dirname, $suffix)= &File::Basename::fileparse($full_testdb, "\\.[^\\.]+\$");
52
53 # need to sort text output of both test and model col database files, to normalise them for the comparison
54 # the -sort option to db2txt was added specifically to support diffcol
 # The db paths are placed into the shell command unquoted, and stderr is merged into the output.
55 my $model_cmd = "db2txt -sort $full_modeldb 2>&1";
56 my $test_cmd = "db2txt -sort $full_testdb 2>&1";
57
58 my $model_text = readin_gdb($model_cmd);
59 my $test_text = readin_gdb($test_cmd);
60
61# my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory
62# print_string_to_file($test_text, $savepath.$dbname."_test.out1");
63# print_string_to_file($model_text, $savepath.$dbname."_model.out1");
64
65 # filter out the fields that can be ignored in the two database files
66 # The total_numbytes field can vary depending on how many backslashes exist in the urls in the main body text, as each
67 # of these windows slashes get escaped with another backslash, and the resulting string is used as key into rel link db
 # The pattern starts with a newline, so a matching field on the very first line of the text is not removed.
68 my $ignore_line_re = "\n<(FileSize|lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|total_numbytes)>([^\n])*";
69 $model_text =~ s/$ignore_line_re//g;
70 $test_text =~ s/$ignore_line_re//g;
71
72 # tmp dirs have subdirs with random numbers in name, remove subdir
73 # these tmpdirs are located inside the collection directory
74 $model_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
75 $test_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
76
77 # if the OS doesn't match and one of them is windows, extra work needs to be done to bring the db files
78 # in test and model collection to an even base for comparison
79
 # Both flags are 0 or 1 (see isDBWindowsSensitive), so the numeric == below is a plain equality test.
80 my $testIsWin = &isDBWindowsSensitive($dbname, $test_text);
81 my $modelIsWin = &isDBWindowsSensitive($dbname, $model_text);
82
83 if($testIsWin == $modelIsWin) { # both linux or both windows, do the basic test we did on linux machines:
84 # ignore absolute path prefixes in modelcol and testcol (necessary for archiveinf-doc and -src.gdb files)
85
86 # Remember the original model col on SVN could have been built anywhere,
87 # and in the gdb files, absolute paths are stored to the collection location.
88 # Crop these paths to the collect/<colname> point.
89
90 # Entries are of the form [Entry] or <Entry>. In order to do a sensible diff,
91 # need to remove the prefix to the collect/colname folder in any (absolute) path that occurs in Entry
92 # E.g. [/full/path/collect/colname/import/file.ext] should become [collect/colname/import/file.ext]
93 # Better regex is of the form /BEGIN((?:(?!BEGIN).)*)END/, see http://docstore.mik.ua/orelly/perl/cookbook/ch06_16.htm
94
 # NOTE(review): $strColName is interpolated without \Q..\E, so regex metacharacters in a
 # collection name would be treated as pattern syntax - confirm names never contain any.
95 $model_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
96 $test_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
97 #$model_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;
98 #$test_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;
99 }
100
101 else { # one of the collections was built on windows
102 # handling slashes and other differences between a model coll built on one OS (e.g. linux)
103 # and a test collection built and diffed on another OS (windows)
104
 # References are used so the edits below change $test_text / $model_text in place.
105 my ($win_text, $lin_text); # references
106 if($testIsWin) {
107 $win_text = \$test_text;
108 $lin_text = \$model_text;
109 } else {
110 $win_text = \$model_text;
111 $lin_text = \$test_text;
112 }
113
114 if($dbname =~ m/archiveinf-doc/) {
115 my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
116
117 # convert short filenames to long perl:
118 # http://www.mombu.com/programming/perl/t-convert-dos-83-filenames-to-win32-long-filenames-using-perl-525448.html
119 for my $line (split /^/, $$win_text) { # split the string into newlines
120
121 # assoc-file and meta-file contain filepaths, ensure these are long windows file paths now (will later convert to linux slashes)
122 if($line =~ m@^<(assoc-file|meta-file)>(.*)(\s+)@s) {
123 $line = $2; # may be a short file name
124 # perhaps test here if it is a shortfilename? should match /CAPS....~number(.ext)/
125
 # $1 and $3 still hold the captures of the match above (no other regex has run since).
 # NOTE(review): Win32 is only require'd at the top of this file when $^O matches mswin, so this
 # call presumably fails if a Windows-built db is diffed on a non-Windows machine - confirm.
126 $line = "<$1>".&Win32::GetLongPathName($line)."$3"; # make it a long file name and prefix assoc-file/meta-file tagname to it again
127 }
128 $tmp .= $line;
129 }
130 $$win_text = $tmp;
131 }
132
133
134 # index gdb file
 # The index db is recognised by its basename containing the collection name
 # ($strColName is again used as an unescaped pattern).
135 if($dbname =~ m/$strColName/) {
136 my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
137 for my $line (split /^/, $$win_text) { # split the string into newlines
138
139 # In the following regex, add any .gdb fieldnames that represent a path and so would contain double backslashes
140 # on Windows (to escape the single backlash of win filepaths). They will be turned into single-backslashes here,
141 # and converted into single forward slashes futher below when the txt version of the win gdb file is normalised
142 # to compare it with the linux version.
143 # E.g. On windows, the Word-PDF collection(s) contains double backslashes in the ex.File.Directory field
144 # the MARC-Exploded collection contains double backslashes in the null_file entry field of the .gdb file
 # The live substitution below turns each double backslash directly into one forward slash;
 # the two-step description above matches the commented-out variant at the end of that line.
145 if($line =~ m@^<(ex.File.Directory|null_file)>(.*)@s) {
146 my ($fieldname, $escaped_path) = ($1, $2);
147 $escaped_path =~ s@\\\\@/@g; #(my $escaped_path = $2) =~ s@\\\\@\\@g;
148 $line = "<$fieldname>$escaped_path";
149 }
150 elsif($line =~ m@^<Title>(.*)@s) {
151# print STDERR "***** TITLE: |$1|\n";
152
153 # word-pdf collection: Title of ps files contain new lines at end when
154 # GreenstoneXMLPlugin::xml_end_tag() writes the Title back out after utf8 decode
155 # if($metadata_name eq "Title") { $metadata_value =~ s/[\n\r]*$//; }
156
157 (my $title = $1) =~ s@(\r|\n|\\n)*$@@; # get rid of trailing newlines/carriage returns
158 $line = "<Title>$title\n"; # add single newline
159 }
160 $tmp .= $line;
161 }
162 $$win_text = $tmp;
163
164 # slashes in windows metadata text need to be turned into linux style slashes.
165 # index\col.gdb uses double backslashes, and single for \n,\t
166 #$$win_text =~ s@\\\\@/@g;
167 }
168 else { # archiveinf gdb file
169
170 # slashes in windows metadata text need to be turned into linux style slashes.
171 # In the two archivesinf gdb files, filepaths may use single backslashes
 # Every backslash in the Windows text becomes a forward slash here, not only those inside paths.
172 $$win_text =~ s@\\@/@g; #$$win_text =~ s@\\([^n|r|\|"])@/$1@g; # filepath something\rtf remains something\rtf
173 }
174
175 # cut down absolute paths to files to just collect/colname/.../file, same as before
176 $$lin_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg; # $$lin_text =~ s@^([^\\\/]*(//)?).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
177 $$win_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
178
179 # for the windows text, need to further get rid of the driveletter after [ or <meta>
180 $$win_text =~ s@^(\[|<[^>]*>)[a-zA-Z]:collect@$1collect@mg;
181
182 } # end of equalising differences between a windows collection's db file and linux coll's db file
183
184 # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox
185 # These tmpdirs are located inside the toplevel *greenstone* directory
186 (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;
 # NOTE(review): $$ENV{'GSDLHOME'} dereferences a scalar $ENV (i.e. $ENV->{'GSDLHOME'}), not the
 # %ENV hash, so this condition looks always false and $gsdlhome_re would always become ".*".
 # If $ENV{'GSDLHOME'} was intended, the value would also need regex-escaping, and the resulting
 # "http://<gsdlhome>/tmp/" form should be checked against real <URL> values - confirm before changing.
187 $gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'};
 # NOTE(review): inside a double-quoted string "\." is just ".", so "(\..{3,4})" reaches the regex
 # engine as "(..{3,4})" (any 4-5 characters), not a literal dot plus extension. The same applies to
 # the two double-quoted patterns further down.
188 my $tmpfile_regex = "<URL>http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long
 # Only $test_text is probed; $model_text is rewritten too whenever the test text matches.
189 if($test_text =~ m@$tmpfile_regex@g) {
190 # found a match, replace the tmp file name with "random", keeping the original file extension
191 # in <OrigSource|URL|UTF8URL|gsdlconvertedfilename>
192
193 # This code is slightly different from doc.xml because each document has its own doc.xml, so this needs to be done
194 # only once for doc.xml, but multiple times in index/col.gdb since it contains the random filenames of all docs in the col
195 #my ($old_tmp_filename, $ext) = ($1, $2);
196
197 my $new_tmp_filename = "random";
198
199
200 $tmpfile_regex = "(<(URL|UTF8URL|gsdlconvertedfilename|OrigSource)>(http://)?)($gsdlhome_re)?(/tmp/)?.*(\..{3,4})";
 # NOTE(review): $5 is read before the substitution runs, so it appears to refer to the match in
 # the enclosing if(), whose pattern has only two groups; the first branch therefore looks
 # unreachable and the else-branch replacement is the one applied - confirm.
201 if($5) {
202 $test_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
203 } else { # OrigSource contains only the filename
204 $test_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
205 }
206
207 # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename
208 $tmpfile_regex = "(<(URL|UTF8URL|gsdlconvertedfilename|OrigSource)>(http://)?)(.*)?(/tmp/)?.*(\..{3,4})";
 # NOTE(review): same concern about $5 as for the test text above.
209 if($5) {
210 $model_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
211 } else { # OrigSource contains only the filename
212 $model_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
213 }
214
215 # index/col.gdb also has entries for the random tmp file names in the form: [http://research/ak19/GS2bin_5July2013/tmp/F639.html]
216 # need to equalise these also. Sadly, when there are multiple intermediate files, their random tmp filenames are not
217 # guaranteed to be generated in the same (alphabetical/numerical) order between model and test collection, so the
218 # HASH OIDs, although all of them accounted for, appear in a different order. So we have to remove the Hash OIDs.
219 #$test_text =~ s@\[http://.*/tmp/.*(\..{3,4})\]@tmp/random$1@mg; # HASH OIDs can appear in different order
220 #$model_text =~ s@\[http://.*/tmp/.*(\..{3,4})\]@tmp/random$1@mg;
 # NOTE(review): with /s the greedy ".*" can run across newlines, so one match may extend from the
 # first "[http://" to the last matching "/tmp/...]" entry and replace everything in between -
 # confirm this is intended.
221 $test_text =~ s@\[http://.*/tmp/.*(\..{3,4})\]\n<section>[^\n]*\n@tmp/random$1\n<section>RandomHash\n@sg;
222 $model_text =~ s@\[http://.*/tmp/.*(\..{3,4})\]\n<section>[^\n]*\n@tmp/random$1\n<section>RandomHash\n@sg;
223 }
224
225 # now can go back to using $model_text and $test_text
226# print_string_to_file($test_text, $savepath.$dbname."_test.out");
227# print_string_to_file($model_text, $savepath.$dbname."_model.out");
228
229 my $report_type = "OldStyle"; # Can not change this type.
 # Text::Diff's diff() is given references to the two in-memory strings.
230 my $diff_gdb = diff \$model_text, \$test_text, { STYLE => $report_type };
231
232 # leaving the ignore regex as it used to be in the following, in case it helps with single line comparisons
233 $diff_gdb = &diffutil::GenerateOutput($diff_gdb,"^<(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ImageSize)>.*");
234
 # An empty processed diff means the databases are considered identical.
235 if($diff_gdb eq "")
236 {
237 return "";
238 }
239 else
240 {
241 return "Difference Report: Differences found in the Database file: \n$diff_gdb";
242 }
243 # Call diff?
244}
245
# Returns 1 if the db contents look Windows-generated AND that matters when diffing this db,
# otherwise 0.
#
# Parameters:
#   $dbtailname  - db filename without suffix; selects which marker is looked for
#   $db_contents - the text dump of the database
#
# Markers that count as "Windows":
#   archiveinf-doc*  : <src-file>C:\path
#   archiveinf-src*  : [C:\path]
#   anything else (index/col.gdb) : <URL>http://C:/path, or a backslash inside a
#                      <null_file> value, e.g. <null_file>CMSwp-all.00000001\\00000035.nul
sub isDBWindowsSensitive
{
    my ($dbtailname, $db_contents) = @_; # db filename without suffix

    if ($dbtailname =~ m/^archiveinf-doc/) {
        return ($db_contents =~ m@<src-file>[a-zA-Z]:\\@) ? 1 : 0; # <src-file>C:\path
    }

    if ($dbtailname =~ m/^archiveinf-src/) {
        return ($db_contents =~ m@\[[a-zA-Z]:\\@) ? 1 : 0; # [C:\path]
    }

    # index/col.gdb file
    return 1 if $db_contents =~ m@<URL>http://[a-zA-Z]:/@; # <URL>http://C:/path

    # The backslash must be on the <null_file> line itself. A negated class without \n would
    # run past the end of the line, so any backslash further down the dump (e.g. an escaped
    # \n inside some metadata value) would wrongly flag a non-Windows db as Windows.
    return 1 if $db_contents =~ m@<null_file>[^\\\n]*\\@;

    return 0;
}
270
2711;
Note: See TracBrowser for help on using the repository browser.