Context Navigation

source: other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm@ 29495

Last change on this file since 29495 was 29495, checked in by ak19, 9 years ago
Related to previous commit 29494, which was accidentally committed under sjs49 and was committed prematurely. That commit and this one add a debug flag to the run_test action of the diffcol task; when the flag is set, the intermediate debug files are stored in the top level diffcol folder for inspection.
File size: 14.2 KB

Line
1	package gdbdiff;
2
3	BEGIN {
4	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
5	die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
6	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
7	unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
8	}
9
10	use util;
11	use diffutil;
12	use Text::Diff;
13	use Cwd;
14
15	if ($^O =~ m/mswin/i) {
16	require Win32; # for working out Windows Long Filenames from Win 8.3 short filenames
17	}
18
# Runs a command and returns everything it prints on its standard output.
#
# Parameters:
#   $cmd - the command line to run. It is passed to open() as a single
#          string, exactly as the former "$cmd|" form was, so shell syntax
#          inside it (e.g. a trailing "2>&1") keeps working.
# Returns:
#   the command's complete output as one string ("" if it printed nothing).
# Dies:
#   if the pipe to the command cannot be opened.
sub readin_gdb
{
    my ($cmd) = @_;

    # A lexical handle replaces the package-global bareword PIN, so nested or
    # repeated calls can never clobber each other's pipe.
    open(my $pipe_in, '-|', $cmd)
        or die "Unable to open pipe to $cmd: $!\n";

    my $text_content = "";
    while (defined(my $line = <$pipe_in>)) {
        $text_content .= $line;
    }

    # The close status (i.e. the command's exit code) is intentionally not
    # checked, as before: whatever text was produced is handed back.
    close($pipe_in);
    return $text_content;
}
35
# Dumps a database file to text by running "db2txt -sort" on it.
#
# The output of both the test and the model collection's database has to be
# sorted so the two can be compared; the -sort option of db2txt was added
# specifically to support diffcol.
#
# Parameters:
#   $db_file - path of the database file to dump.
# Returns:
#   whatever readin_gdb() captured from the db2txt command (stderr is
#   redirected into stdout by the command line).
sub read_db
{
    my ($db_file) = @_;

    # .jdb files are not supported yet: a notice is printed and the file is
    # still passed to db2txt below.
    if ($db_file =~ m/\.jdb$/) {
        print STDERR "NOT YET IMPLEMENTED\n";
        #$db_cmd = "jdb2txt -sort $db_file 2>&1";
    }

    return readin_gdb("db2txt -sort $db_file 2>&1");
}
50
# Round-trips a text dump through a database file: the text is piped into
# "txt2db <filename>" and the resulting file is dumped again with read_db().
#
# Parameters:
#   $db_text     - text form of the database contents, written to txt2db's
#                  standard input.
#   $db_filename - name of the database file txt2db is asked to write and
#                  read_db() is asked to read back.
# Returns:
#   the text returned by read_db() for $db_filename.
# Dies:
#   if the pipe to txt2db cannot be opened.
sub text_to_db_to_text
{
    my ($db_text, $db_filename) = @_;

    # http://stackoverflow.com/questions/1909262/how-can-i-pipe-input-into-a-java-command-from-perl
    # Lexical handle + explicit '|-' mode instead of the unchecked bareword
    # two-argument open, so a pipe that cannot be created is reported.
    open(my $txt2db_pipe, '|-', "txt2db $db_filename")
        or die "Unable to open pipe to txt2db $db_filename: $!\n";
    print $txt2db_pipe "$db_text";

    # For a piped handle close() is where a failing command becomes visible.
    # Only warn: the dump below is still attempted, as it was before.
    unless (close($txt2db_pipe)) {
        my $reason = $! ? "$!" : "exit status $?";
        print STDERR "WARNING: txt2db $db_filename did not finish cleanly: $reason\n";
    }

    return read_db($db_filename);
}
62
# For debugging: writes a string (e.g. the text dump of a db) to a file,
# replacing any previous contents of that file.
#
# Parameters:
#   $text    - the string to write.
#   $outfile - path of the file to create or overwrite.
# Returns:
#   1 on success.
# Dies:
#   if the file cannot be opened for writing, or if closing it fails
#   (buffered write errors only surface at close time).
sub print_string_to_file
{
    my ($text, $outfile) = @_;

    # Three-argument open: the file name is used literally, so surrounding
    # whitespace or leading mode characters in $outfile cannot change which
    # file is written or how it is opened.
    open(my $fout, '>', $outfile) or die "ERROR failed to write to $outfile: $!\n";
    print $fout $text;
    close($fout) or die "ERROR failed to write to $outfile: $!\n";

    return 1;
}
72
# test_gdb: compares the text dump of a test collection's database file with
# the text dump of the corresponding model collection's database file.
#
# Parameters (in order):
#   $full_modeldb, $full_testdb - database files, each handed to read_db()
#   $strColName   - collection name, interpolated into the path-cropping regexes
#                   and used to recognise the index db by its file name
#   $test_os, $model_os - "windows" marks that side as Windows; "compute" asks
#                   isDBWindowsSensitive() to decide from the db contents
#   $strTestCol, $strModelCol - collection paths; the one belonging to the
#                   Windows side is used when expanding DOS 8.3 file names
#   $debugging    - when true, the raw dumps (*.out1) and the normalised dumps
#                   (*.out) are written one directory above the current one
#
# Returns "" when no differences remain after normalisation, otherwise a
# "Difference Report: ..." string containing the filtered diff.
#
# Side effect: when tmp-file URLs are found in the test text, both texts are
# round-tripped through text_to_db_to_text() using the file names model.gdb
# and test.gdb.
73	sub test_gdb
74	{
75	my ($full_modeldb, $full_testdb, $strColName, $test_os, $model_os, $strTestCol, $strModelCol, $debugging) = @_;
76
77	# print "Now is testing database\n";
# NOTE(review): File::Basename is not loaded by any "use" line visible in this
# file; presumably one of the modules used above pulls it in - confirm.
78	my ($dbname, $dirname, $suffix)= &File::Basename::fileparse($full_testdb, "\\.[^\\.]+\$");
79
80	my $model_text = read_db($full_modeldb);
81	my $test_text = read_db($full_testdb);
82
83	my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory
84	if($debugging) {
85	print_string_to_file($test_text, $savepath.$dbname."_test.out1");
86	print_string_to_file($model_text, $savepath.$dbname."_model.out1");
87	}
88
89	# filter out the fields that can be ignored in the two database files
90	# The total_numbytes field can vary depending on how many backslashes exist in the urls in the main body text, as each
91	# of these windows slashes get escaped with another backslash, and the resulting string is used as key into rel link db
92	my $ignore_line_re = "\n<(FileSize\|lastmodified\|lastmodifieddate\|oailastmodified\|oailastmodifieddate\|ex.File.FileModifyDate\|ex.File.FilePermissions\|total_numbytes\|ex.Composite.LightValue)>([^\n])*";
93	$model_text =~ s/$ignore_line_re//g;
94	$test_text =~ s/$ignore_line_re//g;
95
96	# tmp dirs have subdirs with random numbers in name, remove subdir
97	# these tmpdirs are located inside the collection directory
98	$model_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
99	$test_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
100
101	# if the OS doesn't match and one of them is windows, extra work needs to be done to bring the db files
102	# in test and model collection to an even base for comparison
103
104	my $testIsWin = ($test_os ne "compute") ? ($test_os eq "windows") : &isDBWindowsSensitive($dbname, $test_text);
105	my $modelIsWin = ($model_os ne "compute") ? ($model_os eq "windows") : &isDBWindowsSensitive($dbname, $model_text);
106
107	if($testIsWin == $modelIsWin) {
108	# both linux or both windows, do the basic test we did on linux machines:
109	# ignore absolute path prefixes in modelcol and testcol (necessary for archiveinf-doc and -src.gdb files)
110
111	# Remember the original model col on SVN could have been built anywhere,
112	# and in the gdb files, absolute paths are stored to the collection location.
113	# Crop these paths to the collect/<colname> point.
114
115	# Entries are of the form [Entry] or <Entry>. In order to do a sensible diff,
116	# need to remove the prefix to the collect/colname folder in any (absolute) path that occurs in Entry
117	# E.g. [/full/path/collect/colname/import/file.ext] should become [collect/colname/import/file.ext]
118	# Better regex is of the form /BEGIN((?:(?!BEGIN).)*)END/, see http://docstore.mik.ua/orelly/perl/cookbook/ch06_16.htm
119
# NOTE(review): compared with the commented-out variants just below (which
# use "(.*)"), the active patterns here and in the other path-cropping and
# tmp-file substitutions of this function look as if some "*" / "?"
# quantifiers are missing - verify these lines against the repository copy.
120	$model_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg;
121	$test_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg;
122	#$model_text =~ s@^([^\\//]).(\\\|/)(collect(\\\|/)$strColName)(.*)$@$1$3$5@mg;
123	#$test_text =~ s@^([^\\//]).(\\\|/)(collect(\\\|/)$strColName)(.*)$@$1$3$5@mg;
124	}
125
126	else { # one of the collections was built on windows
127	# handling slashes and other differences between a model coll built on one OS (e.g. linux)
128	# and a test collection built and diffed on another OS (windows)
129
130	my ($win_text, $lin_text); # references
131	my $collection_path = $strTestCol; # full path to a windows collection
132
133	if($testIsWin) {
134	$collection_path = $strTestCol; # test collection path is windows
135	$win_text = \$test_text;
136	$lin_text = \$model_text;
137	} else {
138	$collection_path = $strModelCol; # model collection path is windows
139	$win_text = \$model_text;
140	$lin_text = \$test_text;
141	}
142
143	if($dbname =~ m/archiveinf-doc/) {
144
145	(my $collection_path_re = $collection_path) =~ s@\\@\\\\@g;
146
147	my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
148
149	# convert short filenames to long perl:
150	# http://www.mombu.com/programming/perl/t-convert-dos-83-filenames-to-win32-long-filenames-using-perl-525448.html
151	for my $line (split /^/, $$win_text) { # split the string into newlines
152
153	# assoc-file and meta-file contain filepaths, ensure these are long windows file paths now (will later convert to linux slashes)
154	if($line =~ m@^<(assoc-file\|meta-file\|src-file)>(.*)(\s+)@s) {
155	my ($field, $value, $suffix) = ($1, $2, $3);
156	$line = $value; # it may be a short file name
157
158	if($line !~ m/^\@/) { # if the path doesn't use a "relative" @GSPATH@ placeholder string, but is an absolute path instead
159	# perhaps test here if it is a shortfilename? should match /CAPS....~number(.ext)/
160	$line = "<$field>".&Win32::GetLongPathName($line)."$suffix"; # make it a long file name and prefix assoc-file/meta-file tagname to it again
161	}
162	else { # if $line contains @THISCOLLECTIONPATH@, still need to deal with DOS filenames suffixes:
163	# replace placeholder with absolute path and expand to long filename, then insert placeholder in its original place again
164	$line =~ s/\@THISCOLLECTPATH\@/$collection_path/;
165	$line = &Win32::GetLongPathName($line);
166	$line =~ s/^$collection_path_re/\@THISCOLLECTPATH\@/;
167	$line = "<$field>".$line."$suffix";
168	}
169	}
170	$tmp .= $line;
171	}
172	$$win_text = $tmp;
173	}
174
175
176	# index gdb file
177	if($dbname =~ m/$strColName/) {
178	my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
179	for my $line (split /^/, $$win_text) { # split the string into newlines
180
181	# In the following regex, add any .gdb fieldnames that represent a path and so would contain double backslashes
182	# on Windows (to escape the single backlash of win filepaths). They will be turned into single-backslashes here,
183	# and converted into single forward slashes futher below when the txt version of the win gdb file is normalised
184	# to compare it with the linux version.
185	# E.g. On windows, the Word-PDF collection(s) contains double backslashes in the ex.File.Directory field
186	# the MARC-Exploded collection contains double backslashes in the null_file entry field of the .gdb file
187	if($line =~ m@^<(ex.File.Directory\|null_file)>(.*)@s) {
188	my ($fieldname, $escaped_path) = ($1, $2);
189	$escaped_path =~ s@\\\\@/@g; #(my $escaped_path = $2) =~ s@\\\\@\\@g;
190	$line = "<$fieldname>$escaped_path";
191	}
192	elsif($line =~ m@^<Title>(.*)@s) {
193	# print STDERR "***** TITLE: \|$1\|\n";
194
195	# word-pdf collection: Title of ps files contain new lines at end when
196	# GreenstoneXMLPlugin::xml_end_tag() writes the Title back out after utf8 decode
197	# if($metadata_name eq "Title") { $metadata_value =~ s/[\n\r]*$//; }
198
199	(my $title = $1) =~ s@(\r\|\n\|\\n)*$@@; # get rid of trailing newlines/carriage returns
200	$line = "<Title>$title\n"; # add single newline
201	}
202	$tmp .= $line;
203	}
204	$$win_text = $tmp;
205
206	# slashes in windows metadata text need to be turned into linux style slashes.
207	# index\col.gdb uses double backslashes, and single for \n,\t
208	#$$win_text =~ s@\\\\@/@g;
209	}
210	else { # archiveinf gdb file
211
212	# slashes in windows metadata text need to be turned into linux style slashes.
213	# In the two archivesinf gdb files, filepaths may use single backslashes
214	$$win_text =~ s@\\@/@g; #$$win_text =~ s@\\([^n\|r\|\\|"])@/$1@g; # filepath something\rtf remains something\rtf
215	}
216
217	# cut down absolute paths to files to just collect/colname/.../file, same as before
218	$$lin_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg; # $$lin_text =~ s@^([^\\\/](//)?).(\\\|/)(collect(\\\|/)$strColName)(.*)$@$1$4$6@mg;
219	$$win_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg;
220
221	# for the windows text, need to further get rid of the driveletter after [ or <meta>
222	$$win_text =~ s@^(\[\|<[^>]*>)[a-zA-Z]:collect@$1collect@mg;
223
224	} # end of equalising differences between a windows collection's db file and linux coll's db file
225
226	# The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox
227	# These tmpdirs are located inside the toplevel greenstone directory
228	(my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;
# NOTE(review): "$$ENV{'GSDLHOME'}" dereferences a scalar named $ENV rather
# than reading the %ENV hash; "$ENV{'GSDLHOME'}" looks like what was meant.
# As written the test presumably never sees the environment value, so
# $gsdlhome_re would always be replaced by ".*" - confirm before changing,
# since the unescaped GSDLHOME path would then be used as a regex.
229	$gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'};
230	my $tmpfile_regex = "<URL>http://$gsdlhome_re/tmp/([^\.]*?)(\..{3,4})"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long
231	if($test_text =~ m@$tmpfile_regex@g) {
232	# found a match, replace the tmp file name with "random", keeping the original file extension
233	# in <OrigSource\|URL\|UTF8URL\|gsdlconvertedfilename>
234
235	# This code is slightly different from doc.xml because each document has its own doc.xml, so this needs to be done
236	# only once for doc.xml, but multiple times in index/col.gdb since it contains the random filenames of all docs in the col
237	#my ($old_tmp_filename, $ext) = ($1, $2);
238
239	my $new_tmp_filename = "random";
240
241
242	$tmpfile_regex = "(<(URL\|UTF8URL\|gsdlconvertedfilename\|OrigSource)>(http://)?)($gsdlhome_re)?(/tmp/)?.*?(\..{3,4})";
# NOTE(review): $5 is tested before the new six-group pattern has been
# matched; the most recent successful match at this point appears to be the
# two-group <URL> match above, so $5 looks like it is never set here and the
# else branch would always run - confirm. The same applies to the $5 test for
# the model text further down, which would see the captures left by the
# substitution on the test text.
243	if($5) {
244	$test_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
245	} else { # OrigSource contains only the filename
246	$test_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
247	}
248
249	# modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename
250	$tmpfile_regex = "(<(URL\|UTF8URL\|gsdlconvertedfilename\|OrigSource)>(http://)?)(.)?(/tmp/)?.?(\..{3,4})";
251	if($5) {
252	$model_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
253	} else { # OrigSource contains only the filename
254	$model_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
255	}
256
257	# index/col.gdb also has entries for the random tmp file names in the form: [http://research/ak19/GS2bin_5July2013/tmp/F639.html]
258	# need to equalise these also. Sadly, when there are multiple intermediate files, their random tmp filenames are not
259	# guaranteed to be generated in the same (alphabetical/numerical) order between model and test collection, so the
260	# HASH OIDs, although all of them accounted for, appear in a different order. So we have to remove the Hash OIDs.
261	#$test_text =~ s@\[http://./tmp/.(\..{3,4})\]@tmp/random$1@mg; # HASH OIDs can appear in different order
262	#$model_text =~ s@\[http://./tmp/.(\..{3,4})\]@tmp/random$1@mg;
263
264	$test_text =~ s@\[http://[^\n]?/tmp/.?(\..{3,4})\]\n<section>([^\n]*?)\n@[tmp/random$1\n<section>$2]\n@sg;
265	$model_text =~ s@\[http://[^\n]?/tmp/.?(\..{3,4})\]\n<section>([^\n]*?)\n@[tmp/random$1\n<section>$2]\n@sg;
266
267	# need to re- sort the keys, now that the absolute paths to tmp locations has been removed
268	# so that we get the tmp files in the same order in both model and test collections
269
270	$model_text = text_to_db_to_text($model_text, "model.gdb");
271	$test_text = text_to_db_to_text($test_text, "test.gdb");
272	}
273
274	# now can go back to using $model_text and $test_text
275
276	if($debugging) {
277	print_string_to_file($test_text, $savepath.$dbname."_test.out");
278	print_string_to_file($model_text, $savepath.$dbname."_model.out");
279	}
280
281	my $report_type = "OldStyle"; # Can not change this type.
282	my $diff_gdb = diff \$model_text, \$test_text, { STYLE => $report_type };
283
284	# leaving the ignore regex as it used to be in the following, in case it helps with single line comparisons
285	$diff_gdb = &diffutil::GenerateOutput($diff_gdb,"^<(lastmodified\|lastmodifieddate\|oailastmodified\|oailastmodifieddate\|ImageSize)>.*");
286
287	if($diff_gdb eq "")
288	{
289	return "";
290	}
291	else
292	{
293	return "Difference Report: Differences found in the Database file: \n$diff_gdb";
294	}
295	# Call diff?
296	}
297
# Returns 1 if the db contents come from Windows AND that matters for diffing
# this particular db, 0 otherwise.
# For col.gdb it has not seemed to matter so far whether it was generated on a
# Windows machine and is compared to a linux-generated col.gdb.
#
# Parameters:
#   $dbtailname  - db filename without suffix; decides which markers apply
#   $db_contents - text dump of the db to inspect
sub isDBWindowsSensitive
{
    my ($dbtailname, $db_contents) = @_;

    # return ($db_contents =~ m/\\/) ? 1 : 0; # windows slashes detected. Better test would be: [Something\something] OR <tag>something\something

    # archiveinf-doc: a drive-letter path in a src-file field, <src-file>C:\path
    if ($dbtailname =~ m/^archiveinf-doc/) {
        return 1 if $db_contents =~ m{<src-file>[a-zA-Z]:\\};
        return 0;
    }

    # archiveinf-src: a drive-letter path used as a key, [C:\path]
    if ($dbtailname =~ m/^archiveinf-src/) {
        return 1 if $db_contents =~ m{\[[a-zA-Z]:\\};
        return 0;
    }

    # index/col.gdb file: any one of these markers is enough
    my @windows_markers = (
        qr{<URL>http://[a-zA-Z]:/},                               # <URL>http://C:/path
        qr{^(<URL>http://[a-zA-Z]:/)|(<null_file>[^\\]*\\)}m,     # <URL>http://C:/path OR <null_file>CMSwp-all.00000001\\00000035.nul
        qr{^(<ex.File.Directory>[a-zA-Z]:\\\\)}m,                 # <ex.File.Directory>C:\\path\\path for OAI collection
    );
    foreach my $marker (@windows_markers) {
        return 1 if $db_contents =~ $marker;
    }
    return 0;
}
325
326	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: