Context Navigation

source: other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm@ 35231

Last change on this file since 35231 was 35231, checked in by kjdon, 3 years ago
Another field (ex.File.FileCreateDate) that needs to be ignored when diffing model colls and test colls for diffcol. Tested these changes out just now and now the Windows diffcol went through.
File size: 14.5 KB

Line
1	package gdbdiff;
2
3	BEGIN {
4	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
5	die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
6	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
7	unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
8	}
9
10	use util;
11	use diffutil;
12	use Text::Diff;
13	use Cwd;
14
15	if ($^O =~ m/mswin/i) {
16	require Win32; # for working out Windows Long Filenames from Win 8.3 short filenames
17	}
18
19	sub readin_gdb
20	{
21	my ($cmd) = @_;
22
23	open(PIN,"$cmd\|")
24	\|\| die "Unable to open pipe to $cmd: $!\n";
25
26	my $text_content = "";
27
28	while (defined (my $line = <PIN>)) {
29	$text_content .= $line;
30	}
31
32	close(PIN);
33	return $text_content;
34	}
35
36	sub read_db
37	{
38	# need to sort text output of both test and model col database files, to normalise them for the comparison
39	# the -sort option to db2txt was added specifically to support diffcol
40	my($db_file) = @_;
41	my $db_cmd = "db2txt -sort $db_file 2>&1";
42	if($db_file =~ m/\.jdb$/) {
43	my $runperl = ($ENV{'GSDLOS'} =~ m/windows/) ? "perl -S" : "";
44	$db_cmd = "$runperl jdb2txt.pl -sort $db_file";
45	}
46
47	my $db_text = readin_gdb($db_cmd);
48	return $db_text;
49	}
50
51	sub text_to_db_to_text
52	{
53	my($db_text, $db_filename) = @_;
54
55	# http://stackoverflow.com/questions/1909262/how-can-i-pipe-input-into-a-java-command-from-perl
56
57	if($db_filename =~ m/\.jdb$/) {
58	my $runperl = ($ENV{'GSDLOS'} =~ m/windows/) ? "perl -S" : "";
59	open PIPE, "\| $runperl txt2jdb.pl $db_filename";
60	print PIPE "$db_text";
61	close(PIPE);
62	} else {
63	open PIPE, "\| txt2db $db_filename";
64	print PIPE "$db_text";
65	close(PIPE);
66	}
67	return &read_db("$db_filename");
68	}
69
70	# for debugging. Prints txt contents of db to file
71	sub print_string_to_file
72	{
73	my ($text, $outfile) = @_;
74
75	open(FOUT, ">$outfile") or die "ERROR failed to write to $outfile: $!\n";
76	print FOUT $text;
77	close(FOUT);
78	}
79
80	sub test_gdb
81	{
82	my ($full_modeldb, $full_testdb, $strColName, $test_os, $model_os, $strTestCol, $strModelCol, $debugging) = @_;
83
84	# print "Now is testing database\n";
85	my ($dbname, $dirname, $suffix)= &File::Basename::fileparse($full_testdb, "\\.[^\\.]+\$");
86
87	my $model_text = read_db($full_modeldb);
88	my $test_text = read_db($full_testdb);
89
90	my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory
91	if($debugging) {
92	print_string_to_file($test_text, $savepath.$dbname."_test.out1");
93	print_string_to_file($model_text, $savepath.$dbname."_model.out1");
94	}
95
96	# filter out the fields that can be ignored in the two database files
97	# The total_numbytes field can vary depending on how many backslashes exist in the urls in the main body text, as each
98	# of these windows slashes get escaped with another backslash, and the resulting string is used as key into rel link db
99	my $ignore_line_re = "\n<(FileSize\|lastmodified\|lastmodifieddate\|oailastmodified\|oailastmodifieddate\|ex.File.FileModifyDate\|ex.File.FilePermissions\|ex.File.FileAccessDate\|ex.File.FileInodeChangeDate\|ex.File.FileCreateDate\|total_numbytes\|ex.Composite.LightValue)>([^\n])*";
100	$model_text =~ s/$ignore_line_re//g;
101	$test_text =~ s/$ignore_line_re//g;
102
103	# tmp dirs have subdirs with random numbers in name, remove subdir
104	# these tmpdirs are located inside the collection directory
105	$model_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
106	$test_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
107
108	# if the OS doesn't match and one of them is windows, extra work needs to be done to bring the db files
109	# in test and model collection to an even base for comparison
110
111	my $testIsWin = ($test_os ne "compute") ? ($test_os eq "windows") : &isDBWindowsSensitive($dbname, $test_text);
112	my $modelIsWin = ($model_os ne "compute") ? ($model_os eq "windows") : &isDBWindowsSensitive($dbname, $model_text);
113
114	if($testIsWin == $modelIsWin) {
115	# both linux or both windows, do the basic test we did on linux machines:
116	# ignore absolute path prefixes in modelcol and testcol (necessary for archiveinf-doc and -src.gdb files)
117
118	# Remember the original model col on SVN could have been built anywhere,
119	# and in the gdb files, absolute paths are stored to the collection location.
120	# Crop these paths to the collect/<colname> point.
121
122	# Entries are of the form [Entry] or <Entry>. In order to do a sensible diff,
123	# need to remove the prefix to the collect/colname folder in any (absolute) path that occurs in Entry
124	# E.g. [/full/path/collect/colname/import/file.ext] should become [collect/colname/import/file.ext]
125	# Better regex is of the form /BEGIN((?:(?!BEGIN).)*)END/, see http://docstore.mik.ua/orelly/perl/cookbook/ch06_16.htm
126
127	$model_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg;
128	$test_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg;
129	#$model_text =~ s@^([^\\//]).(\\\|/)(collect(\\\|/)$strColName)(.*)$@$1$3$5@mg;
130	#$test_text =~ s@^([^\\//]).(\\\|/)(collect(\\\|/)$strColName)(.*)$@$1$3$5@mg;
131	}
132
133	else { # one of the collections was built on windows
134	# handling slashes and other differences between a model coll built on one OS (e.g. linux)
135	# and a test collection built and diffed on another OS (windows)
136
137	my ($win_text, $lin_text); # references
138	my $collection_path = $strTestCol; # full path to a windows collection
139
140	if($testIsWin) {
141	$collection_path = $strTestCol; # test collection path is windows
142	$win_text = \$test_text;
143	$lin_text = \$model_text;
144	} else {
145	$collection_path = $strModelCol; # model collection path is windows
146	$win_text = \$model_text;
147	$lin_text = \$test_text;
148	}
149
150	if($dbname =~ m/archiveinf-doc/) {
151
152	(my $collection_path_re = $collection_path) =~ s@\\@\\\\@g;
153
154	my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
155
156	# convert short filenames to long perl:
157	# http://www.mombu.com/programming/perl/t-convert-dos-83-filenames-to-win32-long-filenames-using-perl-525448.html
158	for my $line (split /^/, $$win_text) { # split the string into newlines
159
160	# assoc-file and meta-file contain filepaths, ensure these are long windows file paths now (will later convert to linux slashes)
161	if($line =~ m@^<(assoc-file\|meta-file\|src-file)>(.*)(\s+)@s) {
162	my ($field, $value, $suffix) = ($1, $2, $3);
163	$line = $value; # it may be a short file name
164
165	if($line !~ m/^\@/) { # if the path doesn't use a "relative" @GSPATH@ placeholder string, but is an absolute path instead
166	# perhaps test here if it is a shortfilename? should match /CAPS....~number(.ext)/
167	$line = "<$field>".&Win32::GetLongPathName($line)."$suffix"; # make it a long file name and prefix assoc-file/meta-file tagname to it again
168	}
169	else { # if $line contains @THISCOLLECTIONPATH@, still need to deal with DOS filenames suffixes:
170	# replace placeholder with absolute path and expand to long filename, then insert placeholder in its original place again
171	$line =~ s/\@THISCOLLECTPATH\@/$collection_path/;
172	$line = &Win32::GetLongPathName($line);
173	$line =~ s/^$collection_path_re/\@THISCOLLECTPATH\@/;
174	$line = "<$field>".$line."$suffix";
175	}
176	}
177	$tmp .= $line;
178	}
179	$$win_text = $tmp;
180	}
181
182
183	# index gdb file
184	if($dbname =~ m/$strColName/) {
185	my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
186	for my $line (split /^/, $$win_text) { # split the string into newlines
187
188	# In the following regex, add any .gdb fieldnames that represent a path and so would contain double backslashes
189	# on Windows (to escape the single backlash of win filepaths). They will be turned into single-backslashes here,
190	# and converted into single forward slashes futher below when the txt version of the win gdb file is normalised
191	# to compare it with the linux version.
192	# E.g. On windows, the Word-PDF collection(s) contains double backslashes in the ex.File.Directory field
193	# the MARC-Exploded collection contains double backslashes in the null_file entry field of the .gdb file
194	if($line =~ m@^<(ex.File.Directory\|null_file)>(.*)@s) {
195	my ($fieldname, $escaped_path) = ($1, $2);
196	$escaped_path =~ s@\\\\@/@g; #(my $escaped_path = $2) =~ s@\\\\@\\@g;
197	$line = "<$fieldname>$escaped_path";
198	}
199	elsif($line =~ m@^<Title>(.*)@s) {
200	# print STDERR "***** TITLE: \|$1\|\n";
201
202	# word-pdf collection: Title of ps files contain new lines at end when
203	# GreenstoneXMLPlugin::xml_end_tag() writes the Title back out after utf8 decode
204	# if($metadata_name eq "Title") { $metadata_value =~ s/[\n\r]*$//; }
205
206	(my $title = $1) =~ s@(\r\|\n\|\\n)*$@@; # get rid of trailing newlines/carriage returns
207	$line = "<Title>$title\n"; # add single newline
208	}
209	$tmp .= $line;
210	}
211	$$win_text = $tmp;
212
213	# slashes in windows metadata text need to be turned into linux style slashes.
214	# index\col.gdb uses double backslashes, and single for \n,\t
215	#$$win_text =~ s@\\\\@/@g;
216	}
217	else { # archiveinf gdb file
218
219	# slashes in windows metadata text need to be turned into linux style slashes.
220	# In the two archivesinf gdb files, filepaths may use single backslashes
221	$$win_text =~ s@\\@/@g; #$$win_text =~ s@\\([^n\|r\|\\|"])@/$1@g; # filepath something\rtf remains something\rtf
222	}
223
224	# cut down absolute paths to files to just collect/colname/.../file, same as before
225	$$lin_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg; # $$lin_text =~ s@^([^\\\/](//)?).(\\\|/)(collect(\\\|/)$strColName)(.*)$@$1$4$6@mg;
226	$$win_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg;
227
228	# for the windows text, need to further get rid of the driveletter after [ or <meta>
229	$$win_text =~ s@^(\[\|<[^>]*>)[a-zA-Z]:collect@$1collect@mg;
230
231	} # end of equalising differences between a windows collection's db file and linux coll's db file
232
233	# The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox
234	# These tmpdirs are located inside the toplevel greenstone directory
235	(my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;
236	$gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'};
237	my $tmpfile_regex = "<URL>http://$gsdlhome_re/tmp/([^\.]*?)(\..{3,4})"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long
238	if($test_text =~ m@$tmpfile_regex@g) {
239	# found a match, replace the tmp file name with "random", keeping the original file extension
240	# in <OrigSource\|URL\|UTF8URL\|gsdlconvertedfilename>
241
242	# This code is slightly different from doc.xml because each document has its own doc.xml, so this needs to be done
243	# only once for doc.xml, but multiple times in index/col.gdb since it contains the random filenames of all docs in the col
244	#my ($old_tmp_filename, $ext) = ($1, $2);
245
246	my $new_tmp_filename = "random";
247
248
249	$tmpfile_regex = "(<(URL\|UTF8URL\|gsdlconvertedfilename\|OrigSource)>(http://)?)($gsdlhome_re)?(/tmp/)?.*?(\..{3,4})";
250	if($5) {
251	$test_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
252	} else { # OrigSource contains only the filename
253	$test_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
254	}
255
256	# modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename
257	$tmpfile_regex = "(<(URL\|UTF8URL\|gsdlconvertedfilename\|OrigSource)>(http://)?)(.)?(/tmp/)?.?(\..{3,4})";
258	if($5) {
259	$model_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
260	} else { # OrigSource contains only the filename
261	$model_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
262	}
263
264	# index/col.gdb also has entries for the random tmp file names in the form: [http://research/ak19/GS2bin_5July2013/tmp/F639.html]
265	# need to equalise these also. Sadly, when there are multiple intermediate files, their random tmp filenames are not
266	# guaranteed to be generated in the same (alphabetical/numerical) order between model and test collection, so the
267	# HASH OIDs, although all of them accounted for, appear in a different order. So we have to remove the Hash OIDs.
268	#$test_text =~ s@\[http://./tmp/.(\..{3,4})\]@tmp/random$1@mg; # HASH OIDs can appear in different order
269	#$model_text =~ s@\[http://./tmp/.(\..{3,4})\]@tmp/random$1@mg;
270
271	$test_text =~ s@\[http://[^\n]?/tmp/.?(\..{3,4})\]\n<section>([^\n]*?)\n@[tmp/random$1\n<section>$2]\n@sg;
272	$model_text =~ s@\[http://[^\n]?/tmp/.?(\..{3,4})\]\n<section>([^\n]*?)\n@[tmp/random$1\n<section>$2]\n@sg;
273
274	# need to re- sort the keys, now that the absolute paths to tmp locations has been removed
275	# so that we get the tmp files in the same order in both model and test collections
276
277	$model_text = text_to_db_to_text($model_text, "model.gdb");
278	$test_text = text_to_db_to_text($test_text, "test.gdb");
279	}
280
281	# now can go back to using $model_text and $test_text
282
283	if($debugging) {
284	print_string_to_file($test_text, $savepath.$dbname."_test.out");
285	print_string_to_file($model_text, $savepath.$dbname."_model.out");
286	}
287
288	my $report_type = "OldStyle"; # Can not change this type.
289	my $diff_gdb = diff \$model_text, \$test_text, { STYLE => $report_type };
290
291	# leaving the ignore regex as it used to be in the following, in case it helps with single line comparisons
292	$diff_gdb = &diffutil::GenerateOutput($diff_gdb,"^<(lastmodified\|lastmodifieddate\|oailastmodified\|oailastmodifieddate\|ImageSize)>.*");
293
294	if($diff_gdb eq "")
295	{
296	return "";
297	}
298	else
299	{
300	return "Difference Report: Differences found in the Database file: \n$diff_gdb";
301	}
302	# Call diff?
303	}
304
305	# returns true if the contents are windows AND it matters for the diffing on the db that it's windows
306	# For col.gdb it does not seem to matter so far, if it is generated on a windows machine and to be compared to a linux-generated col.gdb
307	sub isDBWindowsSensitive
308	{
309	my ($dbtailname, $db_contents) = @_; # db filename without suffix
310
311	# return ($db_contents =~ m/\\/) ? 1 : 0; # windows slashes detected. Better test would be: [Something\something] OR <tag>something\something
312
313	if($dbtailname =~ m/^archiveinf-doc/) {
314	return ($db_contents =~ m@<src-file>[a-zA-Z]:\\@) ? 1 : 0; # <src-file>C:\path
315	}
316	elsif($dbtailname =~ m/^archiveinf-src/) { # <src-file>C:\path
317	return ($db_contents =~ m@\[[a-zA-Z]:\\@) ? 1 : 0; # [C:\path]
318	}
319	else { # index/col.gdb file
320	if ($db_contents =~ m@<URL>http://[a-zA-Z]:/@) { # <URL>http://C:/path
321	return 1;
322	}
323	elsif ($db_contents =~ m@^(<URL>http://[a-zA-Z]:/)\|(<null_file>[^\\]*\\)@m) { # <URL>http://C:/path OR <null_file>CMSwp-all.00000001\\00000035.nul
324	return 1;
325	}
326	elsif ($db_contents =~ m@^(<ex.File.Directory>[a-zA-Z]:\\\\)@m) { # <ex.File.Directory>C:\\path\\path for OAI collection
327	return 1;
328	}
329	return 0;
330	}
331	}
332
333	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: