Context Navigation

gdbdiff.pm@ 28238

Last change on this file since 28238 was 28238, checked in by ak19, 11 years ago

Modifications to handle placeholders for Greenstone standard path prefixes when the suffix of such a path is of the form of a Windows shortfilename. This occurs in archiveinf-doc.gdb on Windows, and possibly archiveinf-src.gdb too. Need to first reconstruct the full path locally (without placeholder), then convert it to a windows long filename, then stick the placeholder back into its place and do the diff between the test and model databases.

File size: 14.0 KB

Line
1	package gdbdiff;
2
3	BEGIN {
4	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
5	die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
6	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
7	unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
8	}
9
10	use util;
11	use diffutil;
12	use Text::Diff;
13	use Cwd;
14
15	if ($^O =~ m/mswin/i) {
16	require Win32; # for working out Windows Long Filenames from Win 8.3 short filenames
17	}
18
19	sub readin_gdb
20	{
21	my ($cmd) = @_;
22
23	open(PIN,"$cmd\|")
24	\|\| die "Unable to open pipe to $cmd: $!\n";
25
26	my $text_content = "";
27
28	while (defined (my $line = <PIN>)) {
29	$text_content .= $line;
30	}
31
32	close(PIN);
33	return $text_content;
34	}
35
36	# for debugging. Prints txt contents of db to file
37	sub print_string_to_file
38	{
39	my ($text, $outfile) = @_;
40
41	open(FOUT, ">$outfile") or die "ERROR failed to write to $outfile: $!\n";
42	print FOUT $text;
43	close(FOUT);
44	}
45
46	sub test_gdb
47	{
48	my ($full_modeldb, $full_testdb, $strColName, $test_os, $model_os, $strTestCol, $strModelCol) = @_;
49
50	# print "Now is testing database\n";
51	my ($dbname, $dirname, $suffix)= &File::Basename::fileparse($full_testdb, "\\.[^\\.]+\$");
52
53	# need to sort text output of both test and model col database files, to normalise them for the comparison
54	# the -sort option to db2txt was added specifically to support diffcol
55	my $model_cmd = "db2txt -sort $full_modeldb 2>&1";
56	my $test_cmd = "db2txt -sort $full_testdb 2>&1";
57
58	my $model_text = readin_gdb($model_cmd);
59	my $test_text = readin_gdb($test_cmd);
60
61	# my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory
62	# print_string_to_file($test_text, $savepath.$dbname."_test.out1");
63	# print_string_to_file($model_text, $savepath.$dbname."_model.out1");
64
65	# filter out the fields that can be ignored in the two database files
66	# The total_numbytes field can vary depending on how many backslashes exist in the urls in the main body text, as each
67	# of these windows slashes get escaped with another backslash, and the resulting string is used as key into rel link db
68	my $ignore_line_re = "\n<(FileSize\|lastmodified\|lastmodifieddate\|oailastmodified\|oailastmodifieddate\|ex.File.FileModifyDate\|ex.File.FilePermissions\|total_numbytes\|ex.Composite.LightValue)>([^\n])*";
69	$model_text =~ s/$ignore_line_re//g;
70	$test_text =~ s/$ignore_line_re//g;
71
72	# tmp dirs have subdirs with random numbers in name, remove subdir
73	# these tmpdirs are located inside the collection directory
74	$model_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
75	$test_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
76
77	# if the OS doesn't match and one of them is windows, extra work needs to be done to bring the db files
78	# in test and model collection to an even base for comparison
79
80	my $testIsWin = ($test_os ne "compute") ? ($test_os eq "windows") : &isDBWindowsSensitive($dbname, $test_text);
81	my $modelIsWin = ($model_os ne "compute") ? ($model_os eq "windows") : &isDBWindowsSensitive($dbname, $model_text);
82
83	if($testIsWin == $modelIsWin) {
84	# both linux or both windows, do the basic test we did on linux machines:
85	# ignore absolute path prefixes in modelcol and testcol (necessary for archiveinf-doc and -src.gdb files)
86
87	# Remember the original model col on SVN could have been built anywhere,
88	# and in the gdb files, absolute paths are stored to the collection location.
89	# Crop these paths to the collect/<colname> point.
90
91	# Entries are of the form [Entry] or <Entry>. In order to do a sensible diff,
92	# need to remove the prefix to the collect/colname folder in any (absolute) path that occurs in Entry
93	# E.g. [/full/path/collect/colname/import/file.ext] should become [collect/colname/import/file.ext]
94	# Better regex is of the form /BEGIN((?:(?!BEGIN).)*)END/, see http://docstore.mik.ua/orelly/perl/cookbook/ch06_16.htm
95
96	$model_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg;
97	$test_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg;
98	#$model_text =~ s@^([^\\//]).(\\\|/)(collect(\\\|/)$strColName)(.*)$@$1$3$5@mg;
99	#$test_text =~ s@^([^\\//]).(\\\|/)(collect(\\\|/)$strColName)(.*)$@$1$3$5@mg;
100	}
101
102	else { # one of the collections was built on windows
103	# handling slashes and other differences between a model coll built on one OS (e.g. linux)
104	# and a test collection built and diffed on another OS (windows)
105
106	my ($win_text, $lin_text); # references
107	my $collection_path = $strTestCol; # full path to a windows collection
108
109	if($testIsWin) {
110	$collection_path = $strTestCol; # test collection path is windows
111	$win_text = \$test_text;
112	$lin_text = \$model_text;
113	} else {
114	$collection_path = $strModelCol; # model collection path is windows
115	$win_text = \$model_text;
116	$lin_text = \$test_text;
117	}
118
119	if($dbname =~ m/archiveinf-doc/) {
120
121	(my $collection_path_re = $collection_path) =~ s@\\@\\\\@g;
122
123	my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
124
125	# convert short filenames to long perl:
126	# http://www.mombu.com/programming/perl/t-convert-dos-83-filenames-to-win32-long-filenames-using-perl-525448.html
127	for my $line (split /^/, $$win_text) { # split the string into newlines
128
129	# assoc-file and meta-file contain filepaths, ensure these are long windows file paths now (will later convert to linux slashes)
130	if($line =~ m@^<(assoc-file\|meta-file\|src-file)>(.*)(\s+)@s) {
131	my ($field, $value, $suffix) = ($1, $2, $3);
132	$line = $value; # it may be a short file name
133
134	if($line !~ m/^\@/) { # if the path doesn't use a "relative" @GSPATH@ placeholder string, but is an absolute path instead
135	# perhaps test here if it is a shortfilename? should match /CAPS....~number(.ext)/
136	$line = "<$field>".&Win32::GetLongPathName($line)."$suffix"; # make it a long file name and prefix assoc-file/meta-file tagname to it again
137	}
138	else { # if $line contains @THISCOLLECTIONPATH@, still need to deal with DOS filenames suffixes:
139	# replace placeholder with absolute path and expand to long filename, then insert placeholder in its original place again
140	$line =~ s/\@THISCOLLECTPATH\@/$collection_path/;
141	$line = &Win32::GetLongPathName($line);
142	$line =~ s/^$collection_path_re/\@THISCOLLECTPATH\@/;
143	$line = "<$field>".$line."$suffix";
144	}
145	}
146	$tmp .= $line;
147	}
148	$$win_text = $tmp;
149	}
150
151
152	# index gdb file
153	if($dbname =~ m/$strColName/) {
154	my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
155	for my $line (split /^/, $$win_text) { # split the string into newlines
156
157	# In the following regex, add any .gdb fieldnames that represent a path and so would contain double backslashes
158	# on Windows (to escape the single backlash of win filepaths). They will be turned into single-backslashes here,
159	# and converted into single forward slashes futher below when the txt version of the win gdb file is normalised
160	# to compare it with the linux version.
161	# E.g. On windows, the Word-PDF collection(s) contains double backslashes in the ex.File.Directory field
162	# the MARC-Exploded collection contains double backslashes in the null_file entry field of the .gdb file
163	if($line =~ m@^<(ex.File.Directory\|null_file)>(.*)@s) {
164	my ($fieldname, $escaped_path) = ($1, $2);
165	$escaped_path =~ s@\\\\@/@g; #(my $escaped_path = $2) =~ s@\\\\@\\@g;
166	$line = "<$fieldname>$escaped_path";
167	}
168	elsif($line =~ m@^<Title>(.*)@s) {
169	# print STDERR "***** TITLE: \|$1\|\n";
170
171	# word-pdf collection: Title of ps files contain new lines at end when
172	# GreenstoneXMLPlugin::xml_end_tag() writes the Title back out after utf8 decode
173	# if($metadata_name eq "Title") { $metadata_value =~ s/[\n\r]*$//; }
174
175	(my $title = $1) =~ s@(\r\|\n\|\\n)*$@@; # get rid of trailing newlines/carriage returns
176	$line = "<Title>$title\n"; # add single newline
177	}
178	$tmp .= $line;
179	}
180	$$win_text = $tmp;
181
182	# slashes in windows metadata text need to be turned into linux style slashes.
183	# index\col.gdb uses double backslashes, and single for \n,\t
184	#$$win_text =~ s@\\\\@/@g;
185	}
186	else { # archiveinf gdb file
187
188	# slashes in windows metadata text need to be turned into linux style slashes.
189	# In the two archivesinf gdb files, filepaths may use single backslashes
190	$$win_text =~ s@\\@/@g; #$$win_text =~ s@\\([^n\|r\|\\|"])@/$1@g; # filepath something\rtf remains something\rtf
191	}
192
193	# cut down absolute paths to files to just collect/colname/.../file, same as before
194	$$lin_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg; # $$lin_text =~ s@^([^\\\/](//)?).(\\\|/)(collect(\\\|/)$strColName)(.*)$@$1$4$6@mg;
195	$$win_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg;
196
197	# for the windows text, need to further get rid of the driveletter after [ or <meta>
198	$$win_text =~ s@^(\[\|<[^>]*>)[a-zA-Z]:collect@$1collect@mg;
199
200	} # end of equalising differences between a windows collection's db file and linux coll's db file
201
202	# The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox
203	# These tmpdirs are located inside the toplevel greenstone directory
204	(my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;
205	$gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'};
206	my $tmpfile_regex = "<URL>http://$gsdlhome_re/tmp/([^\.]*?)(\..{3,4})"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long
207	if($test_text =~ m@$tmpfile_regex@g) {
208	# found a match, replace the tmp file name with "random", keeping the original file extension
209	# in <OrigSource\|URL\|UTF8URL\|gsdlconvertedfilename>
210
211	# This code is slightly different from doc.xml because each document has its own doc.xml, so this needs to be done
212	# only once for doc.xml, but multiple times in index/col.gdb since it contains the random filenames of all docs in the col
213	#my ($old_tmp_filename, $ext) = ($1, $2);
214
215	my $new_tmp_filename = "random";
216
217
218	$tmpfile_regex = "(<(URL\|UTF8URL\|gsdlconvertedfilename\|OrigSource)>(http://)?)($gsdlhome_re)?(/tmp/)?.*?(\..{3,4})";
219	if($5) {
220	$test_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
221	} else { # OrigSource contains only the filename
222	$test_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
223	}
224
225	# modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename
226	$tmpfile_regex = "(<(URL\|UTF8URL\|gsdlconvertedfilename\|OrigSource)>(http://)?)(.)?(/tmp/)?.?(\..{3,4})";
227	if($5) {
228	$model_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
229	} else { # OrigSource contains only the filename
230	$model_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
231	}
232
233	# index/col.gdb also has entries for the random tmp file names in the form: [http://research/ak19/GS2bin_5July2013/tmp/F639.html]
234	# need to equalise these also. Sadly, when there are multiple intermediate files, their random tmp filenames are not
235	# guaranteed to be generated in the same (alphabetical/numerical) order between model and test collection, so the
236	# HASH OIDs, although all of them accounted for, appear in a different order. So we have to remove the Hash OIDs.
237	#$test_text =~ s@\[http://./tmp/.(\..{3,4})\]@tmp/random$1@mg; # HASH OIDs can appear in different order
238	#$model_text =~ s@\[http://./tmp/.(\..{3,4})\]@tmp/random$1@mg;
239
240	$test_text =~ s@\[http://[^\n]?/tmp/.?(\..{3,4})\]\n<section>([^\n]*?)\n@[tmp/random$1\n<section>$2]\n@sg;
241	$model_text =~ s@\[http://[^\n]?/tmp/.?(\..{3,4})\]\n<section>([^\n]*?)\n@[tmp/random$1\n<section>$2]\n@sg;
242
243	# need to re- sort the keys, now that the absolute paths to tmp locations has been removed
244	# so that we get the tmp files in the same order in both model and test collections
245
246	# http://stackoverflow.com/questions/1909262/how-can-i-pipe-input-into-a-java-command-from-perl
247	open PIPE, "\| txt2db model.gdb";
248	print PIPE "$model_text";
249	close(PIPE);
250	open PIPE, "\| txt2db test.gdb";
251	print PIPE "$test_text";
252	close(PIPE);
253
254	$model_cmd = " db2txt -sort model.gdb 2>&1";
255	$test_cmd = "db2txt -sort test.gdb 2>&1";
256	$model_text = readin_gdb($model_cmd);
257	$test_text = readin_gdb($test_cmd);
258	}
259
260	# now can go back to using $model_text and $test_text
261	# print_string_to_file($test_text, $savepath.$dbname."_test.out");
262	# print_string_to_file($model_text, $savepath.$dbname."_model.out");
263
264	my $report_type = "OldStyle"; # Can not change this type.
265	my $diff_gdb = diff \$model_text, \$test_text, { STYLE => $report_type };
266
267	# leaving the ignore regex as it used to be in the following, in case it helps with single line comparisons
268	$diff_gdb = &diffutil::GenerateOutput($diff_gdb,"^<(lastmodified\|lastmodifieddate\|oailastmodified\|oailastmodifieddate\|ImageSize)>.*");
269
270	if($diff_gdb eq "")
271	{
272	return "";
273	}
274	else
275	{
276	return "Difference Report: Differences found in the Database file: \n$diff_gdb";
277	}
278	# Call diff?
279	}
280
281	# returns true if the contents are windows AND it matters for the diffing on the db that it's windows
282	# For col.gdb it does not seem to matter so far, if it is generated on a windows machine and to be compared to a linux-generated col.gdb
283	sub isDBWindowsSensitive
284	{
285	my ($dbtailname, $db_contents) = @_; # db filename without suffix
286
287	# return ($db_contents =~ m/\\/) ? 1 : 0; # windows slashes detected. Better test would be: [Something\something] OR <tag>something\something
288
289	if($dbtailname =~ m/^archiveinf-doc/) {
290	return ($db_contents =~ m@<src-file>[a-zA-Z]:\\@) ? 1 : 0; # <src-file>C:\path
291	}
292	elsif($dbtailname =~ m/^archiveinf-src/) { # <src-file>C:\path
293	return ($db_contents =~ m@\[[a-zA-Z]:\\@) ? 1 : 0; # [C:\path]
294	}
295	else { # index/col.gdb file
296	if ($db_contents =~ m@<URL>http://[a-zA-Z]:/@) { # <URL>http://C:/path
297	return 1;
298	}
299	elsif ($db_contents =~ m@^(<URL>http://[a-zA-Z]:/)\|(<null_file>[^\\]*\\)@m) { # <URL>http://C:/path OR <null_file>CMSwp-all.00000001\\00000035.nul
300	return 1;
301	}
302	elsif ($db_contents =~ m@^(<ex.File.Directory>[a-zA-Z]:\\\\)@m) { # <ex.File.Directory>C:\\path\\path for OAI collection
303	return 1;
304	}
305	return 0;
306	}
307	}
308
309	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm@ 28238

Download in other formats: