Context Navigation

gdbdiff.pm@ 36807

Last change on this file since 36807 was 36807, checked in by anupama, 19 months ago

We used to run diffcol as a nightly task only for GS2. Commit 36655 was the first stage of getting diffcol to work for GS3, but skipped a lot of important code branches (like comparing the index\text\j/gdb files) in order to fix up the easier parts of the code. Now that I think the remainder of the diffcol scripts have been got to work with diffcol for GS3, where the index\text\flatdb files are compared and diffcol works for them, I can commit the important changes as well as commented out debugging statements made to the diffcol scripts that get the full diffcol code to work for GS3 diffcol. I will recommit again after removing the debugging statements. And I still need to do a full local diffcol run again, as well as testing if diffcol still works after locally undoing my sort field changes to some GS3 model cols (the recent commits to Tudor, Word-PDF, Images-GPS and Multimedia collections) to see if Dr Bainbridge's PERL_HASH_SEED env var addition fixes all of those collections diffcol failures, making the extra sorting redundant. In that case, I will recommit those model collections after updating their col configurations to not do the extra sorting.

File size: 16.3 KB

Rev	Line
[21711]	1	package gdbdiff;
	2
	3	BEGIN {
	4	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
	5	die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
	6	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
	7	unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
	8	}
	9
	10	use util;
	11	use diffutil;
	12	use Text::Diff;
[27701]	13	use Cwd;
[21711]	14
[27696]	15	if ($^O =~ m/mswin/i) {
	16	require Win32; # for working out Windows Long Filenames from Win 8.3 short filenames
	17	}
[27695]	18
[21711]	19	sub readin_gdb
	20	{
	21	my ($cmd) = @_;
	22
	23	open(PIN,"$cmd\|")
	24	\|\| die "Unable to open pipe to $cmd: $!\n";
	25
	26	my $text_content = "";
	27
	28	while (defined (my $line = <PIN>)) {
	29	$text_content .= $line;
	30	}
	31
	32	close(PIN);
	33	return $text_content;
	34	}
	35
[28661]	36	sub read_db
	37	{
	38	# need to sort text output of both test and model col database files, to normalise them for the comparison
	39	# the -sort option to db2txt was added specifically to support diffcol
	40	my($db_file) = @_;
	41	my $db_cmd = "db2txt -sort $db_file 2>&1";
	42	if($db_file =~ m/\.jdb$/) {
[34955]	43	my $runperl = ($ENV{'GSDLOS'} =~ m/windows/) ? "perl -S" : "";
	44	$db_cmd = "$runperl jdb2txt.pl -sort $db_file";
[28661]	45	}
	46
	47	my $db_text = readin_gdb($db_cmd);
	48	return $db_text;
	49	}
	50
	51	sub text_to_db_to_text
	52	{
	53	my($db_text, $db_filename) = @_;
	54
	55	# http://stackoverflow.com/questions/1909262/how-can-i-pipe-input-into-a-java-command-from-perl
	56
[34955]	57	if($db_filename =~ m/\.jdb$/) {
	58	my $runperl = ($ENV{'GSDLOS'} =~ m/windows/) ? "perl -S" : "";
	59	open PIPE, "\| $runperl txt2jdb.pl $db_filename";
	60	print PIPE "$db_text";
	61	close(PIPE);
	62	} else {
	63	open PIPE, "\| txt2db $db_filename";
	64	print PIPE "$db_text";
	65	close(PIPE);
	66	}
[28661]	67	return &read_db("$db_filename");
	68	}
	69
[27695]	70	# for debugging. Prints txt contents of db to file
	71	sub print_string_to_file
	72	{
[28661]	73	my ($text, $outfile) = @_;
[27695]	74
[28661]	75	open(FOUT, ">$outfile") or die "ERROR failed to write to $outfile: $!\n";
[27695]	76	print FOUT $text;
	77	close(FOUT);
	78	}
[21711]	79
	80	sub test_gdb
	81	{
[29495]	82	my ($full_modeldb, $full_testdb, $strColName, $test_os, $model_os, $strTestCol, $strModelCol, $debugging) = @_;
[21711]	83
	84	# print "Now is testing database\n";
[27695]	85	my ($dbname, $dirname, $suffix)= &File::Basename::fileparse($full_testdb, "\\.[^\\.]+\$");
	86
[28661]	87	my $model_text = read_db($full_modeldb);
	88	my $test_text = read_db($full_testdb);
[21711]	89
[29495]	90	my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory
	91	if($debugging) {
	92	print_string_to_file($test_text, $savepath.$dbname."_test.out1");
	93	print_string_to_file($model_text, $savepath.$dbname."_model.out1");
	94	}
[27604]	95
	96	# filter out the fields that can be ignored in the two database files
[27701]	97	# The total_numbytes field can vary depending on how many backslashes exist in the urls in the main body text, as each
	98	# of these windows slashes get escaped with another backslash, and the resulting string is used as key into rel link db
[35231]	99	my $ignore_line_re = "\n<(FileSize\|lastmodified\|lastmodifieddate\|oailastmodified\|oailastmodifieddate\|ex.File.FileModifyDate\|ex.File.FilePermissions\|ex.File.FileAccessDate\|ex.File.FileInodeChangeDate\|ex.File.FileCreateDate\|total_numbytes\|ex.Composite.LightValue)>([^\n])*";
[27604]	100	$model_text =~ s/$ignore_line_re//g;
	101	$test_text =~ s/$ignore_line_re//g;
	102
[27730]	103	# tmp dirs have subdirs with random numbers in name, remove subdir
[27766]	104	# these tmpdirs are located inside the collection directory
[27730]	105	$model_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
	106	$test_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
[27701]	107
[36807]	108
	109	#print STDERR "@@@@ DEBUGGING: $debugging\n";
	110	#print STDERR "******** full_modeldb: $full_modeldb\n$model_text\n\n";
	111	#print STDERR "******** full_testdb: $full_testdb\n$test_text\n\n";
	112
[27695]	113	# if the OS doesn't match and one of them is windows, extra work needs to be done to bring the db files
	114	# in test and model collection to an even base for comparison
	115
[28172]	116	my $testIsWin = ($test_os ne "compute") ? ($test_os eq "windows") : &isDBWindowsSensitive($dbname, $test_text);
	117	my $modelIsWin = ($model_os ne "compute") ? ($model_os eq "windows") : &isDBWindowsSensitive($dbname, $model_text);
[27695]	118
[28086]	119	if($testIsWin == $modelIsWin) {
	120	# both linux or both windows, do the basic test we did on linux machines:
[27695]	121	# ignore absolute path prefixes in modelcol and testcol (necessary for archiveinf-doc and -src.gdb files)
[27604]	122
[27695]	123	# Remember the original model col on SVN could have been built anywhere,
	124	# and in the gdb files, absolute paths are stored to the collection location.
	125	# Crop these paths to the collect/<colname> point.
	126
	127	# Entries are of the form [Entry] or <Entry>. In order to do a sensible diff,
	128	# need to remove the prefix to the collect/colname folder in any (absolute) path that occurs in Entry
	129	# E.g. [/full/path/collect/colname/import/file.ext] should become [collect/colname/import/file.ext]
	130	# Better regex is of the form /BEGIN((?:(?!BEGIN).)*)END/, see http://docstore.mik.ua/orelly/perl/cookbook/ch06_16.htm
[27604]	131
[27743]	132	$model_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg;
	133	$test_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg;
	134	#$model_text =~ s@^([^\\//]).(\\\|/)(collect(\\\|/)$strColName)(.*)$@$1$3$5@mg;
	135	#$test_text =~ s@^([^\\//]).(\\\|/)(collect(\\\|/)$strColName)(.*)$@$1$3$5@mg;
[27695]	136	}
	137
	138	else { # one of the collections was built on windows
	139	# handling slashes and other differences between a model coll built on one OS (e.g. linux)
	140	# and a test collection built and diffed on another OS (windows)
	141
	142	my ($win_text, $lin_text); # references
[28238]	143	my $collection_path = $strTestCol; # full path to a windows collection
	144
[27695]	145	if($testIsWin) {
[28238]	146	$collection_path = $strTestCol; # test collection path is windows
[27695]	147	$win_text = \$test_text;
	148	$lin_text = \$model_text;
	149	} else {
[28238]	150	$collection_path = $strModelCol; # model collection path is windows
[27695]	151	$win_text = \$model_text;
	152	$lin_text = \$test_text;
	153	}
	154
	155	if($dbname =~ m/archiveinf-doc/) {
[28238]	156
	157	(my $collection_path_re = $collection_path) =~ s@\\@\\\\@g;
	158
[27695]	159	my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
	160
	161	# convert short filenames to long perl:
	162	# http://www.mombu.com/programming/perl/t-convert-dos-83-filenames-to-win32-long-filenames-using-perl-525448.html
	163	for my $line (split /^/, $$win_text) { # split the string into newlines
	164
[27743]	165	# assoc-file and meta-file contain filepaths, ensure these are long windows file paths now (will later convert to linux slashes)
[28224]	166	if($line =~ m@^<(assoc-file\|meta-file\|src-file)>(.*)(\s+)@s) {
[28238]	167	my ($field, $value, $suffix) = ($1, $2, $3);
	168	$line = $value; # it may be a short file name
[28224]	169
[28238]	170	if($line !~ m/^\@/) { # if the path doesn't use a "relative" @GSPATH@ placeholder string, but is an absolute path instead
	171	# perhaps test here if it is a shortfilename? should match /CAPS....~number(.ext)/
	172	$line = "<$field>".&Win32::GetLongPathName($line)."$suffix"; # make it a long file name and prefix assoc-file/meta-file tagname to it again
	173	}
	174	else { # if $line contains @THISCOLLECTIONPATH@, still need to deal with DOS filenames suffixes:
	175	# replace placeholder with absolute path and expand to long filename, then insert placeholder in its original place again
	176	$line =~ s/\@THISCOLLECTPATH\@/$collection_path/;
	177	$line = &Win32::GetLongPathName($line);
	178	$line =~ s/^$collection_path_re/\@THISCOLLECTPATH\@/;
	179	$line = "<$field>".$line."$suffix";
[28224]	180	}
[27695]	181	}
	182	$tmp .= $line;
	183	}
	184	$$win_text = $tmp;
	185	}
	186
	187
[27743]	188	# index gdb file
	189	if($dbname =~ m/$strColName/) {
	190	my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
[28086]	191	for my $line (split /^/, $$win_text) { # split the string into newlines
	192
[28005]	193	# In the following regex, add any .gdb fieldnames that represent a path and so would contain double backslashes
	194	# on Windows (to escape the single backlash of win filepaths). They will be turned into single-backslashes here,
	195	# and converted into single forward slashes futher below when the txt version of the win gdb file is normalised
	196	# to compare it with the linux version.
	197	# E.g. On windows, the Word-PDF collection(s) contains double backslashes in the ex.File.Directory field
	198	# the MARC-Exploded collection contains double backslashes in the null_file entry field of the .gdb file
[28086]	199	if($line =~ m@^<(ex.File.Directory\|null_file)>(.*)@s) {
[28005]	200	my ($fieldname, $escaped_path) = ($1, $2);
[28019]	201	$escaped_path =~ s@\\\\@/@g; #(my $escaped_path = $2) =~ s@\\\\@\\@g;
[28005]	202	$line = "<$fieldname>$escaped_path";
[27743]	203	}
	204	elsif($line =~ m@^<Title>(.*)@s) {
	205	# print STDERR "***** TITLE: \|$1\|\n";
	206
	207	# word-pdf collection: Title of ps files contain new lines at end when
	208	# GreenstoneXMLPlugin::xml_end_tag() writes the Title back out after utf8 decode
	209	# if($metadata_name eq "Title") { $metadata_value =~ s/[\n\r]*$//; }
	210
	211	(my $title = $1) =~ s@(\r\|\n\|\\n)*$@@; # get rid of trailing newlines/carriage returns
	212	$line = "<Title>$title\n"; # add single newline
	213	}
	214	$tmp .= $line;
	215	}
[28019]	216	$$win_text = $tmp;
	217
	218	# slashes in windows metadata text need to be turned into linux style slashes.
	219	# index\col.gdb uses double backslashes, and single for \n,\t
	220	#$$win_text =~ s@\\\\@/@g;
[27743]	221	}
[28019]	222	else { # archiveinf gdb file
[27743]	223
[28019]	224	# slashes in windows metadata text need to be turned into linux style slashes.
	225	# In the two archivesinf gdb files, filepaths may use single backslashes
	226	$$win_text =~ s@\\@/@g; #$$win_text =~ s@\\([^n\|r\|\\|"])@/$1@g; # filepath something\rtf remains something\rtf
	227	}
[27743]	228
[27695]	229	# cut down absolute paths to files to just collect/colname/.../file, same as before
[27743]	230	$$lin_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg; # $$lin_text =~ s@^([^\\\/](//)?).(\\\|/)(collect(\\\|/)$strColName)(.*)$@$1$4$6@mg;
	231	$$win_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg;
[27695]	232
	233	# for the windows text, need to further get rid of the driveletter after [ or <meta>
[27701]	234	$$win_text =~ s@^(\[\|<[^>]*>)[a-zA-Z]:collect@$1collect@mg;
[27695]	235
	236	} # end of equalising differences between a windows collection's db file and linux coll's db file
[27766]	237
[36807]	238
	239	# Windows or linux: if index is a flat db file, then ensure the docIDs listed in <contains> field of
	240	# both test and model flat db file are alphabetically sorted. So to the numbers in <mdoffset> field.
	241	# Despite PERL_PERTURB_KEYS envvar being set to 0 on both machine when generating model collections
	242	# and when test collections were generated on test machine, still collections like Images-GPS and some
	243	# other colls list items in <contains> and <mdoffset> in different orders. So reordering alphabetically.
	244	#if($dbname =~ m/$strColName/) {
	245	# regex modifiers mge: multi-line, global (replace as many as match), e allows function call in substitution
	246	##$model_text =~ s@^<contains>(.*)@sort_contains_field($1, "MODEL", $debugging)@mge;
	247	##$test_text =~ s@^<contains>(.*)@sort_contains_field($1, "TEST", $debugging)@mge;
	248	# $model_text =~ s@^<(contains\|mdoffset)>(.*)@sort_field_value($1, $2, "MODEL", $debugging)@mge;
	249	# $test_text =~ s@^<(contains\|mdoffset)>(.*)@sort_field_value($1, $2, "TEST", $debugging)@mge;
	250	#}
	251
[27766]	252	# The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox
	253	# These tmpdirs are located inside the toplevel greenstone directory
	254	(my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;
[27767]	255	$gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'};
[28067]	256	my $tmpfile_regex = "<URL>http://$gsdlhome_re/tmp/([^\.]*?)(\..{3,4})"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long
[27766]	257	if($test_text =~ m@$tmpfile_regex@g) {
	258	# found a match, replace the tmp file name with "random", keeping the original file extension
	259	# in <OrigSource\|URL\|UTF8URL\|gsdlconvertedfilename>
	260
	261	# This code is slightly different from doc.xml because each document has its own doc.xml, so this needs to be done
	262	# only once for doc.xml, but multiple times in index/col.gdb since it contains the random filenames of all docs in the col
	263	#my ($old_tmp_filename, $ext) = ($1, $2);
	264
	265	my $new_tmp_filename = "random";
	266
	267
[28067]	268	$tmpfile_regex = "(<(URL\|UTF8URL\|gsdlconvertedfilename\|OrigSource)>(http://)?)($gsdlhome_re)?(/tmp/)?.*?(\..{3,4})";
[27766]	269	if($5) {
	270	$test_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
	271	} else { # OrigSource contains only the filename
	272	$test_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
	273	}
	274
	275	# modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename
[28067]	276	$tmpfile_regex = "(<(URL\|UTF8URL\|gsdlconvertedfilename\|OrigSource)>(http://)?)(.)?(/tmp/)?.?(\..{3,4})";
[27766]	277	if($5) {
	278	$model_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
	279	} else { # OrigSource contains only the filename
	280	$model_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
	281	}
	282
	283	# index/col.gdb also has entries for the random tmp file names in the form: [http://research/ak19/GS2bin_5July2013/tmp/F639.html]
[27767]	284	# need to equalise these also. Sadly, when there are multiple intermediate files, their random tmp filenames are not
	285	# guaranteed to be generated in the same (alphabetical/numerical) order between model and test collection, so the
	286	# HASH OIDs, although all of them accounted for, appear in a different order. So we have to remove the Hash OIDs.
	287	#$test_text =~ s@\[http://./tmp/.(\..{3,4})\]@tmp/random$1@mg; # HASH OIDs can appear in different order
	288	#$model_text =~ s@\[http://./tmp/.(\..{3,4})\]@tmp/random$1@mg;
[28067]	289
[28071]	290	$test_text =~ s@\[http://[^\n]?/tmp/.?(\..{3,4})\]\n<section>([^\n]*?)\n@[tmp/random$1\n<section>$2]\n@sg;
	291	$model_text =~ s@\[http://[^\n]?/tmp/.?(\..{3,4})\]\n<section>([^\n]*?)\n@[tmp/random$1\n<section>$2]\n@sg;
	292
	293	# need to re- sort the keys, now that the absolute paths to tmp locations has been removed
	294	# so that we get the tmp files in the same order in both model and test collections
	295
[28661]	296	$model_text = text_to_db_to_text($model_text, "model.gdb");
	297	$test_text = text_to_db_to_text($test_text, "test.gdb");
[27766]	298	}
[27604]	299
[29495]	300	# now can go back to using $model_text and $test_text
	301
	302	if($debugging) {
	303	print_string_to_file($test_text, $savepath.$dbname."_test.out");
	304	print_string_to_file($model_text, $savepath.$dbname."_model.out");
	305	}
[34955]	306
[21711]	307	my $report_type = "OldStyle"; # Can not change this type.
	308	my $diff_gdb = diff \$model_text, \$test_text, { STYLE => $report_type };
	309
[27604]	310	# leaving the ignore regex as it used to be in the following, in case it helps with single line comparisons
[27725]	311	$diff_gdb = &diffutil::GenerateOutput($diff_gdb,"^<(lastmodified\|lastmodifieddate\|oailastmodified\|oailastmodifieddate\|ImageSize)>.*");
[21711]	312
	313	if($diff_gdb eq "")
	314	{
	315	return "";
	316	}
	317	else
	318	{
	319	return "Difference Report: Differences found in the Database file: \n$diff_gdb";
	320	}
	321	# Call diff?
	322	}
	323
[36807]	324	# Unused, but may come in handy when debugging again: regex substitution helper function
	325	sub sort_field_value {
	326	my($fieldname, $fieldvalue, $displayStr, $debugging) = @_;
	327
	328	print STDERR "\n$displayStr BEFORE sort: <$fieldname>$fieldvalue\n" if($debugging);
	329
	330	$fieldvalue =~ s@(\r\|\n\|\\n)*$@@; # get rid of trailing newlines/carriage returns
	331	my @values_list = split(';', $fieldvalue);
	332	@values_list = sort @values_list;
	333	$fieldvalue = "<$fieldname>".join(';', @values_list). "\n";
	334
	335	print STDERR "$displayStr AFTER sort: $fieldvalue\n" if($debugging);
	336
	337	return $fieldvalue;
	338	}
	339
	340
[27695]	341	# returns true if the contents are windows AND it matters for the diffing on the db that it's windows
	342	# For col.gdb it does not seem to matter so far, if it is generated on a windows machine and to be compared to a linux-generated col.gdb
	343	sub isDBWindowsSensitive
	344	{
	345	my ($dbtailname, $db_contents) = @_; # db filename without suffix
	346
[28019]	347	# return ($db_contents =~ m/\\/) ? 1 : 0; # windows slashes detected. Better test would be: [Something\something] OR <tag>something\something
	348
	349	if($dbtailname =~ m/^archiveinf-doc/) {
	350	return ($db_contents =~ m@<src-file>[a-zA-Z]:\\@) ? 1 : 0; # <src-file>C:\path
	351	}
	352	elsif($dbtailname =~ m/^archiveinf-src/) { # <src-file>C:\path
	353	return ($db_contents =~ m@\[[a-zA-Z]:\\@) ? 1 : 0; # [C:\path]
	354	}
	355	else { # index/col.gdb file
	356	if ($db_contents =~ m@<URL>http://[a-zA-Z]:/@) { # <URL>http://C:/path
	357	return 1;
	358	}
	359	elsif ($db_contents =~ m@^(<URL>http://[a-zA-Z]:/)\|(<null_file>[^\\]*\\)@m) { # <URL>http://C:/path OR <null_file>CMSwp-all.00000001\\00000035.nul
	360	return 1;
	361	}
[28086]	362	elsif ($db_contents =~ m@^(<ex.File.Directory>[a-zA-Z]:\\\\)@m) { # <ex.File.Directory>C:\\path\\path for OAI collection
	363	return 1;
	364	}
[28019]	365	return 0;
	366	}
[27695]	367	}
	368
[21711]	369	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm@ 36807

Download in other formats: