Context Navigation

source: other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm@ 27767

Last change on this file since 27767 was 27767, checked in by ak19, 11 years ago
Fixes to previous commit: the random file names (created by PDFBox for its intermediate html files) are generated with random numbers. As a result, the HASH OIDs of these intermediate source files, although constant between model and test collections, can therefore appear in a different order in the model and the test collection when both are ordered alphabetically in the index/col.gdb file.
File size: 10.9 KB

Rev	Line
[21711]	1	package gdbdiff;
	2
	3	BEGIN {
	4	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
	5	die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
	6	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
	7	unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
	8	}
	9
	10	use util;
	11	use diffutil;
	12	use Text::Diff;
[27701]	13	use Cwd;
[21711]	14
[27696]	15	if ($^O =~ m/mswin/i) {
	16	require Win32; # for working out Windows Long Filenames from Win 8.3 short filenames
	17	}
[27695]	18
[21711]	19	sub readin_gdb
	20	{
	21	my ($cmd) = @_;
	22
	23	open(PIN,"$cmd\|")
	24	\|\| die "Unable to open pipe to $cmd: $!\n";
	25
	26	my $text_content = "";
	27
	28	while (defined (my $line = <PIN>)) {
	29	$text_content .= $line;
	30	}
	31
	32	close(PIN);
	33	return $text_content;
	34	}
	35
[27695]	36	# for debugging. Prints txt contents of db to file
	37	sub print_string_to_file
	38	{
	39	my ($text, $outfile) = @_;
	40
	41	open(FOUT, ">$outfile") or die "ERROR failed to write to $outfile: $!\n";
	42	print FOUT $text;
	43	close(FOUT);
	44	}
[21711]	45
	46	sub test_gdb
	47	{
[27604]	48	my ($full_modeldb, $full_testdb,$strColName) = @_;
[21711]	49
	50	# print "Now is testing database\n";
[27695]	51	my ($dbname, $dirname, $suffix)= &File::Basename::fileparse($full_testdb, "\\.[^\\.]+\$");
	52
[27604]	53	# need to sort text output of both test and model col database files, to normalise them for the comparison
	54	# the -sort option to db2txt was added specifically to support diffcol
	55	my $model_cmd = "db2txt -sort $full_modeldb 2>&1";
	56	my $test_cmd = "db2txt -sort $full_testdb 2>&1";
[21711]	57
	58	my $model_text = readin_gdb($model_cmd);
	59	my $test_text = readin_gdb($test_cmd);
[27604]	60
[27743]	61	# my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory
[27695]	62	# print_string_to_file($test_text, $savepath.$dbname."_test.out");
	63	# print_string_to_file($model_text, $savepath.$dbname."_model.out");
[27604]	64
	65	# filter out the fields that can be ignored in the two database files
[27701]	66	# The total_numbytes field can vary depending on how many backslashes exist in the urls in the main body text, as each
	67	# of these windows slashes get escaped with another backslash, and the resulting string is used as key into rel link db
[27730]	68	my $ignore_line_re = "\n<(lastmodified\|lastmodifieddate\|oailastmodified\|oailastmodifieddate\|ex.File.FileModifyDate\|ex.File.FilePermissions\|total_numbytes)>([^\n])*";
[27604]	69	$model_text =~ s/$ignore_line_re//g;
	70	$test_text =~ s/$ignore_line_re//g;
	71
[27730]	72	# tmp dirs have subdirs with random numbers in name, remove subdir
[27766]	73	# these tmpdirs are located inside the collection directory
[27730]	74	$model_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
	75	$test_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
[27701]	76
[27695]	77	# if the OS doesn't match and one of them is windows, extra work needs to be done to bring the db files
	78	# in test and model collection to an even base for comparison
	79
	80	my $testIsWin = &isDBWindowsSensitive($dbname, $test_text);
	81	my $modelIsWin = &isDBWindowsSensitive($dbname, $model_text);
	82
	83	if($testIsWin == $modelIsWin) { # both linux or both windows, do the basic test we did on linux machines:
	84	# ignore absolute path prefixes in modelcol and testcol (necessary for archiveinf-doc and -src.gdb files)
[27604]	85
[27695]	86	# Remember the original model col on SVN could have been built anywhere,
	87	# and in the gdb files, absolute paths are stored to the collection location.
	88	# Crop these paths to the collect/<colname> point.
	89
	90	# Entries are of the form [Entry] or <Entry>. In order to do a sensible diff,
	91	# need to remove the prefix to the collect/colname folder in any (absolute) path that occurs in Entry
	92	# E.g. [/full/path/collect/colname/import/file.ext] should become [collect/colname/import/file.ext]
	93	# Better regex is of the form /BEGIN((?:(?!BEGIN).)*)END/, see http://docstore.mik.ua/orelly/perl/cookbook/ch06_16.htm
[27604]	94
[27743]	95	$model_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg;
	96	$test_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg;
	97	#$model_text =~ s@^([^\\//]).(\\\|/)(collect(\\\|/)$strColName)(.*)$@$1$3$5@mg;
	98	#$test_text =~ s@^([^\\//]).(\\\|/)(collect(\\\|/)$strColName)(.*)$@$1$3$5@mg;
[27695]	99	}
	100
	101	else { # one of the collections was built on windows
	102	# handling slashes and other differences between a model coll built on one OS (e.g. linux)
	103	# and a test collection built and diffed on another OS (windows)
	104
	105	my ($win_text, $lin_text); # references
	106	if($testIsWin) {
	107	$win_text = \$test_text;
	108	$lin_text = \$model_text;
	109	} else {
	110	$win_text = \$model_text;
	111	$lin_text = \$test_text;
	112	}
	113
	114	if($dbname =~ m/archiveinf-doc/) {
	115	my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
	116
	117	# convert short filenames to long perl:
	118	# http://www.mombu.com/programming/perl/t-convert-dos-83-filenames-to-win32-long-filenames-using-perl-525448.html
	119	for my $line (split /^/, $$win_text) { # split the string into newlines
	120
[27743]	121	# assoc-file and meta-file contain filepaths, ensure these are long windows file paths now (will later convert to linux slashes)
	122	if($line =~ m@^<(assoc-file\|meta-file)>(.*)(\s+)@s) {
	123	$line = $2; # may be a short file name
[27695]	124	# perhaps test here if it is a shortfilename? should match /CAPS....~number(.ext)/
	125
[27743]	126	$line = "<$1>".&Win32::GetLongPathName($line)."$3"; # make it a long file name and prefix assoc-file/meta-file tagname to it again
[27695]	127	}
	128	$tmp .= $line;
	129	}
	130	$$win_text = $tmp;
	131	}
	132
	133
[27743]	134	# index gdb file
	135	if($dbname =~ m/$strColName/) {
	136	my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
	137	for my $line (split /^/, $$win_text) { # split the string into newlines
	138
	139	if($line =~ m@^<ex.File.Directory>(.*)@s) { # word-pdf collection contains double windows backslashes
	140	(my $escaped_path = $1) =~ s@\\\\@\\@g;
	141	$line = "<ex.File.Directory>$escaped_path";
	142	}
	143	elsif($line =~ m@^<Title>(.*)@s) {
	144	# print STDERR "***** TITLE: \|$1\|\n";
	145
	146	# word-pdf collection: Title of ps files contain new lines at end when
	147	# GreenstoneXMLPlugin::xml_end_tag() writes the Title back out after utf8 decode
	148	# if($metadata_name eq "Title") { $metadata_value =~ s/[\n\r]*$//; }
	149
	150	(my $title = $1) =~ s@(\r\|\n\|\\n)*$@@; # get rid of trailing newlines/carriage returns
	151	$line = "<Title>$title\n"; # add single newline
	152	}
	153	$tmp .= $line;
	154	}
	155	$$win_text = $tmp;
	156	}
	157
	158
	159	# slashes in windows metadata text need to be turned into linux style slashes
	160	$$win_text =~ s@\\@/@g; #$$win_text =~ s@\\([^n\|r\|\\|"])@/$1@g; # filepath something\rtf remains something\rtf
	161
[27695]	162	# cut down absolute paths to files to just collect/colname/.../file, same as before
[27743]	163	$$lin_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg; # $$lin_text =~ s@^([^\\\/](//)?).(\\\|/)(collect(\\\|/)$strColName)(.*)$@$1$4$6@mg;
	164	$$win_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg;
[27695]	165
	166	# for the windows text, need to further get rid of the driveletter after [ or <meta>
[27701]	167	$$win_text =~ s@^(\[\|<[^>]*>)[a-zA-Z]:collect@$1collect@mg;
[27695]	168
	169	} # end of equalising differences between a windows collection's db file and linux coll's db file
[27766]	170
	171	# The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox
	172	# These tmpdirs are located inside the toplevel greenstone directory
	173	(my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;
[27767]	174	$gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'};
[27766]	175	my $tmpfile_regex = "<URL>http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long
	176	if($test_text =~ m@$tmpfile_regex@g) {
	177	# found a match, replace the tmp file name with "random", keeping the original file extension
	178	# in <OrigSource\|URL\|UTF8URL\|gsdlconvertedfilename>
	179
	180	# This code is slightly different from doc.xml because each document has its own doc.xml, so this needs to be done
	181	# only once for doc.xml, but multiple times in index/col.gdb since it contains the random filenames of all docs in the col
	182	#my ($old_tmp_filename, $ext) = ($1, $2);
	183
	184	my $new_tmp_filename = "random";
	185
	186
	187	$tmpfile_regex = "(<(URL\|UTF8URL\|gsdlconvertedfilename\|OrigSource)>(http://)?)($gsdlhome_re)?(/tmp/)?.*(\..{3,4})";
	188	if($5) {
	189	$test_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
	190	} else { # OrigSource contains only the filename
	191	$test_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
	192	}
	193
	194	# modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename
	195	$tmpfile_regex = "(<(URL\|UTF8URL\|gsdlconvertedfilename\|OrigSource)>(http://)?)(.)?(/tmp/)?.(\..{3,4})";
	196	if($5) {
	197	$model_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
	198	} else { # OrigSource contains only the filename
	199	$model_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
	200	}
	201
	202	# index/col.gdb also has entries for the random tmp file names in the form: [http://research/ak19/GS2bin_5July2013/tmp/F639.html]
[27767]	203	# need to equalise these also. Sadly, when there are multiple intermediate files, their random tmp filenames are not
	204	# guaranteed to be generated in the same (alphabetical/numerical) order between model and test collection, so the
	205	# HASH OIDs, although all of them accounted for, appear in a different order. So we have to remove the Hash OIDs.
	206	#$test_text =~ s@\[http://./tmp/.(\..{3,4})\]@tmp/random$1@mg; # HASH OIDs can appear in different order
	207	#$model_text =~ s@\[http://./tmp/.(\..{3,4})\]@tmp/random$1@mg;
	208	$test_text =~ s@\[http://./tmp/.(\..{3,4})\]\n<section>[^\n]*\n@tmp/random$1\n<section>RandomHash\n@sg;
	209	$model_text =~ s@\[http://./tmp/.(\..{3,4})\]\n<section>[^\n]*\n@tmp/random$1\n<section>RandomHash\n@sg;
[27766]	210	}
[27604]	211
[27695]	212	# now can go back to using $model_text and $test_text
[27701]	213	# print_string_to_file($test_text, $savepath.$dbname."_test.out");
	214	# print_string_to_file($model_text, $savepath.$dbname."_model.out");
[27695]	215
[21711]	216	my $report_type = "OldStyle"; # Can not change this type.
	217	my $diff_gdb = diff \$model_text, \$test_text, { STYLE => $report_type };
	218
[27604]	219	# leaving the ignore regex as it used to be in the following, in case it helps with single line comparisons
[27725]	220	$diff_gdb = &diffutil::GenerateOutput($diff_gdb,"^<(lastmodified\|lastmodifieddate\|oailastmodified\|oailastmodifieddate\|ImageSize)>.*");
[21711]	221
	222	if($diff_gdb eq "")
	223	{
	224	return "";
	225	}
	226	else
	227	{
	228	return "Difference Report: Differences found in the Database file: \n$diff_gdb";
	229	}
	230	# Call diff?
	231	}
	232
[27695]	233	# returns true if the contents are windows AND it matters for the diffing on the db that it's windows
	234	# For col.gdb it does not seem to matter so far, if it is generated on a windows machine and to be compared to a linux-generated col.gdb
	235	sub isDBWindowsSensitive
	236	{
	237	my ($dbtailname, $db_contents) = @_; # db filename without suffix
	238
[27743]	239	#if($dbtailname !~ m/archiveinf/) { # only archiveinf-doc and archive-inf source need special Windows processing, not col.gdb
	240	# return 0;
	241	#}
[27695]	242	return ($db_contents =~ m/\\/) ? 1 : 0; # windows slashes detected. Better test would be: [Something\something] OR <tag>something\something
	243	# for doc.xml:
	244	# <Metadata name="gsdlsourcefilename">import/html_files/cleves.html</Metadata>
	245	}
	246
[21711]	247	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: