Context Navigation

source: other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm@ 28661

Last change on this file since 28661 was 28661, checked in by ak19, 10 years ago
Committing the next installment of code to handle diffcol for GS3. Now it successfully compiles up GS3, while diffcol still works for GS2.
File size: 14.2 KB

Rev	Line
[21711]	1	package gdbdiff;
	2
	3	BEGIN {
	4	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
	5	die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
	6	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
	7	unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
	8	}
	9
	10	use util;
	11	use diffutil;
	12	use Text::Diff;
[27701]	13	use Cwd;
[21711]	14
[27696]	15	if ($^O =~ m/mswin/i) {
	16	require Win32; # for working out Windows Long Filenames from Win 8.3 short filenames
	17	}
[27695]	18
# Runs a shell command and returns everything it writes to its standard
# output as a single string.
#
# Parameters:
#   $cmd - the complete command line to run. It is handed to the shell as-is,
#          so redirections such as "2>&1" inside it are honoured.
# Returns:
#   the captured output ("" when the command prints nothing).
# Dies:
#   when the pipe to the command cannot be opened.
sub readin_gdb
{
    my ($cmd) = @_;

    # Three-argument open with a lexical handle: the handle cannot clash with
    # any other bareword handle in the program, and the pipe direction is
    # stated explicitly instead of being parsed out of the command string.
    open(my $pin, '-|', $cmd)
        or die "Unable to open pipe to $cmd: $!\n";

    # join() over the list of lines yields "" for a command with no output.
    my $text_content = join('', <$pin>);

    # The command's exit status is not inspected here; whatever it printed
    # (including stderr, if the caller redirected it) is returned verbatim.
    close($pin);

    return $text_content;
}
	35
# Dumps a database file to text with "db2txt -sort" and returns that text.
#
# The output of both the test and the model collection's database files has
# to be sorted so the two are normalised for comparison; the -sort option to
# db2txt was added specifically to support diffcol.
#
# Parameters:
#   $db_file - path of the database file to dump.
# Returns:
#   the text produced by the command (stderr is folded into it via 2>&1).
sub read_db
{
    my ($db_file) = @_;

    if ($db_file =~ m/\.jdb$/) {
        # .jdb files are only reported; the db2txt command below is still run.
        print STDERR "NOT YET IMPLEMENTED\n";
        #$db_cmd = "jdb2txt -sort \"$db_file\" 2>&1";
    }

    # Quote the path so that a database located under a directory whose name
    # contains spaces is still passed to db2txt as one argument.
    my $db_cmd = "db2txt -sort \"$db_file\" 2>&1";

    return readin_gdb($db_cmd);
}
	50
	51	sub text_to_db_to_text
	52	{
	53	my($db_text, $db_filename) = @_;
	54
	55	# http://stackoverflow.com/questions/1909262/how-can-i-pipe-input-into-a-java-command-from-perl
	56	open PIPE, "\| txt2db $db_filename";
	57	print PIPE "$db_text";
	58	close(PIPE);
	59
	60	return &read_db("$db_filename");
	61	}
	62
# For debugging: writes the given text (e.g. the txt contents of a db) to a
# file, replacing any previous contents of that file.
#
# Parameters:
#   $text    - the string to write.
#   $outfile - path of the file to (over)write.
# Dies:
#   when the file cannot be opened or the data cannot be flushed on close.
sub print_string_to_file
{
    my ($text, $outfile) = @_;

    # Three-argument open: the mode is never parsed out of $outfile, so a
    # path with leading/trailing whitespace or mode characters is safe.
    open(my $fout, '>', $outfile)
        or die "ERROR failed to write to $outfile: $!\n";

    print {$fout} $text;

    # Buffered write errors (disk full etc.) only show up at close time.
    close($fout)
        or die "ERROR failed to write to $outfile: $!\n";
}
[21711]	72
	73	sub test_gdb
	74	{
		# test_gdb: compares the text dump of a model collection's database file with that of a test collection.
		# Both dumps are obtained with read_db() and then normalised (volatile fields removed, tmp dir names and
		# absolute path prefixes cropped, Windows/Linux differences evened out) before being diffed with Text::Diff.
		#
		# Parameters (positional):
		#   $full_modeldb, $full_testdb - paths of the two database files, passed to read_db()
		#   $strColName                 - collection name; interpolated into the path-cropping regexes below
		#   $test_os, $model_os         - "windows", any other value (treated as not-windows), or "compute" to let
		#                                 isDBWindowsSensitive() decide from the database contents
		#   $strTestCol, $strModelCol   - collection paths; the one belonging to the windows side is substituted
		#                                 for the @THISCOLLECTPATH@ placeholder in archiveinf-doc entries
		# Returns "" when nothing is left of the diff after diffutil::GenerateOutput(), otherwise a string starting
		# with "Difference Report: " followed by the remaining diff.
[28238]	75	my ($full_modeldb, $full_testdb, $strColName, $test_os, $model_os, $strTestCol, $strModelCol) = @_;
[21711]	76
	77	# print "Now is testing database\n";
		# $dbname (file name without suffix) selects which normalisation branches apply further down
[27695]	78	my ($dbname, $dirname, $suffix)= &File::Basename::fileparse($full_testdb, "\\.[^\\.]+\$");
	79
[28661]	80	my $model_text = read_db($full_modeldb);
	81	my $test_text = read_db($full_testdb);
[21711]	82
[27743]	83	# my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory
[28005]	84	# print_string_to_file($test_text, $savepath.$dbname."_test.out1");
	85	# print_string_to_file($model_text, $savepath.$dbname."_model.out1");
[27604]	86
	87	# filter out the fields that can be ignored in the two database files
[27701]	88	# The total_numbytes field can vary depending on how many backslashes exist in the urls in the main body text, as each
	89	# of these windows slashes get escaped with another backslash, and the resulting string is used as key into rel link db
		# (this pattern is built as a double-quoted string, where "\|" collapses to "|", so it does alternate between the field names)
[28086]	90	my $ignore_line_re = "\n<(FileSize\|lastmodified\|lastmodifieddate\|oailastmodified\|oailastmodifieddate\|ex.File.FileModifyDate\|ex.File.FilePermissions\|total_numbytes\|ex.Composite.LightValue)>([^\n])*";
[27604]	91	$model_text =~ s/$ignore_line_re//g;
	92	$test_text =~ s/$ignore_line_re//g;
	93
[27730]	94	# tmp dirs have subdirs with random numbers in name, remove subdir
[27766]	95	# these tmpdirs are located inside the collection directory
[27730]	96	$model_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
	97	$test_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
[27701]	98
[27695]	99	# if the OS doesn't match and one of them is windows, extra work needs to be done to bring the db files
	100	# in test and model collection to an even base for comparison
	101
[28172]	102	my $testIsWin = ($test_os ne "compute") ? ($test_os eq "windows") : &isDBWindowsSensitive($dbname, $test_text);
	103	my $modelIsWin = ($model_os ne "compute") ? ($model_os eq "windows") : &isDBWindowsSensitive($dbname, $model_text);
[27695]	104
		# numeric comparison of two truth values: a false result of eq is "" and compares equal to the 0 returned by isDBWindowsSensitive()
[28086]	105	if($testIsWin == $modelIsWin) {
	106	# both linux or both windows, do the basic test we did on linux machines:
[27695]	107	# ignore absolute path prefixes in modelcol and testcol (necessary for archiveinf-doc and -src.gdb files)
[27604]	108
[27695]	109	# Remember the original model col on SVN could have been built anywhere,
	110	# and in the gdb files, absolute paths are stored to the collection location.
	111	# Crop these paths to the collect/<colname> point.
	112
	113	# Entries are of the form [Entry] or <Entry>. In order to do a sensible diff,
	114	# need to remove the prefix to the collect/colname folder in any (absolute) path that occurs in Entry
	115	# E.g. [/full/path/collect/colname/import/file.ext] should become [collect/colname/import/file.ext]
	116	# Better regex is of the form /BEGIN((?:(?!BEGIN).)*)END/, see http://docstore.mik.ua/orelly/perl/cookbook/ch06_16.htm
[27604]	117
		# NOTE(review): inside a regex literal "\|" matches a literal pipe character, not alternation, and "." here
		# matches a single character only - presumably "|" and quantified ".*" were intended; confirm against the repository copy
[27743]	118	$model_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg;
	119	$test_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg;
	120	#$model_text =~ s@^([^\\//]).(\\\|/)(collect(\\\|/)$strColName)(.*)$@$1$3$5@mg;
	121	#$test_text =~ s@^([^\\//]).(\\\|/)(collect(\\\|/)$strColName)(.*)$@$1$3$5@mg;
[27695]	122	}
	123
	124	else { # one of the collections was built on windows
	125	# handling slashes and other differences between a model coll built on one OS (e.g. linux)
	126	# and a test collection built and diffed on another OS (windows)
	127
		# $win_text / $lin_text are scalar references, so edits through them change $test_text / $model_text directly
	128	my ($win_text, $lin_text); # references
[28238]	129	my $collection_path = $strTestCol; # full path to a windows collection
	130
[27695]	131	if($testIsWin) {
[28238]	132	$collection_path = $strTestCol; # test collection path is windows
[27695]	133	$win_text = \$test_text;
	134	$lin_text = \$model_text;
	135	} else {
[28238]	136	$collection_path = $strModelCol; # model collection path is windows
[27695]	137	$win_text = \$model_text;
	138	$lin_text = \$test_text;
	139	}
	140
	141	if($dbname =~ m/archiveinf-doc/) {
[28238]	142
		# copy of the collection path with each backslash doubled, for use inside a regex (only backslashes are escaped)
	143	(my $collection_path_re = $collection_path) =~ s@\\@\\\\@g;
	144
[27695]	145	my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
	146
	147	# convert short filenames to long perl:
	148	# http://www.mombu.com/programming/perl/t-convert-dos-83-filenames-to-win32-long-filenames-using-perl-525448.html
	149	for my $line (split /^/, $$win_text) { # split the string into newlines
	150
[27743]	151	# assoc-file and meta-file contain filepaths, ensure these are long windows file paths now (will later convert to linux slashes)
		# NOTE(review): "\|" in this regex literal matches a literal pipe; presumably alternation was intended - confirm
[28224]	152	if($line =~ m@^<(assoc-file\|meta-file\|src-file)>(.*)(\s+)@s) {
[28238]	153	my ($field, $value, $suffix) = ($1, $2, $3);
	154	$line = $value; # it may be a short file name
[28224]	155
[28238]	156	if($line !~ m/^\@/) { # if the path doesn't use a "relative" @GSPATH@ placeholder string, but is an absolute path instead
	157	# perhaps test here if it is a shortfilename? should match /CAPS....~number(.ext)/
	158	$line = "<$field>".&Win32::GetLongPathName($line)."$suffix"; # make it a long file name and prefix assoc-file/meta-file tagname to it again
	159	}
	160	else { # if $line contains @THISCOLLECTIONPATH@, still need to deal with DOS filenames suffixes:
	161	# replace placeholder with absolute path and expand to long filename, then insert placeholder in its original place again
	162	$line =~ s/\@THISCOLLECTPATH\@/$collection_path/;
	163	$line = &Win32::GetLongPathName($line);
	164	$line =~ s/^$collection_path_re/\@THISCOLLECTPATH\@/;
	165	$line = "<$field>".$line."$suffix";
[28224]	166	}
[27695]	167	}
	168	$tmp .= $line;
	169	}
	170	$$win_text = $tmp;
	171	}
	172
	173
[27743]	174	# index gdb file
		# the db is treated as the index db when its name contains the collection name ($strColName is used as a regex here)
	175	if($dbname =~ m/$strColName/) {
	176	my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
[28086]	177	for my $line (split /^/, $$win_text) { # split the string into newlines
	178
[28005]	179	# In the following regex, add any .gdb fieldnames that represent a path and so would contain double backslashes
	180	# on Windows (to escape the single backlash of win filepaths). They will be turned into single-backslashes here,
	181	# and converted into single forward slashes futher below when the txt version of the win gdb file is normalised
	182	# to compare it with the linux version.
	183	# E.g. On windows, the Word-PDF collection(s) contains double backslashes in the ex.File.Directory field
	184	# the MARC-Exploded collection contains double backslashes in the null_file entry field of the .gdb file
		# NOTE(review): the active substitution below turns each double backslash into a forward slash directly
		# (the single-backslash variant is the commented-out alternative at the end of that line)
[28086]	185	if($line =~ m@^<(ex.File.Directory\|null_file)>(.*)@s) {
[28005]	186	my ($fieldname, $escaped_path) = ($1, $2);
[28019]	187	$escaped_path =~ s@\\\\@/@g; #(my $escaped_path = $2) =~ s@\\\\@\\@g;
[28005]	188	$line = "<$fieldname>$escaped_path";
[27743]	189	}
	190	elsif($line =~ m@^<Title>(.*)@s) {
	191	# print STDERR "***** TITLE: \|$1\|\n";
	192
	193	# word-pdf collection: Title of ps files contain new lines at end when
	194	# GreenstoneXMLPlugin::xml_end_tag() writes the Title back out after utf8 decode
	195	# if($metadata_name eq "Title") { $metadata_value =~ s/[\n\r]*$//; }
	196
	197	(my $title = $1) =~ s@(\r\|\n\|\\n)*$@@; # get rid of trailing newlines/carriage returns
	198	$line = "<Title>$title\n"; # add single newline
	199	}
	200	$tmp .= $line;
	201	}
[28019]	202	$$win_text = $tmp;
	203
	204	# slashes in windows metadata text need to be turned into linux style slashes.
	205	# index\col.gdb uses double backslashes, and single for \n,\t
	206	#$$win_text =~ s@\\\\@/@g;
[27743]	207	}
[28019]	208	else { # archiveinf gdb file
[27743]	209
[28019]	210	# slashes in windows metadata text need to be turned into linux style slashes.
	211	# In the two archivesinf gdb files, filepaths may use single backslashes
	212	$$win_text =~ s@\\@/@g; #$$win_text =~ s@\\([^n\|r\|\\|"])@/$1@g; # filepath something\rtf remains something\rtf
	213	}
[27743]	214
[27695]	215	# cut down absolute paths to files to just collect/colname/.../file, same as before
[27743]	216	$$lin_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg; # $$lin_text =~ s@^([^\\\/](//)?).(\\\|/)(collect(\\\|/)$strColName)(.*)$@$1$4$6@mg;
	217	$$win_text =~ s@^([^\\/](//)).(\\\|/)(collect(\\\|/)$strColName)(.)$@$1$4$6@mg;
[27695]	218
	219	# for the windows text, need to further get rid of the driveletter after [ or <meta>
[27701]	220	$$win_text =~ s@^(\[\|<[^>]*>)[a-zA-Z]:collect@$1collect@mg;
[27695]	221
	222	} # end of equalising differences between a windows collection's db file and linux coll's db file
[27766]	223
	224	# The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox
	225	# These tmpdirs are located inside the toplevel greenstone directory
	226	(my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;
		# NOTE(review): $$ENV{'GSDLHOME'} dereferences a scalar $ENV as a hash reference instead of reading %ENV;
		# presumably $ENV{'GSDLHOME'} was meant - confirm. With no "use strict" visible in this file the lookup
		# likely yields undef, in which case $gsdlhome_re always ends up as ".*"
[27767]	227	$gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'};
		# built as a double-quoted string: "\." collapses to a plain ".", so the pattern handed to the regex engine has unescaped dots
[28067]	228	my $tmpfile_regex = "<URL>http://$gsdlhome_re/tmp/([^\.]*?)(\..{3,4})"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long
[27766]	229	if($test_text =~ m@$tmpfile_regex@g) {
	230	# found a match, replace the tmp file name with "random", keeping the original file extension
	231	# in <OrigSource\|URL\|UTF8URL\|gsdlconvertedfilename>
	232
	233	# This code is slightly different from doc.xml because each document has its own doc.xml, so this needs to be done
	234	# only once for doc.xml, but multiple times in index/col.gdb since it contains the random filenames of all docs in the col
	235	#my ($old_tmp_filename, $ext) = ($1, $2);
	236
	237	my $new_tmp_filename = "random";
	238
	239
[28067]	240	$tmpfile_regex = "(<(URL\|UTF8URL\|gsdlconvertedfilename\|OrigSource)>(http://)?)($gsdlhome_re)?(/tmp/)?.*?(\..{3,4})";
		# NOTE(review): assigning the string above does not run a match, so $5 still belongs to the match in the
		# enclosing if(), whose pattern has only two capture groups - this test looks always-false; confirm intent
[27766]	241	if($5) {
	242	$test_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
	243	} else { # OrigSource contains only the filename
	244	$test_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
	245	}
	246
	247	# modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename
[28067]	248	$tmpfile_regex = "(<(URL\|UTF8URL\|gsdlconvertedfilename\|OrigSource)>(http://)?)(.)?(/tmp/)?.?(\..{3,4})";
		# NOTE(review): same remark as for the test text - $5 is whatever the most recent successful match left behind
[27766]	249	if($5) {
	250	$model_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
	251	} else { # OrigSource contains only the filename
	252	$model_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
	253	}
	254
	255	# index/col.gdb also has entries for the random tmp file names in the form: [http://research/ak19/GS2bin_5July2013/tmp/F639.html]
[27767]	256	# need to equalise these also. Sadly, when there are multiple intermediate files, their random tmp filenames are not
	257	# guaranteed to be generated in the same (alphabetical/numerical) order between model and test collection, so the
	258	# HASH OIDs, although all of them accounted for, appear in a different order. So we have to remove the Hash OIDs.
	259	#$test_text =~ s@\[http://./tmp/.(\..{3,4})\]@tmp/random$1@mg; # HASH OIDs can appear in different order
	260	#$model_text =~ s@\[http://./tmp/.(\..{3,4})\]@tmp/random$1@mg;
[28067]	261
		# fold the <section> line that follows a [http://.../tmp/...] key into the key itself, so the key stays distinct per entry
[28071]	262	$test_text =~ s@\[http://[^\n]?/tmp/.?(\..{3,4})\]\n<section>([^\n]*?)\n@[tmp/random$1\n<section>$2]\n@sg;
	263	$model_text =~ s@\[http://[^\n]?/tmp/.?(\..{3,4})\]\n<section>([^\n]*?)\n@[tmp/random$1\n<section>$2]\n@sg;
	264
	265	# need to re- sort the keys, now that the absolute paths to tmp locations has been removed
	266	# so that we get the tmp files in the same order in both model and test collections
	267
		# text_to_db_to_text() is given the relative filenames "model.gdb" and "test.gdb" for the round trip
[28661]	268	$model_text = text_to_db_to_text($model_text, "model.gdb");
	269	$test_text = text_to_db_to_text($test_text, "test.gdb");
[27766]	270	}
[27604]	271
[27695]	272	# now can go back to using $model_text and $test_text
[27701]	273	# print_string_to_file($test_text, $savepath.$dbname."_test.out");
	274	# print_string_to_file($model_text, $savepath.$dbname."_model.out");
[27695]	275
[21711]	276	my $report_type = "OldStyle"; # Can not change this type.
	277	my $diff_gdb = diff \$model_text, \$test_text, { STYLE => $report_type };
	278
[27604]	279	# leaving the ignore regex as it used to be in the following, in case it helps with single line comparisons
[27725]	280	$diff_gdb = &diffutil::GenerateOutput($diff_gdb,"^<(lastmodified\|lastmodifieddate\|oailastmodified\|oailastmodifieddate\|ImageSize)>.*");
[21711]	281
		# an empty result means no reportable difference; anything else is wrapped in a report header
	282	if($diff_gdb eq "")
	283	{
	284	return "";
	285	}
	286	else
	287	{
	288	return "Difference Report: Differences found in the Database file: \n$diff_gdb";
	289	}
	290	# Call diff?
	291	}
	292
# Returns 1 if the database contents were generated on Windows AND that
# matters when diffing this particular database, otherwise 0.
#
# Parameters:
#   $dbtailname  - database file name without its suffix; its prefix selects
#                  which Windows fingerprint is looked for.
#   $db_contents - the text dump of the database.
#
# Fingerprints checked per database:
#   archiveinf-doc*  a <src-file> value starting with a drive letter: <src-file>C:\path
#   archiveinf-src*  a key starting with a drive letter:              [C:\path]
#   anything else    (index/col.gdb) any of:
#                      <URL>http://C:/path
#                      <null_file>CMSwp-all.00000001\\00000035.nul
#                      <ex.File.Directory>C:\\path\\path   (OAI collection)
sub isDBWindowsSensitive
{
    my ($dbtailname, $db_contents) = @_; # db filename without suffix

    # return ($db_contents =~ m/\\/) ? 1 : 0; # windows slashes detected. Better test would be: [Something\something] OR <tag>something\something

    if ($dbtailname =~ m/^archiveinf-doc/) {
        return ($db_contents =~ m@<src-file>[a-zA-Z]:\\@) ? 1 : 0; # <src-file>C:\path
    }

    if ($dbtailname =~ m/^archiveinf-src/) {
        return ($db_contents =~ m@\[[a-zA-Z]:\\@) ? 1 : 0; # [C:\path]
    }

    # index/col.gdb file

    # <URL>http://C:/path
    return 1 if $db_contents =~ m@<URL>http://[a-zA-Z]:/@;

    # <URL>http://C:/path OR <null_file>CMSwp-all.00000001\\00000035.nul
    # The two alternatives are joined with a real alternation "|"; an escaped
    # "\|" would demand a literal pipe between them and never fire for
    # <null_file> entries. As in the original pattern, ^ binds to the <URL>
    # alternative only.
    # NOTE(review): [^\\]* also spans newlines, so any backslash after a
    # <null_file> tag triggers this - confirm that is acceptable.
    return 1 if $db_contents =~ m@^(<URL>http://[a-zA-Z]:/)|(<null_file>[^\\]*\\)@m;

    # <ex.File.Directory>C:\\path\\path for OAI collection
    return 1 if $db_contents =~ m@^(<ex.File.Directory>[a-zA-Z]:\\\\)@m;

    return 0;
}
	320
[21711]	321	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: