source: other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm@ 27730

Last change on this file since 27730 was 27730, checked in by ak19, 11 years ago

More diffing issues detected when diffcol ran over the first Word and PDF tutorial. 1. Two ex.File.* fields differ, one to do with date/timestamp and another with permissions (the latter might be avoidable). These 2 ex.File.* metadata fields have been marked for ignoring. 2. The building generated tmp folders containing randomly named subfolders whose names consisted only of digits, possibly named using timestamps. These differ between builds and needed to be normalised also. One final issue remains, and that is that a ghostscript document, one on which the gs command failed, is converted to html with a different line-lengths or line-wraps on the CentOS in which the model col was generated and the Ubuntu on which the testcol was generated. This results in the entire content section of the doc.xml to be marked different between model and test.

File size: 6.9 KB
Line 
1package gdbdiff;
2
3BEGIN {
4 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
5 die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
6 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
7 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
8}
9
10use util;
11use diffutil;
12use Text::Diff;
13use Cwd;
14
15if ($^O =~ m/mswin/i) {
16 require Win32; # for working out Windows Long Filenames from Win 8.3 short filenames
17}
18
19sub readin_gdb
20{
21 my ($cmd) = @_;
22
23 open(PIN,"$cmd|")
24 || die "Unable to open pipe to $cmd: $!\n";
25
26 my $text_content = "";
27
28 while (defined (my $line = <PIN>)) {
29 $text_content .= $line;
30 }
31
32 close(PIN);
33 return $text_content;
34}
35
36# for debugging. Prints txt contents of db to file
37sub print_string_to_file
38{
39 my ($text, $outfile) = @_;
40
41 open(FOUT, ">$outfile") or die "ERROR failed to write to $outfile: $!\n";
42 print FOUT $text;
43 close(FOUT);
44}
45
46sub test_gdb
47{
48 my ($full_modeldb, $full_testdb,$strColName) = @_;
49
50 # print "Now is testing database\n";
51 my ($dbname, $dirname, $suffix)= &File::Basename::fileparse($full_testdb, "\\.[^\\.]+\$");
52
53 # need to sort text output of both test and model col database files, to normalise them for the comparison
54 # the -sort option to db2txt was added specifically to support diffcol
55 my $model_cmd = "db2txt -sort $full_modeldb 2>&1";
56 my $test_cmd = "db2txt -sort $full_testdb 2>&1";
57
58 my $model_text = readin_gdb($model_cmd);
59 my $test_text = readin_gdb($test_cmd);
60
61# my $savepath = &getcwd."/../"; # TASK_HOME env does not exist at this stage, but it's one level up from current directory
62# print_string_to_file($test_text, $savepath.$dbname."_test.out");
63# print_string_to_file($model_text, $savepath.$dbname."_model.out");
64
65 # filter out the fields that can be ignored in the two database files
66 # The total_numbytes field can vary depending on how many backslashes exist in the urls in the main body text, as each
67 # of these windows slashes get escaped with another backslash, and the resulting string is used as key into rel link db
68 my $ignore_line_re = "\n<(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|total_numbytes)>([^\n])*";
69 $model_text =~ s/$ignore_line_re//g;
70 $test_text =~ s/$ignore_line_re//g;
71
72 # tmp dirs have subdirs with random numbers in name, remove subdir
73 $model_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
74 $test_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
75
76 # if the OS doesn't match and one of them is windows, extra work needs to be done to bring the db files
77 # in test and model collection to an even base for comparison
78
79 my $testIsWin = &isDBWindowsSensitive($dbname, $test_text);
80 my $modelIsWin = &isDBWindowsSensitive($dbname, $model_text);
81
82 if($testIsWin == $modelIsWin) { # both linux or both windows, do the basic test we did on linux machines:
83 # ignore absolute path prefixes in modelcol and testcol (necessary for archiveinf-doc and -src.gdb files)
84
85 # Remember the original model col on SVN could have been built anywhere,
86 # and in the gdb files, absolute paths are stored to the collection location.
87 # Crop these paths to the collect/<colname> point.
88
89 # Entries are of the form [Entry] or <Entry>. In order to do a sensible diff,
90 # need to remove the prefix to the collect/colname folder in any (absolute) path that occurs in Entry
91 # E.g. [/full/path/collect/colname/import/file.ext] should become [collect/colname/import/file.ext]
92 # Better regex is of the form /BEGIN((?:(?!BEGIN).)*)END/, see http://docstore.mik.ua/orelly/perl/cookbook/ch06_16.htm
93
94 $model_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;
95 $test_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;
96 }
97
98 else { # one of the collections was built on windows
99 # handling slashes and other differences between a model coll built on one OS (e.g. linux)
100 # and a test collection built and diffed on another OS (windows)
101
102 my ($win_text, $lin_text); # references
103 if($testIsWin) {
104 $win_text = \$test_text;
105 $lin_text = \$model_text;
106 } else {
107 $win_text = \$model_text;
108 $lin_text = \$test_text;
109 }
110
111 if($dbname =~ m/archiveinf-doc/) {
112 my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
113
114 # convert short filenames to long perl:
115 # http://www.mombu.com/programming/perl/t-convert-dos-83-filenames-to-win32-long-filenames-using-perl-525448.html
116 for my $line (split /^/, $$win_text) { # split the string into newlines
117
118 if($line =~ m@^<assoc-file>(.*)(\s+)@s) {
119 $line = $1; # may be a short file name
120 # perhaps test here if it is a shortfilename? should match /CAPS....~number(.ext)/
121
122 $line = "<assoc-file>".&Win32::GetLongPathName($line)."$2"; # make it a long file name and prefix assoc-file to it again
123 }
124 $tmp .= $line;
125 }
126 $$win_text = $tmp;
127 }
128
129 # slashes in windows text need to be turned into linux style slashes
130 $$win_text =~ s@\\@/@g;
131
132 # cut down absolute paths to files to just collect/colname/.../file, same as before
133 $$lin_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;
134 $$win_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;
135
136 # for the windows text, need to further get rid of the driveletter after [ or <meta>
137 $$win_text =~ s@^(\[|<[^>]*>)[a-zA-Z]:collect@$1collect@mg;
138
139 } # end of equalising differences between a windows collection's db file and linux coll's db file
140
141 # now can go back to using $model_text and $test_text
142# print_string_to_file($test_text, $savepath.$dbname."_test.out");
143# print_string_to_file($model_text, $savepath.$dbname."_model.out");
144
145 my $report_type = "OldStyle"; # Can not change this type.
146 my $diff_gdb = diff \$model_text, \$test_text, { STYLE => $report_type };
147
148 # leaving the ignore regex as it used to be in the following, in case it helps with single line comparisons
149 $diff_gdb = &diffutil::GenerateOutput($diff_gdb,"^<(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ImageSize)>.*");
150
151 if($diff_gdb eq "")
152 {
153 return "";
154 }
155 else
156 {
157 return "Difference Report: Differences found in the Database file: \n$diff_gdb";
158 }
159 # Call diff?
160}
161
162# returns true if the contents are windows AND it matters for the diffing on the db that it's windows
163# For col.gdb it does not seem to matter so far, if it is generated on a windows machine and to be compared to a linux-generated col.gdb
164sub isDBWindowsSensitive
165{
166 my ($dbtailname, $db_contents) = @_; # db filename without suffix
167
168 if($dbtailname !~ m/archiveinf/) { # only archiveinf-doc and archive-inf source need special Windows processing, not col.gdb
169 return 0;
170 }
171 return ($db_contents =~ m/\\/) ? 1 : 0; # windows slashes detected. Better test would be: [Something\something] OR <tag>something\something
172 # for doc.xml:
173 # <Metadata name="gsdlsourcefilename">import/html_files/cleves.html</Metadata>
174}
175
1761;
Note: See TracBrowser for help on using the repository browser.