source: other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm@ 27969

Last change on this file since 27969 was 27969, checked in by ak19, 11 years ago

When test-running diffcol on the Enhanced-PDF tutorial, it would choke on intermediate ~ files, the cached directory (see previous commit), and the FileSize metadata could differ slightly between OS. Further, the XSLT processing of the report would break if error messages were truncated at an incomplete entity (& symbol) since it becomes invalid xml.

File size: 10.9 KB
Line 
1package gdbdiff;
2
3BEGIN {
4 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
5 die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
6 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
7 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
8}
9
10use util;
11use diffutil;
12use Text::Diff;
13use Cwd;
14
15if ($^O =~ m/mswin/i) {
16 require Win32; # for working out Windows Long Filenames from Win 8.3 short filenames
17}
18
19sub readin_gdb
20{
21 my ($cmd) = @_;
22
23 open(PIN,"$cmd|")
24 || die "Unable to open pipe to $cmd: $!\n";
25
26 my $text_content = "";
27
28 while (defined (my $line = <PIN>)) {
29 $text_content .= $line;
30 }
31
32 close(PIN);
33 return $text_content;
34}
35
36# for debugging. Prints txt contents of db to file
37sub print_string_to_file
38{
39 my ($text, $outfile) = @_;
40
41 open(FOUT, ">$outfile") or die "ERROR failed to write to $outfile: $!\n";
42 print FOUT $text;
43 close(FOUT);
44}
45
46sub test_gdb
47{
48 my ($full_modeldb, $full_testdb,$strColName) = @_;
49
50 # print "Now is testing database\n";
51 my ($dbname, $dirname, $suffix)= &File::Basename::fileparse($full_testdb, "\\.[^\\.]+\$");
52
53 # need to sort text output of both test and model col database files, to normalise them for the comparison
54 # the -sort option to db2txt was added specifically to support diffcol
55 my $model_cmd = "db2txt -sort $full_modeldb 2>&1";
56 my $test_cmd = "db2txt -sort $full_testdb 2>&1";
57
58 my $model_text = readin_gdb($model_cmd);
59 my $test_text = readin_gdb($test_cmd);
60
61# my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory
62# print_string_to_file($test_text, $savepath.$dbname."_test.out");
63# print_string_to_file($model_text, $savepath.$dbname."_model.out");
64
65 # filter out the fields that can be ignored in the two database files
66 # The total_numbytes field can vary depending on how many backslashes exist in the urls in the main body text, as each
67 # of these windows slashes get escaped with another backslash, and the resulting string is used as key into rel link db
68 my $ignore_line_re = "\n<(FileSize|lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|total_numbytes)>([^\n])*";
69 $model_text =~ s/$ignore_line_re//g;
70 $test_text =~ s/$ignore_line_re//g;
71
72 # tmp dirs have subdirs with random numbers in name, remove subdir
73 # these tmpdirs are located inside the collection directory
74 $model_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
75 $test_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
76
77 # if the OS doesn't match and one of them is windows, extra work needs to be done to bring the db files
78 # in test and model collection to an even base for comparison
79
80 my $testIsWin = &isDBWindowsSensitive($dbname, $test_text);
81 my $modelIsWin = &isDBWindowsSensitive($dbname, $model_text);
82
83 if($testIsWin == $modelIsWin) { # both linux or both windows, do the basic test we did on linux machines:
84 # ignore absolute path prefixes in modelcol and testcol (necessary for archiveinf-doc and -src.gdb files)
85
86 # Remember the original model col on SVN could have been built anywhere,
87 # and in the gdb files, absolute paths are stored to the collection location.
88 # Crop these paths to the collect/<colname> point.
89
90 # Entries are of the form [Entry] or <Entry>. In order to do a sensible diff,
91 # need to remove the prefix to the collect/colname folder in any (absolute) path that occurs in Entry
92 # E.g. [/full/path/collect/colname/import/file.ext] should become [collect/colname/import/file.ext]
93 # Better regex is of the form /BEGIN((?:(?!BEGIN).)*)END/, see http://docstore.mik.ua/orelly/perl/cookbook/ch06_16.htm
94
95 $model_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
96 $test_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
97 #$model_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;
98 #$test_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;
99 }
100
101 else { # one of the collections was built on windows
102 # handling slashes and other differences between a model coll built on one OS (e.g. linux)
103 # and a test collection built and diffed on another OS (windows)
104
105 my ($win_text, $lin_text); # references
106 if($testIsWin) {
107 $win_text = \$test_text;
108 $lin_text = \$model_text;
109 } else {
110 $win_text = \$model_text;
111 $lin_text = \$test_text;
112 }
113
114 if($dbname =~ m/archiveinf-doc/) {
115 my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
116
117 # convert short filenames to long perl:
118 # http://www.mombu.com/programming/perl/t-convert-dos-83-filenames-to-win32-long-filenames-using-perl-525448.html
119 for my $line (split /^/, $$win_text) { # split the string into newlines
120
121 # assoc-file and meta-file contain filepaths, ensure these are long windows file paths now (will later convert to linux slashes)
122 if($line =~ m@^<(assoc-file|meta-file)>(.*)(\s+)@s) {
123 $line = $2; # may be a short file name
124 # perhaps test here if it is a shortfilename? should match /CAPS....~number(.ext)/
125
126 $line = "<$1>".&Win32::GetLongPathName($line)."$3"; # make it a long file name and prefix assoc-file/meta-file tagname to it again
127 }
128 $tmp .= $line;
129 }
130 $$win_text = $tmp;
131 }
132
133
134 # index gdb file
135 if($dbname =~ m/$strColName/) {
136 my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
137 for my $line (split /^/, $$win_text) { # split the string into newlines
138
139 if($line =~ m@^<ex.File.Directory>(.*)@s) { # word-pdf collection contains double windows backslashes
140 (my $escaped_path = $1) =~ s@\\\\@\\@g;
141 $line = "<ex.File.Directory>$escaped_path";
142 }
143 elsif($line =~ m@^<Title>(.*)@s) {
144# print STDERR "***** TITLE: |$1|\n";
145
146 # word-pdf collection: Title of ps files contain new lines at end when
147 # GreenstoneXMLPlugin::xml_end_tag() writes the Title back out after utf8 decode
148 # if($metadata_name eq "Title") { $metadata_value =~ s/[\n\r]*$//; }
149
150 (my $title = $1) =~ s@(\r|\n|\\n)*$@@; # get rid of trailing newlines/carriage returns
151 $line = "<Title>$title\n"; # add single newline
152 }
153 $tmp .= $line;
154 }
155 $$win_text = $tmp;
156 }
157
158
159 # slashes in windows metadata text need to be turned into linux style slashes
160 $$win_text =~ s@\\@/@g; #$$win_text =~ s@\\([^n|r|\|"])@/$1@g; # filepath something\rtf remains something\rtf
161
162 # cut down absolute paths to files to just collect/colname/.../file, same as before
163 $$lin_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg; # $$lin_text =~ s@^([^\\\/]*(//)?).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
164 $$win_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
165
166 # for the windows text, need to further get rid of the driveletter after [ or <meta>
167 $$win_text =~ s@^(\[|<[^>]*>)[a-zA-Z]:collect@$1collect@mg;
168
169 } # end of equalising differences between a windows collection's db file and linux coll's db file
170
171 # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox
172 # These tmpdirs are located inside the toplevel *greenstone* directory
173 (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;
174 $gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'};
175 my $tmpfile_regex = "<URL>http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long
176 if($test_text =~ m@$tmpfile_regex@g) {
177 # found a match, replace the tmp file name with "random", keeping the original file extension
178 # in <OrigSource|URL|UTF8URL|gsdlconvertedfilename>
179
180 # This code is slightly different from doc.xml because each document has its own doc.xml, so this needs to be done
181 # only once for doc.xml, but multiple times in index/col.gdb since it contains the random filenames of all docs in the col
182 #my ($old_tmp_filename, $ext) = ($1, $2);
183
184 my $new_tmp_filename = "random";
185
186
187 $tmpfile_regex = "(<(URL|UTF8URL|gsdlconvertedfilename|OrigSource)>(http://)?)($gsdlhome_re)?(/tmp/)?.*(\..{3,4})";
188 if($5) {
189 $test_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
190 } else { # OrigSource contains only the filename
191 $test_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
192 }
193
194 # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename
195 $tmpfile_regex = "(<(URL|UTF8URL|gsdlconvertedfilename|OrigSource)>(http://)?)(.*)?(/tmp/)?.*(\..{3,4})";
196 if($5) {
197 $model_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
198 } else { # OrigSource contains only the filename
199 $model_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
200 }
201
202 # index/col.gdb also has entries for the random tmp file names in the form: [http://research/ak19/GS2bin_5July2013/tmp/F639.html]
203 # need to equalise these also. Sadly, when there are multiple intermediate files, their random tmp filenames are not
204 # guaranteed to be generated in the same (alphabetical/numerical) order between model and test collection, so the
205 # HASH OIDs, although all of them accounted for, appear in a different order. So we have to remove the Hash OIDs.
206 #$test_text =~ s@\[http://.*/tmp/.*(\..{3,4})\]@tmp/random$1@mg; # HASH OIDs can appear in different order
207 #$model_text =~ s@\[http://.*/tmp/.*(\..{3,4})\]@tmp/random$1@mg;
208 $test_text =~ s@\[http://.*/tmp/.*(\..{3,4})\]\n<section>[^\n]*\n@tmp/random$1\n<section>RandomHash\n@sg;
209 $model_text =~ s@\[http://.*/tmp/.*(\..{3,4})\]\n<section>[^\n]*\n@tmp/random$1\n<section>RandomHash\n@sg;
210 }
211
212 # now can go back to using $model_text and $test_text
213# print_string_to_file($test_text, $savepath.$dbname."_test.out");
214# print_string_to_file($model_text, $savepath.$dbname."_model.out");
215
216 my $report_type = "OldStyle"; # Can not change this type.
217 my $diff_gdb = diff \$model_text, \$test_text, { STYLE => $report_type };
218
219 # leaving the ignore regex as it used to be in the following, in case it helps with single line comparisons
220 $diff_gdb = &diffutil::GenerateOutput($diff_gdb,"^<(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ImageSize)>.*");
221
222 if($diff_gdb eq "")
223 {
224 return "";
225 }
226 else
227 {
228 return "Difference Report: Differences found in the Database file: \n$diff_gdb";
229 }
230 # Call diff?
231}
232
233# returns true if the contents are windows AND it matters for the diffing on the db that it's windows
234# For col.gdb it does not seem to matter so far, if it is generated on a windows machine and to be compared to a linux-generated col.gdb
235sub isDBWindowsSensitive
236{
237 my ($dbtailname, $db_contents) = @_; # db filename without suffix
238
239 #if($dbtailname !~ m/archiveinf/) { # only archiveinf-doc and archive-inf source need special Windows processing, not col.gdb
240 # return 0;
241 #}
242 return ($db_contents =~ m/\\/) ? 1 : 0; # windows slashes detected. Better test would be: [Something\something] OR <tag>something\something
243 # for doc.xml:
244 # <Metadata name="gsdlsourcefilename">import/html_files/cleves.html</Metadata>
245}
246
2471;
Note: See TracBrowser for help on using the repository browser.