package gdbdiff;

# The Greenstone environment has to be configured before any of the project
# libraries below can be located, so check it and extend @INC at compile time.
BEGIN {
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";
    defined $ENV{'GSDLOS'} or die "GSDLOS not set\n";

    # A single unshift that leaves perllib/cpan in front of perllib,
    # exactly as unshifting perllib first and perllib/cpan second does.
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib/cpan", "$ENV{'GSDLHOME'}/perllib");
}

use util;
use diffutil;
use Text::Diff;
use Cwd;

# Win32 is only loaded on Windows hosts; it is used for working out
# Windows long filenames from Win 8.3 short filenames.
require Win32 if $^O =~ m/mswin/i;
|
---|
18 |
|
---|
# Runs a command through a pipe and returns everything it wrote to its
# standard output as a single string.
#
#   $cmd - the complete command line. It is passed on as one string, so any
#          redirection the caller included (e.g. "2>&1") is still honoured.
#
# Returns the captured text ("" when the command printed nothing).
# Dies if the pipe to the command cannot be opened.
sub readin_gdb
{
    my ($cmd) = @_;

    # A lexical handle keeps this sub re-entrant (no package-global PIN), and
    # the explicit '-|' mode stops Perl from re-interpreting leading/trailing
    # characters of $cmd as an open mode.
    open(my $pipe, '-|', $cmd)
        or die "Unable to open pipe to $cmd: $!\n";

    # Slurp the whole output in one read instead of appending line by line.
    my $text_content = do { local $/; <$pipe> };
    $text_content = "" unless defined $text_content;

    # The command's exit status is deliberately not checked here: callers
    # compare whatever text was produced.
    close($pipe);

    return $text_content;
}
|
---|
35 |
|
---|
# For debugging: prints the text contents of a db to a file.
#
#   $text    - the string to write
#   $outfile - path of the file to create (an existing file is overwritten)
#
# Dies if the file cannot be opened for writing or cannot be written out
# completely.
sub print_string_to_file
{
    my ($text, $outfile) = @_;

    # Three-argument open: the filename is taken literally, so whitespace or
    # mode characters in $outfile cannot change what gets opened.
    open(my $fout, '>', $outfile) or die "ERROR failed to write to $outfile: $!\n";
    print $fout $text;

    # Buffered write errors (e.g. a full disk) only surface at close time.
    close($fout) or die "ERROR failed to write to $outfile: $!\n";
}
|
---|
45 |
|
---|
# Compares the "db2txt -sort" dumps of a model database and a test database,
# after normalising everything that may legitimately differ between two
# builds (timestamps, sizes, absolute paths, Windows vs. Linux path styles,
# random tmp file names).
#
#   $full_modeldb - path to the model collection's database file
#   $full_testdb  - path to the test collection's database file
#   $strColName   - collection name; absolute paths are cropped to
#                   collect/<colname>/...
#
# Returns "" when no differences remain, otherwise a report string starting
# with "Difference Report: ...". Dies if a pipe to db2txt cannot be opened.
sub test_gdb
{
    my ($full_modeldb, $full_testdb, $strColName) = @_;

    # fileparse() lives in File::Basename, which this file does not load
    # itself, so make sure it is available before calling it.
    require File::Basename;
    my ($dbname) = File::Basename::fileparse($full_testdb, "\\.[^\\.]+\$");

    # Need to sort the text output of both test and model col database files,
    # to normalise them for the comparison. The -sort option to db2txt was
    # added specifically to support diffcol.
    # NOTE(review): the paths are interpolated unquoted, so a path containing
    # spaces would break the command line - confirm with callers.
    my $model_text = readin_gdb("db2txt -sort $full_modeldb 2>&1");
    my $test_text  = readin_gdb("db2txt -sort $full_testdb 2>&1");

    # For debugging, the intermediate text can be dumped with
    # print_string_to_file(), e.g.
    #   print_string_to_file($test_text, &getcwd."/../".$dbname."_test.out1");

    # Filter out the fields that can be ignored in the two database files.
    # The total_numbytes field can vary depending on how many backslashes
    # exist in the urls in the main body text, as each of these windows
    # slashes gets escaped with another backslash, and the resulting string is
    # used as key into the rel link db.
    my $ignore_line_re = qr/\n<(?:FileSize|lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|total_numbytes)>[^\n]*/;

    for my $text_ref (\$model_text, \$test_text) {
        $$text_ref =~ s/$ignore_line_re//g;

        # tmp dirs have subdirs with random numbers in their name, remove the
        # subdir. These tmpdirs are located inside the collection directory.
        $$text_ref =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
    }

    # If the OS doesn't match and one of them is windows, extra work needs to
    # be done to bring the db files of test and model collection to an even
    # base for comparison.
    my $testIsWin  = isDBWindowsSensitive($dbname, $test_text);
    my $modelIsWin = isDBWindowsSensitive($dbname, $model_text);

    my $win_text; # reference to the windows-built text; stays undef when both were built on the same OS
    if ($testIsWin != $modelIsWin) {
        # Handle slashes and other differences between a collection built on
        # one OS (e.g. linux) and one built on another OS (windows).
        $win_text = $testIsWin ? \$test_text : \$model_text;

        # Win32 is only loaded on Windows hosts. Without this guard, diffing a
        # windows-built database on any other OS died with an undefined
        # subroutine error; there the short-name expansion is skipped instead.
        if ($dbname =~ m/archiveinf-doc/ && defined &Win32::GetLongPathName) {
            my $tmp = ""; # rebuild the windows text after processing its lines one by one
            for my $line (split /^/, $$win_text) {
                # assoc-file and meta-file contain filepaths; ensure these are
                # long windows file paths now (slashes are converted below)
                if ($line =~ m@^<(assoc-file|meta-file)>(.*)(\s+)@s) {
                    my ($tagname, $path, $trailing) = ($1, $2, $3);

                    # $path may be an 8.3 short file name. Keep it unchanged
                    # when no long name comes back (presumably when the file
                    # does not exist on this machine - TODO confirm).
                    my $long_path = Win32::GetLongPathName($path);
                    $long_path = $path unless defined $long_path && $long_path ne "";

                    $line = "<$tagname>$long_path$trailing";
                }
                $tmp .= $line;
            }
            $$win_text = $tmp;
        }

        # index gdb file (\Q..\E: the collection name is literal text, not a pattern)
        if ($dbname =~ m/\Q$strColName\E/) {
            my $tmp = ""; # rebuild the windows text after processing its lines one by one
            for my $line (split /^/, $$win_text) {
                # In the following regex, add any .gdb fieldnames that
                # represent a path and so would contain double backslashes on
                # Windows (to escape the single backslash of win filepaths).
                # They are turned into single backslashes here, and into
                # forward slashes further below.
                # E.g. on windows, the Word-PDF collection(s) contain double
                # backslashes in the ex.File.Directory field, the
                # MARC-Exploded collection in the null_file entry field.
                if ($line =~ m@^<(ex.File.Directory|null_file)>(.*)@s) {
                    my ($fieldname, $escaped_path) = ($1, $2);
                    $escaped_path =~ s@\\\\@\\@g;
                    $line = "<$fieldname>$escaped_path";
                }
                elsif ($line =~ m@^<Title>(.*)@s) {
                    # word-pdf collection: Title of ps files contains new
                    # lines at the end when GreenstoneXMLPlugin::xml_end_tag()
                    # writes the Title back out after utf8 decode.
                    (my $title = $1) =~ s@(\r|\n|\\n)*$@@; # get rid of trailing newlines/carriage returns
                    $line = "<Title>$title\n"; # add single newline
                }
                $tmp .= $line;
            }
            $$win_text = $tmp;
        }

        # slashes in windows metadata text need to be turned into linux style slashes
        $$win_text =~ s@\\@/@g;
    }

    # Ignore absolute path prefixes in modelcol and testcol (necessary for
    # archiveinf-doc and -src.gdb files). The original model col could have
    # been built anywhere, and the gdb files store absolute paths to the
    # collection location. Entries are of the form [Entry] or <Entry>.
    # E.g. [/full/path/collect/colname/import/file.ext] should become
    # [collect/colname/import/file.ext]
    my $collect_path_re = qr!^([^\\/]*(?://)*).*[\\/](collect[\\/]\Q$strColName\E)(.*)$!m;
    for my $text_ref (\$model_text, \$test_text) {
        $$text_ref =~ s/$collect_path_re/$1$2$3/g;
    }

    if ($win_text) {
        # for the windows text, further get rid of the driveletter after [ or <meta>
        $$win_text =~ s@^(\[|<[^>]*>)[a-zA-Z]:collect@$1collect@mg;
    }

    # The following block of code is necessary to deal with tmp (html) source
    # files generated when using PDFBox. These tmpdirs are located inside the
    # toplevel *greenstone* directory.
    #
    # NOTE(review): the original tested $$ENV{'GSDLHOME'} (a dereference of the
    # scalar $ENV, not the %ENV hash), so the GSDLHOME part of the pattern
    # always fell back to ".*". That effective behaviour is kept on purpose:
    # the example key further down is "http://research/...", so interpolating
    # an absolute GSDLHOME would probably stop the pattern from matching.
    #
    # These patterns are built with qr// because inside a double-quoted string
    # "\." collapses to "." and the file extension's dot would match any
    # character. The file extension can be 3 or 4 chars long.
    my $tmp_url_re = qr!<URL>http://.*/tmp/[^.]*\..{3,4}!;

    if ($test_text =~ $tmp_url_re) {
        # Found a match: replace the value with "random", keeping the original
        # file extension, in <OrigSource|URL|UTF8URL|gsdlconvertedfilename>.
        # index/col.gdb contains the random filenames of all docs in the col,
        # so this is applied to every such line, in both texts.
        #
        # Everything between the tag (plus optional http://) and the extension
        # is replaced. The original also had a variant that kept "/tmp/", but
        # it tested a stale capture variable and could never run.
        my $new_tmp_filename = "random";
        my $tmp_value_re = qr!(<(?:URL|UTF8URL|gsdlconvertedfilename|OrigSource)>(?:http://)?).*(\..{3,4})!;

        # index/col.gdb also has entries for the random tmp file names in the
        # form: [http://research/ak19/GS2bin_5July2013/tmp/F639.html]
        # When there are multiple intermediate files, their random tmp
        # filenames are not guaranteed to be generated in the same order
        # between model and test collection, so the HASH OIDs appear in a
        # different order and have to be removed as well.
        # Each part is confined to a single [key]: with the old unbounded
        # ".*" under /s, one match ran from the first "[http://" to the last
        # tmp key and deleted every record in between.
        my $tmp_key_re = qr!\[http://[^\]\n]*/tmp/[^\]\n]*(\.[^\]\n]{3,4})\]\n<section>[^\n]*\n!;

        # modelcol used a different gsdlhome, but also a tmp dir, so it gets
        # exactly the same changes as the test text
        for my $text_ref (\$test_text, \$model_text) {
            $$text_ref =~ s/$tmp_value_re/$1$new_tmp_filename$2/g;
            $$text_ref =~ s@$tmp_key_re@tmp/random$1\n<section>RandomHash\n@g;
        }
    }

    # For debugging, the normalised text can be dumped with
    # print_string_to_file() at this point.

    my $report_type = "OldStyle"; # Can not change this type.
    my $diff_gdb = diff(\$model_text, \$test_text, { STYLE => $report_type });

    # leaving the ignore regex as it used to be in the following, in case it helps with single line comparisons
    $diff_gdb = &diffutil::GenerateOutput($diff_gdb,"^<(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ImageSize)>.*");

    return "" if $diff_gdb eq "";
    return "Difference Report: Differences found in the Database file: \n$diff_gdb";
}
|
---|
239 |
|
---|
# Reports whether a database's text dump looks windows-built in a way that
# matters for the diffing.
#
#   $dbtailname  - db filename without suffix (currently not consulted)
#   $db_contents - text dump of the database
#
# Returns 1 if the contents contain at least one backslash, 0 otherwise.
sub isDBWindowsSensitive
{
    my ($dbtailname, $db_contents) = @_;

    # Windows slashes detected. A better test would be:
    # [Something\something] OR <tag>something\something
    # for doc.xml:
    # <Metadata name="gsdlsourcefilename">import/html_files/cleves.html</Metadata>
    if ($db_contents =~ m/\\/) {
        return 1;
    }

    return 0;
}
|
---|
253 |
|
---|
254 | 1;
|
---|