source: other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm@ 27743

Last change on this file since 27743 was 27743, checked in by ak19, 11 years ago

Basic Word-PDF collection now has the same number of diffing errors on Windows upon diffcol as on Linux and Mac. Needed to do a lot of special processing for windows: to remove carriage returns introduced into doc.xml when doing a multiread on the html version of a pdf doc after it has been converted to html. (And similarly, needed to get rid of windows carriage returns introduced into ex.Title meta for pdf01.pdf converted to HTML. This was handled in HTMLPlugin). Further special tags need either to be ignored, if they're time stamps, or specially handled if they're filepaths. Not sure if it's the encoding setting in multiread or maybe the locale that is introducing the carriage returns, but am dealing with this at the point of diffcol since it's not a 'problem' in Greenstone, just an inconsistency across OS-es. There's still one diffcol error remaining for this collection on all 3 OS: one word document has a different word wrap length on the machine where the model col was built compared to the wrap length on the other machines. This may be a setting to wvware or else libreoffice/staroffice, if these are used.

File size: 8.4 KB
Line 
1package gdbdiff;
2
3BEGIN {
4 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
5 die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
6 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
7 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
8}
9
10use util;
11use diffutil;
12use Text::Diff;
13use Cwd;
14
15if ($^O =~ m/mswin/i) {
16 require Win32; # for working out Windows Long Filenames from Win 8.3 short filenames
17}
18
19sub readin_gdb
20{
21 my ($cmd) = @_;
22
23 open(PIN,"$cmd|")
24 || die "Unable to open pipe to $cmd: $!\n";
25
26 my $text_content = "";
27
28 while (defined (my $line = <PIN>)) {
29 $text_content .= $line;
30 }
31
32 close(PIN);
33 return $text_content;
34}
35
36# for debugging. Prints txt contents of db to file
37sub print_string_to_file
38{
39 my ($text, $outfile) = @_;
40
41 open(FOUT, ">$outfile") or die "ERROR failed to write to $outfile: $!\n";
42 print FOUT $text;
43 close(FOUT);
44}
45
46sub test_gdb
47{
48 my ($full_modeldb, $full_testdb,$strColName) = @_;
49
50 # print "Now is testing database\n";
51 my ($dbname, $dirname, $suffix)= &File::Basename::fileparse($full_testdb, "\\.[^\\.]+\$");
52
53 # need to sort text output of both test and model col database files, to normalise them for the comparison
54 # the -sort option to db2txt was added specifically to support diffcol
55 my $model_cmd = "db2txt -sort $full_modeldb 2>&1";
56 my $test_cmd = "db2txt -sort $full_testdb 2>&1";
57
58 my $model_text = readin_gdb($model_cmd);
59 my $test_text = readin_gdb($test_cmd);
60
61# my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory
62# print_string_to_file($test_text, $savepath.$dbname."_test.out");
63# print_string_to_file($model_text, $savepath.$dbname."_model.out");
64
65 # filter out the fields that can be ignored in the two database files
66 # The total_numbytes field can vary depending on how many backslashes exist in the urls in the main body text, as each
67 # of these windows slashes get escaped with another backslash, and the resulting string is used as key into rel link db
68 my $ignore_line_re = "\n<(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|total_numbytes)>([^\n])*";
69 $model_text =~ s/$ignore_line_re//g;
70 $test_text =~ s/$ignore_line_re//g;
71
72 # tmp dirs have subdirs with random numbers in name, remove subdir
73 $model_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
74 $test_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
75
76 # if the OS doesn't match and one of them is windows, extra work needs to be done to bring the db files
77 # in test and model collection to an even base for comparison
78
79 my $testIsWin = &isDBWindowsSensitive($dbname, $test_text);
80 my $modelIsWin = &isDBWindowsSensitive($dbname, $model_text);
81
82 if($testIsWin == $modelIsWin) { # both linux or both windows, do the basic test we did on linux machines:
83 # ignore absolute path prefixes in modelcol and testcol (necessary for archiveinf-doc and -src.gdb files)
84
85 # Remember the original model col on SVN could have been built anywhere,
86 # and in the gdb files, absolute paths are stored to the collection location.
87 # Crop these paths to the collect/<colname> point.
88
89 # Entries are of the form [Entry] or <Entry>. In order to do a sensible diff,
90 # need to remove the prefix to the collect/colname folder in any (absolute) path that occurs in Entry
91 # E.g. [/full/path/collect/colname/import/file.ext] should become [collect/colname/import/file.ext]
92 # Better regex is of the form /BEGIN((?:(?!BEGIN).)*)END/, see http://docstore.mik.ua/orelly/perl/cookbook/ch06_16.htm
93
94 $model_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
95 $test_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
96 #$model_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;
97 #$test_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;
98 }
99
100 else { # one of the collections was built on windows
101 # handling slashes and other differences between a model coll built on one OS (e.g. linux)
102 # and a test collection built and diffed on another OS (windows)
103
104 my ($win_text, $lin_text); # references
105 if($testIsWin) {
106 $win_text = \$test_text;
107 $lin_text = \$model_text;
108 } else {
109 $win_text = \$model_text;
110 $lin_text = \$test_text;
111 }
112
113 if($dbname =~ m/archiveinf-doc/) {
114 my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
115
116 # convert short filenames to long perl:
117 # http://www.mombu.com/programming/perl/t-convert-dos-83-filenames-to-win32-long-filenames-using-perl-525448.html
118 for my $line (split /^/, $$win_text) { # split the string into newlines
119
120 # assoc-file and meta-file contain filepaths, ensure these are long windows file paths now (will later convert to linux slashes)
121 if($line =~ m@^<(assoc-file|meta-file)>(.*)(\s+)@s) {
122 $line = $2; # may be a short file name
123 # perhaps test here if it is a shortfilename? should match /CAPS....~number(.ext)/
124
125 $line = "<$1>".&Win32::GetLongPathName($line)."$3"; # make it a long file name and prefix assoc-file/meta-file tagname to it again
126 }
127 $tmp .= $line;
128 }
129 $$win_text = $tmp;
130 }
131
132
133 # index gdb file
134 if($dbname =~ m/$strColName/) {
135 my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
136 for my $line (split /^/, $$win_text) { # split the string into newlines
137
138 if($line =~ m@^<ex.File.Directory>(.*)@s) { # word-pdf collection contains double windows backslashes
139 (my $escaped_path = $1) =~ s@\\\\@\\@g;
140 $line = "<ex.File.Directory>$escaped_path";
141 }
142 elsif($line =~ m@^<Title>(.*)@s) {
143# print STDERR "***** TITLE: |$1|\n";
144
145 # word-pdf collection: Title of ps files contain new lines at end when
146 # GreenstoneXMLPlugin::xml_end_tag() writes the Title back out after utf8 decode
147 # if($metadata_name eq "Title") { $metadata_value =~ s/[\n\r]*$//; }
148
149 (my $title = $1) =~ s@(\r|\n|\\n)*$@@; # get rid of trailing newlines/carriage returns
150 $line = "<Title>$title\n"; # add single newline
151 }
152 $tmp .= $line;
153 }
154 $$win_text = $tmp;
155 }
156
157
158 # slashes in windows metadata text need to be turned into linux style slashes
159 $$win_text =~ s@\\@/@g; #$$win_text =~ s@\\([^n|r|\|"])@/$1@g; # filepath something\rtf remains something\rtf
160
161 # cut down absolute paths to files to just collect/colname/.../file, same as before
162 $$lin_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg; # $$lin_text =~ s@^([^\\\/]*(//)?).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
163 $$win_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
164
165 # for the windows text, need to further get rid of the driveletter after [ or <meta>
166 $$win_text =~ s@^(\[|<[^>]*>)[a-zA-Z]:collect@$1collect@mg;
167
168 } # end of equalising differences between a windows collection's db file and linux coll's db file
169
170 # now can go back to using $model_text and $test_text
171# print_string_to_file($test_text, $savepath.$dbname."_test.out");
172# print_string_to_file($model_text, $savepath.$dbname."_model.out");
173
174 my $report_type = "OldStyle"; # Can not change this type.
175 my $diff_gdb = diff \$model_text, \$test_text, { STYLE => $report_type };
176
177 # leaving the ignore regex as it used to be in the following, in case it helps with single line comparisons
178 $diff_gdb = &diffutil::GenerateOutput($diff_gdb,"^<(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ImageSize)>.*");
179
180 if($diff_gdb eq "")
181 {
182 return "";
183 }
184 else
185 {
186 return "Difference Report: Differences found in the Database file: \n$diff_gdb";
187 }
188 # Call diff?
189}
190
191# returns true if the contents are windows AND it matters for the diffing on the db that it's windows
192# For col.gdb it does not seem to matter so far, if it is generated on a windows machine and to be compared to a linux-generated col.gdb
193sub isDBWindowsSensitive
194{
195 my ($dbtailname, $db_contents) = @_; # db filename without suffix
196
197 #if($dbtailname !~ m/archiveinf/) { # only archiveinf-doc and archive-inf source need special Windows processing, not col.gdb
198 # return 0;
199 #}
200 return ($db_contents =~ m/\\/) ? 1 : 0; # windows slashes detected. Better test would be: [Something\something] OR <tag>something\something
201 # for doc.xml:
202 # <Metadata name="gsdlsourcefilename">import/html_files/cleves.html</Metadata>
203}
204
2051;
Note: See TracBrowser for help on using the repository browser.