source: other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm@ 27725

Last change on this file since 27725 was 27725, checked in by ak19, 11 years ago

Images can be different in size when generated by imagemagick on win/darwin/linux. For now we ignore images when diffing, later we will have a separate ImageDiff.pm perl module as we have gdbdiff.pm at the moment

File size: 6.7 KB
Line 
1package gdbdiff;
2
3BEGIN {
4 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
5 die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
6 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
7 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
8}
9
10use util;
11use diffutil;
12use Text::Diff;
13use Cwd;
14
15if ($^O =~ m/mswin/i) {
16 require Win32; # for working out Windows Long Filenames from Win 8.3 short filenames
17}
18
19sub readin_gdb
20{
21 my ($cmd) = @_;
22
23 open(PIN,"$cmd|")
24 || die "Unable to open pipe to $cmd: $!\n";
25
26 my $text_content = "";
27
28 while (defined (my $line = <PIN>)) {
29 $text_content .= $line;
30 }
31
32 close(PIN);
33 return $text_content;
34}
35
36# for debugging. Prints txt contents of db to file
37sub print_string_to_file
38{
39 my ($text, $outfile) = @_;
40
41 open(FOUT, ">$outfile") or die "ERROR failed to write to $outfile: $!\n";
42 print FOUT $text;
43 close(FOUT);
44}
45
46sub test_gdb
47{
48 my ($full_modeldb, $full_testdb,$strColName) = @_;
49
50 # print "Now is testing database\n";
51 my ($dbname, $dirname, $suffix)= &File::Basename::fileparse($full_testdb, "\\.[^\\.]+\$");
52
53 # need to sort text output of both test and model col database files, to normalise them for the comparison
54 # the -sort option to db2txt was added specifically to support diffcol
55 my $model_cmd = "db2txt -sort $full_modeldb 2>&1";
56 my $test_cmd = "db2txt -sort $full_testdb 2>&1";
57
58 my $model_text = readin_gdb($model_cmd);
59 my $test_text = readin_gdb($test_cmd);
60
61# my $savepath = &getcwd."/../"; # TASK_HOME env does not exist at this stage, but it's one level up from current directory
62# print_string_to_file($test_text, $savepath.$dbname."_test.out");
63# print_string_to_file($model_text, $savepath.$dbname."_model.out");
64
65 # filter out the fields that can be ignored in the two database files
66 # The total_numbytes field can vary depending on how many backslashes exist in the urls in the main body text, as each
67 # of these windows slashes get escaped with another backslash, and the resulting string is used as key into rel link db
68 my $ignore_line_re = "\n<(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|total_numbytes)>([^\n])*";
69 $model_text =~ s/$ignore_line_re//g;
70 $test_text =~ s/$ignore_line_re//g;
71
72
73 # if the OS doesn't match and one of them is windows, extra work needs to be done to bring the db files
74 # in test and model collection to an even base for comparison
75
76 my $testIsWin = &isDBWindowsSensitive($dbname, $test_text);
77 my $modelIsWin = &isDBWindowsSensitive($dbname, $model_text);
78
79 if($testIsWin == $modelIsWin) { # both linux or both windows, do the basic test we did on linux machines:
80 # ignore absolute path prefixes in modelcol and testcol (necessary for archiveinf-doc and -src.gdb files)
81
82 # Remember the original model col on SVN could have been built anywhere,
83 # and in the gdb files, absolute paths are stored to the collection location.
84 # Crop these paths to the collect/<colname> point.
85
86 # Entries are of the form [Entry] or <Entry>. In order to do a sensible diff,
87 # need to remove the prefix to the collect/colname folder in any (absolute) path that occurs in Entry
88 # E.g. [/full/path/collect/colname/import/file.ext] should become [collect/colname/import/file.ext]
89 # Better regex is of the form /BEGIN((?:(?!BEGIN).)*)END/, see http://docstore.mik.ua/orelly/perl/cookbook/ch06_16.htm
90
91 $model_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;
92 $test_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;
93 }
94
95 else { # one of the collections was built on windows
96 # handling slashes and other differences between a model coll built on one OS (e.g. linux)
97 # and a test collection built and diffed on another OS (windows)
98
99 my ($win_text, $lin_text); # references
100 if($testIsWin) {
101 $win_text = \$test_text;
102 $lin_text = \$model_text;
103 } else {
104 $win_text = \$model_text;
105 $lin_text = \$test_text;
106 }
107
108 if($dbname =~ m/archiveinf-doc/) {
109 my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
110
111 # convert short filenames to long perl:
112 # http://www.mombu.com/programming/perl/t-convert-dos-83-filenames-to-win32-long-filenames-using-perl-525448.html
113 for my $line (split /^/, $$win_text) { # split the string into newlines
114
115 if($line =~ m@^<assoc-file>(.*)(\s+)@s) {
116 $line = $1; # may be a short file name
117 # perhaps test here if it is a shortfilename? should match /CAPS....~number(.ext)/
118
119 $line = "<assoc-file>".&Win32::GetLongPathName($line)."$2"; # make it a long file name and prefix assoc-file to it again
120 }
121 $tmp .= $line;
122 }
123 $$win_text = $tmp;
124 }
125
126 # slashes in windows text need to be turned into linux style slashes
127 $$win_text =~ s@\\@/@g;
128
129 # cut down absolute paths to files to just collect/colname/.../file, same as before
130 $$lin_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;
131 $$win_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;
132
133 # for the windows text, need to further get rid of the driveletter after [ or <meta>
134 $$win_text =~ s@^(\[|<[^>]*>)[a-zA-Z]:collect@$1collect@mg;
135
136 } # end of equalising differences between a windows collection's db file and linux coll's db file
137
138 # now can go back to using $model_text and $test_text
139# print_string_to_file($test_text, $savepath.$dbname."_test.out");
140# print_string_to_file($model_text, $savepath.$dbname."_model.out");
141
142 my $report_type = "OldStyle"; # Can not change this type.
143 my $diff_gdb = diff \$model_text, \$test_text, { STYLE => $report_type };
144
145 # leaving the ignore regex as it used to be in the following, in case it helps with single line comparisons
146 $diff_gdb = &diffutil::GenerateOutput($diff_gdb,"^<(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ImageSize)>.*");
147
148 if($diff_gdb eq "")
149 {
150 return "";
151 }
152 else
153 {
154 return "Difference Report: Differences found in the Database file: \n$diff_gdb";
155 }
156 # Call diff?
157}
158
159# returns true if the contents are windows AND it matters for the diffing on the db that it's windows
160# For col.gdb it does not seem to matter so far, if it is generated on a windows machine and to be compared to a linux-generated col.gdb
161sub isDBWindowsSensitive
162{
163 my ($dbtailname, $db_contents) = @_; # db filename without suffix
164
165 if($dbtailname !~ m/archiveinf/) { # only archiveinf-doc and archive-inf source need special Windows processing, not col.gdb
166 return 0;
167 }
168 return ($db_contents =~ m/\\/) ? 1 : 0; # windows slashes detected. Better test would be: [Something\something] OR <tag>something\something
169 # for doc.xml:
170 # <Metadata name="gsdlsourcefilename">import/html_files/cleves.html</Metadata>
171}
172
1731;
Note: See TracBrowser for help on using the repository browser.