root/other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm2 @ 30652

Revision 30652, 14.5 KB (checked in by ak19, 4 years ago)

Committing outstanding files for diffcol supporting jdb for GS3 diffing. Not yet in use, but I want it on SVN and to not go missing.

Line 
1package gdbdiff;
2
3BEGIN {
4    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
5    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
6    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
7    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
8}
9
10use util;
11use diffutil;
12use Text::Diff;
13use Cwd;
14
15if ($^O =~ m/mswin/i) {
16    require Win32; # for working out Windows Long Filenames from Win 8.3 short filenames
17}
18
19sub readin_gdb
20{
21    my ($cmd) = @_;
22
23    open(PIN,"$cmd|")
24    || die "Unable to open pipe to $cmd: $!\n";
25
26    my $text_content = "";
27
28    while (defined (my $line = <PIN>)) {
29    $text_content .= $line;
30    }
31
32    close(PIN);
33    return $text_content;
34}
35
36sub read_db
37{
38    # need to sort text output of both test and model col database files, to normalise them for the comparison
39    # the -sort option to db2txt was added specifically to support diffcol
40    my($db_file) = @_;
41    my $db_cmd = "db2txt -sort $db_file 2>&1";
42    if($db_file =~ m/\.jdb$/) {
43    my $runperl = ($ENV{'GSDLOS'} =~ m/windows/) ? "perl -S" : "";
44    $db_cmd = "$runperl jdb2txt.pl -sort $db_file";
45    }
46
47    my $db_text = readin_gdb($db_cmd);
48    return $db_text;
49}
50
51sub text_to_db_to_text
52{
53    my($db_text, $db_filename) = @_;
54
55    # http://stackoverflow.com/questions/1909262/how-can-i-pipe-input-into-a-java-command-from-perl
56   
57    if($db_filename =~ m/\.jdb$/) {
58    my $runperl = ($ENV{'GSDLOS'} =~ m/windows/) ? "perl -S" : "";
59    open PIPE, "| $runperl txt2jdb.pl $db_filename";
60    print PIPE "$db_text";
61    close(PIPE);
62    } else {
63    open PIPE, "| txt2db $db_filename";
64    print PIPE "$db_text";
65    close(PIPE);
66    }
67    return &read_db("$db_filename");
68}
69
70# for debugging. Prints txt contents of db to file
71sub print_string_to_file
72{
73    my ($text, $outfile) = @_;
74   
75    open(FOUT, ">$outfile") or die "ERROR failed to write to $outfile: $!\n";
76    print FOUT $text;
77    close(FOUT);
78}
79
80sub test_gdb
81{
82    my ($full_modeldb, $full_testdb, $strColName, $test_os, $model_os, $strTestCol, $strModelCol, $debugging) = @_;
83
84   # print "Now is testing database\n";
85    my ($dbname, $dirname, $suffix)= &File::Basename::fileparse($full_testdb, "\\.[^\\.]+\$");
86   
87    my $model_text = read_db($full_modeldb);
88    my $test_text = read_db($full_testdb);
89
90    my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory
91    if($debugging) {
92    print_string_to_file($test_text, $savepath.$dbname."_test.out1");   
93    print_string_to_file($model_text, $savepath.$dbname."_model.out1");
94    }
95
96    # filter out the fields that can be ignored in the two database files
97    # The total_numbytes field can vary depending on how many backslashes exist in the urls in the main body text, as each
98    # of these windows slashes get escaped with another backslash, and the resulting string is used as key into rel link db
99    my $ignore_line_re = "\n<(FileSize|lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ex.File.FileModifyDate|ex.File.FilePermissions|total_numbytes|ex.Composite.LightValue)>([^\n])*";
100    $model_text =~ s/$ignore_line_re//g;
101    $test_text =~ s/$ignore_line_re//g;
102
103    # tmp dirs have subdirs with random numbers in name, remove subdir
104    # these tmpdirs are located inside the collection directory
105    $model_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
106    $test_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
107
108    # if the OS doesn't match and one of them is windows, extra work needs to be done to bring the db files
109    # in test and model collection to an even base for comparison
110   
111    my $testIsWin = ($test_os ne "compute") ? ($test_os eq "windows") : &isDBWindowsSensitive($dbname, $test_text);
112    my $modelIsWin = ($model_os ne "compute") ? ($model_os eq "windows") : &isDBWindowsSensitive($dbname, $model_text);
113   
114    if($testIsWin == $modelIsWin) {     
115    # both linux or both windows, do the basic test we did on linux machines:
116            # ignore absolute path prefixes in modelcol and testcol (necessary for archiveinf-doc and -src.gdb files)
117
118            # Remember the original model col on SVN could have been built anywhere,
119            # and in the gdb files, absolute paths are stored to the collection location.
120            # Crop these paths to the collect/<colname> point.
121           
122            # Entries are of the form [Entry] or <Entry>. In order to do a sensible diff,
123            # need to remove the prefix to the collect/colname folder in any (absolute) path that occurs in Entry
124            # E.g. [/full/path/collect/colname/import/file.ext] should become [collect/colname/import/file.ext]
125            # Better regex is of the form /BEGIN((?:(?!BEGIN).)*)END/, see http://docstore.mik.ua/orelly/perl/cookbook/ch06_16.htm
126
127            $model_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
128            $test_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
129            #$model_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;         
130            #$test_text =~ s@^([^\\//]*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$3$5@mg;                     
131    }
132   
133    else { # one of the collections was built on windows
134        # handling slashes and other differences between a model coll built on one OS (e.g. linux)
135        # and a test collection built and diffed on another OS (windows)
136   
137        my ($win_text, $lin_text); # references
138        my $collection_path = $strTestCol; # full path to a windows collection
139       
140        if($testIsWin) {
141            $collection_path = $strTestCol; # test collection path is windows
142            $win_text = \$test_text;
143            $lin_text = \$model_text;
144        } else {
145            $collection_path = $strModelCol; # model collection path is windows
146            $win_text = \$model_text;
147            $lin_text = \$test_text;
148        }
149       
150        if($dbname =~ m/archiveinf-doc/) {
151           
152            (my $collection_path_re = $collection_path) =~ s@\\@\\\\@g;
153           
154            my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
155   
156            # convert short filenames to long perl:
157            # http://www.mombu.com/programming/perl/t-convert-dos-83-filenames-to-win32-long-filenames-using-perl-525448.html
158            for my $line (split /^/, $$win_text) { # split the string into newlines
159               
160                # assoc-file and meta-file contain filepaths, ensure these are long windows file paths now (will later convert to linux slashes)   
161                if($line =~ m@^<(assoc-file|meta-file|src-file)>(.*)(\s+)@s) {
162                    my ($field, $value, $suffix) = ($1, $2, $3);
163                    $line = $value; # it may be a short file name   
164                   
165                    if($line !~ m/^\@/) { # if the path doesn't use a "relative" @GSPATH@ placeholder string, but is an absolute path instead                       
166                        # perhaps test here if it is a shortfilename? should match /CAPS....~number(.ext)/                     
167                        $line = "<$field>".&Win32::GetLongPathName($line)."$suffix"; # make it a long file name and prefix assoc-file/meta-file tagname to it again                 
168                    }
169                    else { # if $line contains @THISCOLLECTIONPATH@, still need to deal with DOS filenames suffixes:
170                        # replace placeholder with absolute path and expand to long filename, then insert placeholder in its original place again                       
171                        $line =~ s/\@THISCOLLECTPATH\@/$collection_path/;
172                        $line = &Win32::GetLongPathName($line);
173                        $line =~ s/^$collection_path_re/\@THISCOLLECTPATH\@/;
174                        $line = "<$field>".$line."$suffix";
175                    }
176                }
177                $tmp .= $line;
178            }
179            $$win_text = $tmp;         
180        }
181       
182       
183        # index gdb file
184        if($dbname =~ m/$strColName/) {
185            my $tmp = ""; # rebuild windows file's set of lines after processing them one by one
186            for my $line (split /^/, $$win_text) { # split the string into newlines         
187
188                # In the following regex, add any .gdb fieldnames that represent a path and so would contain double backslashes
189                # on Windows (to escape the single backlash of win filepaths). They will be turned into single-backslashes here,
190                # and converted into single forward slashes futher below when the txt version of the win gdb file is normalised
191                # to compare it with the linux version.
192                # E.g. On windows, the Word-PDF collection(s) contains double backslashes in the ex.File.Directory field
193                # the MARC-Exploded collection contains double backslashes in the null_file entry field of the .gdb file               
194                if($line =~ m@^<(ex.File.Directory|null_file)>(.*)@s) {
195                    my ($fieldname, $escaped_path) = ($1, $2);
196                    $escaped_path =~ s@\\\\@/@g; #(my $escaped_path = $2) =~ s@\\\\@\\@g;
197                    $line = "<$fieldname>$escaped_path";
198                }
199                elsif($line =~ m@^<Title>(.*)@s) {
200#                   print STDERR "***** TITLE: |$1|\n";
201               
202                    # word-pdf collection: Title of ps files contain new lines at end when
203                    # GreenstoneXMLPlugin::xml_end_tag() writes the Title back out after utf8 decode
204                    # if($metadata_name eq "Title") { $metadata_value =~ s/[\n\r]*$//; }
205               
206                    (my $title = $1) =~ s@(\r|\n|\\n)*$@@; # get rid of trailing newlines/carriage returns
207                    $line = "<Title>$title\n"; # add single newline                 
208                }
209                $tmp .= $line;
210            }
211            $$win_text = $tmp;
212       
213            # slashes in windows metadata text need to be turned into linux style slashes.
214            # index\col.gdb uses double backslashes, and single for \n,\t
215            #$$win_text =~ s@\\\\@/@g;
216        }
217        else { # archiveinf gdb file
218       
219            # slashes in windows metadata text need to be turned into linux style slashes.
220            # In the two archivesinf gdb files, filepaths may use single backslashes
221            $$win_text =~ s@\\@/@g; #$$win_text =~ s@\\([^n|r|\|"])@/$1@g; # filepath something\rtf remains something\rtf
222        }
223       
224        # cut down absolute paths to files to just collect/colname/.../file, same as before
225        $$lin_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg; # $$lin_text =~ s@^([^\\\/]*(//)?).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;
226        $$win_text =~ s@^([^\\/]*(//)*).*(\\|/)(collect(\\|/)$strColName)(.*)$@$1$4$6@mg;       
227       
228        # for the windows text, need to further get rid of the driveletter after [ or <meta>
229        $$win_text =~ s@^(\[|<[^>]*>)[a-zA-Z]:collect@$1collect@mg;     
230       
231    } # end of equalising differences between a windows collection's db file and linux coll's db file
232   
233    # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox
234    # These tmpdirs are located inside the toplevel *greenstone* directory
235    (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;     
236    $gsdlhome_re = ".*" unless $$ENV{'GSDLHOME'};
237    my $tmpfile_regex = "<URL>http://$gsdlhome_re/tmp/([^\.]*?)(\..{3,4})"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long         
238    if($test_text =~ m@$tmpfile_regex@g) {         
239        # found a match, replace the tmp file name with "random", keeping the original file extension
240        # in <OrigSource|URL|UTF8URL|gsdlconvertedfilename>
241   
242        # This code is slightly different from doc.xml because each document has its own doc.xml, so this needs to be done
243        # only once for doc.xml, but multiple times in index/col.gdb since it contains the random filenames of all docs in the col 
244        #my ($old_tmp_filename, $ext) = ($1, $2);
245       
246        my $new_tmp_filename = "random";           
247   
248       
249        $tmpfile_regex = "(<(URL|UTF8URL|gsdlconvertedfilename|OrigSource)>(http://)?)($gsdlhome_re)?(/tmp/)?.*?(\..{3,4})";
250        if($5) {
251            $test_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
252        } else { # OrigSource contains only the filename
253            $test_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
254        }
255       
256        # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename           
257        $tmpfile_regex = "(<(URL|UTF8URL|gsdlconvertedfilename|OrigSource)>(http://)?)(.*)?(/tmp/)?.*?(\..{3,4})";
258        if($5) {
259            $model_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
260        } else { # OrigSource contains only the filename
261            $model_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
262        }
263       
264        # index/col.gdb also has entries for the random tmp file names in the form: [http://research/ak19/GS2bin_5July2013/tmp/F639.html]
265        # need to equalise these also. Sadly, when there are multiple intermediate files, their random tmp filenames are not
266        # guaranteed to be generated in the same (alphabetical/numerical) order between model and test collection, so the
267        # HASH OIDs, although all of them accounted for, appear in a different order. So we have to remove the Hash OIDs.
268        #$test_text =~ s@\[http://.*/tmp/.*(\..{3,4})\]@tmp/random$1@mg; # HASH OIDs can appear in different order
269        #$model_text =~ s@\[http://.*/tmp/.*(\..{3,4})\]@tmp/random$1@mg;
270       
271        $test_text  =~ s@\[http://[^\n]*?/tmp/.*?(\..{3,4})\]\n<section>([^\n]*?)\n@[tmp/random$1\n<section>$2]\n@sg;
272        $model_text =~ s@\[http://[^\n]*?/tmp/.*?(\..{3,4})\]\n<section>([^\n]*?)\n@[tmp/random$1\n<section>$2]\n@sg;
273       
274        # need to re- sort the keys, now that the absolute paths to tmp locations has been removed
275        # so that we get the tmp files in the same order in both model and test collections
276       
277        $model_text = text_to_db_to_text($model_text, "model.gdb");
278        $test_text = text_to_db_to_text($test_text, "test.gdb");
279    }   
280
281    # now can go back to using $model_text and $test_text
282
283    if($debugging) {
284    print_string_to_file($test_text, $savepath.$dbname."_test.out");   
285    print_string_to_file($model_text, $savepath.$dbname."_model.out");
286    }
287
288    my $report_type = "OldStyle"; # Can not change this type.
289    my $diff_gdb = diff \$model_text, \$test_text, { STYLE => $report_type };
290   
291    # leaving the ignore regex as it used to be in the following, in case it helps with single line comparisons
292    $diff_gdb = &diffutil::GenerateOutput($diff_gdb,"^<(lastmodified|lastmodifieddate|oailastmodified|oailastmodifieddate|ImageSize)>.*");
293
294    if($diff_gdb eq "")
295    {
296    return "";
297    }
298    else
299    {
300    return "Difference Report: Differences found in the Database file: \n$diff_gdb";
301    }
302    # Call diff?
303}
304
305# returns true if the contents are windows AND it matters for the diffing on the db that it's windows
306# For col.gdb it does not seem to matter so far, if it is generated on a windows machine and to be compared to a linux-generated col.gdb
307sub isDBWindowsSensitive
308{
309    my ($dbtailname, $db_contents) = @_; # db filename without suffix
310   
311#   return ($db_contents =~ m/\\/) ? 1 : 0; # windows slashes detected. Better test would be: [Something\something] OR <tag>something\something
312   
313    if($dbtailname =~ m/^archiveinf-doc/) {
314        return ($db_contents =~ m@<src-file>[a-zA-Z]:\\@) ? 1 : 0; # <src-file>C:\path
315    }
316    elsif($dbtailname =~ m/^archiveinf-src/) { # <src-file>C:\path
317        return ($db_contents =~ m@\[[a-zA-Z]:\\@) ? 1 : 0; # [C:\path]
318    }
319    else { # index/col.gdb file
320        if ($db_contents =~ m@<URL>http://[a-zA-Z]:/@) { # <URL>http://C:/path
321            return 1;
322        }       
323        elsif ($db_contents =~ m@^(<URL>http://[a-zA-Z]:/)|(<null_file>[^\\]*\\)@m) { # <URL>http://C:/path OR <null_file>CMSwp-all.00000001\\00000035.nul         
324            return 1;
325        }
326        elsif ($db_contents =~ m@^(<ex.File.Directory>[a-zA-Z]:\\\\)@m) { # <ex.File.Directory>C:\\path\\path for OAI collection
327            return 1;
328        }
329        return 0;       
330    }   
331}
332
3331;
Note: See TracBrowser for help on using the browser.