Changeset 27766 for other-projects/nightly-tasks
- Timestamp: 2013-07-05T22:20:39+12:00 (11 years ago)
- Location: other-projects/nightly-tasks/diffcol/trunk/diffcol
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
other-projects/nightly-tasks/diffcol/trunk/diffcol/diffcol.pl
r27743 r27766 45 45 use diffutil; 46 46 use Text::Diff; 47 use Cwd; 47 48 48 49 #--Global Variables Declaration----------- … … 667 668 668 669 669 670 670 # tmp dirs have subdirs with random numbers in name, remove randomly named subdir portion of path 671 # these tmpdirs are located inside the collection directory 671 672 $model_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g; 672 673 $test_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g; … … 674 675 # remove all absolute paths upto collect folder from <Metadata /> elements 675 676 $model_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g; 676 $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g; 677 $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g; 678 679 # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox 680 # These tmpdirs are located inside the toplevel *greenstone* directory 681 (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g; 682 my $tmpfile_regex = "<Metadata name=\"URL\">http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})</Metadata>"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long 683 if($test_contents =~ m@$tmpfile_regex@) { 684 # found a match, replace the tmp file name with "random", keeping the original file extension 685 # in <Metadata name="OrigSource|URL|UTF8URL|gsdlconvertedfilename"> 686 687 my ($old_tmp_filename, $ext) = ($1, $2); 688 my $new_tmp_filename = "random"; 689 690 $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?$old_tmp_filename($ext</Metadata>)"; 691 if($5) { 692 $test_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@g; 693 } else { # OrigSource contains only the filename 694 $test_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@g; 695 } 696 697 # modelcol used a different gsdlhome, but also a tmp dir, so make the same 
changes to its random filename 698 $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)(.*)?(/tmp/)?.*($ext</Metadata>)"; 699 if($5) { 700 $model_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@g; 701 } else { # OrigSource contains only the filename 702 $model_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@g; 703 } 704 } 677 705 678 706 # my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory -
other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm
r27743 r27766 71 71 72 72 # tmp dirs have subdirs with random numbers in name, remove subdir 73 # these tmpdirs are located inside the collection directory 73 74 $model_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g; 74 75 $test_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g; … … 167 168 168 169 } # end of equalising differences between a windows collection's db file and linux coll's db file 170 171 # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox 172 # These tmpdirs are located inside the toplevel *greenstone* directory 173 (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g; 174 my $tmpfile_regex = "<URL>http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long 175 if($test_text =~ m@$tmpfile_regex@g) { 176 # found a match, replace the tmp file name with "random", keeping the original file extension 177 # in <OrigSource|URL|UTF8URL|gsdlconvertedfilename> 178 179 # This code is slightly different from doc.xml because each document has its own doc.xml, so this needs to be done 180 # only once for doc.xml, but multiple times in index/col.gdb since it contains the random filenames of all docs in the col 181 #my ($old_tmp_filename, $ext) = ($1, $2); 182 183 my $new_tmp_filename = "random"; 184 185 186 $tmpfile_regex = "(<(URL|UTF8URL|gsdlconvertedfilename|OrigSource)>(http://)?)($gsdlhome_re)?(/tmp/)?.*(\..{3,4})"; 187 if($5) { 188 $test_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg; 189 } else { # OrigSource contains only the filename 190 $test_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg; 191 } 192 193 # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename 194 $tmpfile_regex = "(<(URL|UTF8URL|gsdlconvertedfilename|OrigSource)>(http://)?)(.*)?(/tmp/)?.*(\..{3,4})"; 195 if($5) { 196 $model_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg; 197 } else { # OrigSource contains only the filename 198 $model_text =~ 
s@$tmpfile_regex@$1$new_tmp_filename$6@mg; 199 } 200 201 # index/col.gdb also has entries for the random tmp file names in the form: [http://research/ak19/GS2bin_5July2013/tmp/F639.html] 202 # need to equalise these also 203 $test_text =~ s@\[http://.*/tmp/.*(\..{3,4})\]@tmp/random$1@mg; 204 $model_text =~ s@\[http://.*/tmp/.*(\..{3,4})\]@tmp/random$1@mg; 205 } 169 206 170 207 # now can go back to using $model_text and $test_text
Note: See TracChangeset for help on using the changeset viewer.