Ignore:
Timestamp:
2013-07-05T22:20:39+12:00 (11 years ago)
Author:
ak19
Message:

Many additions to equalise the PDFBox collection metadata, since PDFBox generates an intermediate source file in the GSDLHOME/tmp directory, whose path and random filename end up in all the doc.xml and index/col.gdb files.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/nightly-tasks/diffcol/trunk/diffcol/diffcol.pl

    r27743 r27766  
    4545use diffutil;
    4646use Text::Diff;
     47use Cwd;
    4748
    4849#--Global Variables Declaration-----------
     
    667668           
    668669       
    669 
    670670        # tmp dirs have subdirs with random numbers in name, remove randomly named subdir portion of path
     671        # these tmpdirs are located inside the collection directory
    671672        $model_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
    672673        $test_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
     
    674675        # remove all absolute paths up to the collect folder from <Metadata /> elements
    675676        $model_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;
    676         $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;   
     677        $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;       
     678       
     679        # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox
     680        # These tmpdirs are located inside the toplevel *greenstone* directory
     681        (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;     
     682        my $tmpfile_regex = "<Metadata name=\"URL\">http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})</Metadata>"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long 
     683        if($test_contents =~ m@$tmpfile_regex@) {           
     684            # found a match, replace the tmp file name with "random", keeping the original file extension
     685            # in <Metadata name="OrigSource|URL|UTF8URL|gsdlconvertedfilename">
     686       
     687            my ($old_tmp_filename, $ext) = ($1, $2);           
     688            my $new_tmp_filename = "random";           
     689           
     690            $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?$old_tmp_filename($ext</Metadata>)";
     691            if($5) {
     692                $test_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@g;
     693            } else { # OrigSource contains only the filename
     694                $test_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@g;
     695            }
     696           
     697            # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename           
     698            $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)(.*)?(/tmp/)?.*($ext</Metadata>)";
     699            if($5) {
     700                $model_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@g;
     701            } else { # OrigSource contains only the filename
     702                $model_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@g;
     703            }
     704        }
    677705       
    678706#       my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory       
Note: See TracChangeset for help on using the changeset viewer.