Changeset 27766

Show
Ignore:
Timestamp:
05.07.2013 22:20:39 (6 years ago)
Author:
ak19
Message:

Many additions to equalise the PDFBox collection metadata, since PDFBox generates an intermediate src file in the GSDLHOME/tmp directory whose path and random filename end up in all the doc.xml and index/col.gdb files.

Location:
other-projects/nightly-tasks/diffcol/trunk/diffcol
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • other-projects/nightly-tasks/diffcol/trunk/diffcol/diffcol.pl

    r27743 r27766  
    4545use diffutil; 
    4646use Text::Diff; 
     47use Cwd; 
    4748 
    4849#--Global Variables Declaration----------- 
     
    667668             
    668669         
    669  
    670670        # tmp dirs have subdirs with random numbers in name, remove randomly named subdir portion of path 
     671        # these tmpdirs are located inside the collection directory 
    671672        $model_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;  
    672673        $test_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g; 
     
    674675        # remove all absolute paths up to the collect folder from <Metadata /> elements 
    675676        $model_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g; 
    676         $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;     
     677        $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;         
     678         
     679        # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox 
     680        # These tmpdirs are located inside the toplevel *greenstone* directory 
     681        (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;       
     682        my $tmpfile_regex = "<Metadata name=\"URL\">http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})</Metadata>"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long   
     683        if($test_contents =~ m@$tmpfile_regex@) {            
     684            # found a match, replace the tmp file name with "random", keeping the original file extension  
     685            # in <Metadata name="OrigSource|URL|UTF8URL|gsdlconvertedfilename"> 
     686         
     687            my ($old_tmp_filename, $ext) = ($1, $2);             
     688            my $new_tmp_filename = "random";             
     689             
     690            $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?$old_tmp_filename($ext</Metadata>)"; 
     691            if($5) {  
     692                $test_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@g; 
     693            } else { # OrigSource contains only the filename 
     694                $test_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@g; 
     695            } 
     696             
     697            # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename            
     698            $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)(.*)?(/tmp/)?.*($ext</Metadata>)"; 
     699            if($5) {  
     700                $model_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@g; 
     701            } else { # OrigSource contains only the filename 
     702                $model_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@g; 
     703            } 
     704        } 
    677705         
    678706#       my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory        
  • other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm

    r27743 r27766  
    7171 
    7272    # tmp dirs have subdirs with random numbers in name, remove subdir 
     73    # these tmpdirs are located inside the collection directory 
    7374    $model_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g; 
    7475    $test_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g; 
     
    167168         
    168169    } # end of equalising differences between a windows collection's db file and linux coll's db file 
     170     
     171    # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox 
     172    # These tmpdirs are located inside the toplevel *greenstone* directory 
     173    (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;       
     174    my $tmpfile_regex = "<URL>http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long            
     175    if($test_text =~ m@$tmpfile_regex@g) {           
     176        # found a match, replace the tmp file name with "random", keeping the original file extension  
     177        # in <OrigSource|URL|UTF8URL|gsdlconvertedfilename> 
     178     
     179        # This code is slightly different from doc.xml because each document has its own doc.xml, so this needs to be done 
     180        # only once for doc.xml, but multiple times in index/col.gdb since it contains the random filenames of all docs in the col   
     181        #my ($old_tmp_filename, $ext) = ($1, $2); 
     182         
     183        my $new_tmp_filename = "random";             
     184     
     185         
     186        $tmpfile_regex = "(<(URL|UTF8URL|gsdlconvertedfilename|OrigSource)>(http://)?)($gsdlhome_re)?(/tmp/)?.*(\..{3,4})"; 
     187        if($5) {  
     188            $test_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg; 
     189        } else { # OrigSource contains only the filename 
     190            $test_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg; 
     191        } 
     192         
     193        # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename            
     194        $tmpfile_regex = "(<(URL|UTF8URL|gsdlconvertedfilename|OrigSource)>(http://)?)(.*)?(/tmp/)?.*(\..{3,4})"; 
     195        if($5) {  
     196            $model_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg; 
     197        } else { # OrigSource contains only the filename 
     198            $model_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg; 
     199        } 
     200         
     201        # index/col.gdb also has entries for the random tmp file names in the form: [http://research/ak19/GS2bin_5July2013/tmp/F639.html] 
     202        # need to equalise these also 
     203        $test_text =~ s@\[http://.*/tmp/.*(\..{3,4})\]@tmp/random$1@mg; 
     204        $model_text =~ s@\[http://.*/tmp/.*(\..{3,4})\]@tmp/random$1@mg; 
     205    }    
    169206 
    170207    # now can go back to using $model_text and $test_text