Changeset 27766


Ignore:
Timestamp:
07/05/13 22:20:39 (8 years ago)
Author:
ak19
Message:

Many additions to equalise the PDFBox collection metadata, since PDFBox generates an intermediate source file in the GSDLHOME/tmp directory, whose path and random filename end up in all the doc.xml and index/col.gdb files

Location:
other-projects/nightly-tasks/diffcol/trunk/diffcol
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • other-projects/nightly-tasks/diffcol/trunk/diffcol/diffcol.pl

    r27743 r27766  
    4545use diffutil;
    4646use Text::Diff;
     47use Cwd;
    4748
    4849#--Global Variables Declaration-----------
     
    667668           
    668669       
    669 
    670670        # tmp dirs have subdirs with random numbers in name, remove randomly named subdir portion of path
     671        # these tmpdirs are located inside the collection directory
    671672        $model_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
    672673        $test_contents =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
     
    674675        # remove all absolute paths up to the collect folder from <Metadata /> elements
    675676        $model_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;
    676         $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;   
     677        $test_contents =~ s@(<Metadata name=\"[^\"]*\">(http:\/\/)?).*(collect[\\\/]$strColName)@$1$3@g;       
     678       
     679        # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox
     680        # These tmpdirs are located inside the toplevel *greenstone* directory
     681        (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;     
     682        my $tmpfile_regex = "<Metadata name=\"URL\">http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})</Metadata>"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long 
     683        if($test_contents =~ m@$tmpfile_regex@) {           
     684            # found a match, replace the tmp file name with "random", keeping the original file extension
     685            # in <Metadata name="OrigSource|URL|UTF8URL|gsdlconvertedfilename">
     686       
     687            my ($old_tmp_filename, $ext) = ($1, $2);           
     688            my $new_tmp_filename = "random";           
     689           
     690            $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)($gsdlhome_re)?(/tmp/)?$old_tmp_filename($ext</Metadata>)";
     691            if($5) {
     692                $test_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@g;
     693            } else { # OrigSource contains only the filename
     694                $test_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@g;
     695            }
     696           
     697            # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename           
     698            $tmpfile_regex = "(<Metadata name=\"(URL|UTF8URL|gsdlconvertedfilename|OrigSource)\">(http://)?)(.*)?(/tmp/)?.*($ext</Metadata>)";
     699            if($5) {
     700                $model_contents =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@g;
     701            } else { # OrigSource contains only the filename
     702                $model_contents =~ s@$tmpfile_regex@$1$new_tmp_filename$6@g;
     703            }
     704        }
    677705       
    678706#       my $savepath = &getcwd."/../"; # TASK_HOME env var does not exist at this stage, but it's one level up from current directory       
  • other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm

    r27743 r27766  
    7171
    7272    # tmp dirs have subdirs with random numbers in name, remove subdir
     73    # these tmpdirs are located inside the collection directory
    7374    $model_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
    7475    $test_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
     
    167168       
    168169    } # end of equalising differences between a windows collection's db file and linux coll's db file
     170   
     171    # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox
     172    # These tmpdirs are located inside the toplevel *greenstone* directory
     173    (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;     
     174    my $tmpfile_regex = "<URL>http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long           
     175    if($test_text =~ m@$tmpfile_regex@g) {         
     176        # found a match, replace the tmp file name with "random", keeping the original file extension
     177        # in <OrigSource|URL|UTF8URL|gsdlconvertedfilename>
     178   
     179        # This code is slightly different from doc.xml because each document has its own doc.xml, so this needs to be done
     180        # only once for doc.xml, but multiple times in index/col.gdb since it contains the random filenames of all docs in the col 
     181        #my ($old_tmp_filename, $ext) = ($1, $2);
     182       
     183        my $new_tmp_filename = "random";           
     184   
     185       
     186        $tmpfile_regex = "(<(URL|UTF8URL|gsdlconvertedfilename|OrigSource)>(http://)?)($gsdlhome_re)?(/tmp/)?.*(\..{3,4})";
     187        if($5) {
     188            $test_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
     189        } else { # OrigSource contains only the filename
     190            $test_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
     191        }
     192       
     193        # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename           
     194        $tmpfile_regex = "(<(URL|UTF8URL|gsdlconvertedfilename|OrigSource)>(http://)?)(.*)?(/tmp/)?.*(\..{3,4})";
     195        if($5) {
     196            $model_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
     197        } else { # OrigSource contains only the filename
     198            $model_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
     199        }
     200       
     201        # index/col.gdb also has entries for the random tmp file names in the form: [http://research/ak19/GS2bin_5July2013/tmp/F639.html]
     202        # need to equalise these also
     203        $test_text =~ s@\[http://.*/tmp/.*(\..{3,4})\]@tmp/random$1@mg;
     204        $model_text =~ s@\[http://.*/tmp/.*(\..{3,4})\]@tmp/random$1@mg;
     205    }   
    169206
    170207    # now can go back to using $model_text and $test_text
Note: See TracChangeset for help on using the changeset viewer.