Ignore:
Timestamp:
07/05/13 22:20:39 (8 years ago)
Author:
ak19
Message:

Many additions to equalise the PDFBox collection metadata, since it generates an intermediate src file in the GSDLHOME/tmp directory whose path and random filename end up in all the doc.xml and index/col.gdb files.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/nightly-tasks/diffcol/trunk/diffcol/gdbdiff.pm

    r27743 r27766  
    71 71
    72 72    # tmp dirs have subdirs with random numbers in name, remove subdir
       73    # these tmpdirs are located inside the collection directory
    73 74    $model_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
    74 75    $test_text =~ s@(tmp[\\\/])(\d*[\\\/])@$1@g;
     
    167 168       
    168 169    } # end of equalising differences between a windows collection's db file and linux coll's db file
     170   
     171    # The following block of code is necessary to deal with tmp (html) source files generated when using PDFBox
     172    # These tmpdirs are located inside the toplevel *greenstone* directory
     173    (my $gsdlhome_re = $ENV{'GSDLHOME'}) =~ s@\\@\/@g;     
     174    my $tmpfile_regex = "<URL>http://$gsdlhome_re/tmp/([^\.]*)(\..{3,4})"; # $gsdlhome/tmp/randomfilename.html, file ext can be 3 or 4 chars long           
     175    if($test_text =~ m@$tmpfile_regex@g) {         
     176        # found a match, replace the tmp file name with "random", keeping the original file extension
     177        # in <OrigSource|URL|UTF8URL|gsdlconvertedfilename>
     178   
     179        # This code is slightly different from doc.xml because each document has its own doc.xml, so this needs to be done
     180        # only once for doc.xml, but multiple times in index/col.gdb since it contains the random filenames of all docs in the col 
     181        #my ($old_tmp_filename, $ext) = ($1, $2);
     182       
     183        my $new_tmp_filename = "random";           
     184   
     185       
     186        $tmpfile_regex = "(<(URL|UTF8URL|gsdlconvertedfilename|OrigSource)>(http://)?)($gsdlhome_re)?(/tmp/)?.*(\..{3,4})";
     187        if($5) {
     188            $test_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
     189        } else { # OrigSource contains only the filename
     190            $test_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
     191        }
     192       
     193        # modelcol used a different gsdlhome, but also a tmp dir, so make the same changes to its random filename           
     194        $tmpfile_regex = "(<(URL|UTF8URL|gsdlconvertedfilename|OrigSource)>(http://)?)(.*)?(/tmp/)?.*(\..{3,4})";
     195        if($5) {
     196            $model_text =~ s@$tmpfile_regex@$1$5$new_tmp_filename$6@mg;
     197        } else { # OrigSource contains only the filename
     198            $model_text =~ s@$tmpfile_regex@$1$new_tmp_filename$6@mg;
     199        }
     200       
     201        # index/col.gdb also has entries for the random tmp file names in the form: [http://research/ak19/GS2bin_5July2013/tmp/F639.html]
     202        # need to equalise these also
     203        $test_text =~ s@\[http://.*/tmp/.*(\..{3,4})\]@tmp/random$1@mg;
     204        $model_text =~ s@\[http://.*/tmp/.*(\..{3,4})\]@tmp/random$1@mg;
     205    }   
    169 206
    170 207    # now can go back to using $model_text and $test_text
Note: See TracChangeset for help on using the changeset viewer.