Ignore:
Timestamp:
2013-10-04T19:52:18+13:00 (11 years ago)
Author:
ak19
Message:
  1. gsConvert.pl now calls the new pptextract.vbs VBScript (which creates the .item files and the PowerPoint slide.txt files in UTF-8) instead of the older VB executable pptextract.exe, which created the .item and slide.txt files in the Windows default encoding, UTF-16 LE. 2. PagedImagePlugin.pm::tidy_item_file now opens the .item files in UTF-8 mode, so the strings it reads are Unicode-aware. Its substitutions therefore match Unicode code points (e.g. the BOM as U+FEFF) instead of byte sequences.
Location:
main/trunk/greenstone2
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/bin/script/gsConvert.pl

    r28166 r28355  
    386386    my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
    387387                       $ENV{'GSDLOS'}, "pptextract");
    388     $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ m/^windows$/i);
     388    $vbScript = "CScript //Nologo \"".$vbScript.".vbs\"" if ($ENV{'GSDLOS'} =~ m/^windows$/i); # now we use the .vbs VBScript
     389    # $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ m/^windows$/i); # back when the pptextract.exe VB executable was used
    389390           
    390391    my $cmd = "";
     
    398399        $cmd .= " 2>\"$output_filestem.err\""
    399400        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
     401
    400402        if (system($cmd) !=0) {
    401403        print STDERR "Powerpoint VB Scripting convert failed\n";
     
    549551                                            # else script launch fails when there are error msgs
    550552            $vbScript = &FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");
    551             $vbScript = "CScript //Nologo \"$vbScript\"";   # launche with CScript for error output in STDERR
     553            $vbScript = "CScript //Nologo \"$vbScript\"";   # launch with CScript for error output in STDERR
    552554                                    # //Nologo flag avoids Microsoft's opening/logo msgs
    553555            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
  • main/trunk/greenstone2/perllib/plugins/PagedImagePlugin.pm

    r27509 r28355  
    407407    my ($filename) = @_;
    408408
    409     open (ITEMFILE, $filename) || die "couldn't open $filename\n";
     409    open (ITEMFILE, "<:encoding(UTF-8)", $filename) || die "couldn't open $filename\n";
    410410    my $backup_filename = "backup.item";
    411411    open (BACKUP,">$backup_filename")|| die "couldn't write to $backup_filename\n";
     412    binmode(BACKUP, ":utf8");
    412413    my $line = "";
    413414    $line = <ITEMFILE>;
    414     $line =~ s/^\xEF\xBB\xBF//; # strip BOM
    415     $line =~ s/\x0B+//ig;
     415    #$line =~ s/^\xEF\xBB\xBF//; # strip BOM in text file read in as a sequence of bytes (not unicode aware strings)
     416    $line =~ s/^\x{FEFF}//; # strip BOM in file opened *as UTF-8*. Strings in the file just read in are now unicode-aware,
     417                            # this means the BOM is now a unicode codepoint instead of a byte sequence
     418                            # See http://en.wikipedia.org/wiki/Byte_order_mark and http://perldoc.perl.org/5.14.0/perlunicode.html
     419    $line =~ s/\x{0B}+//ig; # removing \vt-vertical tabs using the unicode codepoint for \vt
    416420    $line =~ s/&/&amp;/g;
    417421    print BACKUP ($line);
    418422    #Tidy up the item file some metadata title contains \vt-vertical tab
    419423    while ($line = <ITEMFILE>) {
    420     $line =~ s/\x0B+//ig;
     424    $line =~ s/\x{0B}+//ig; # removing \vt-vertical tabs using the unicode codepoint for \vt
    421425    $line =~ s/&/&amp;/g;
    422426    print BACKUP ($line);
     
    657661    my $self = shift (@_);
    658662    my ($filename_full_path, $dir, $block_hash) = @_;
     663
    659664
    660665    open (ITEMFILE, $filename_full_path) || die "couldn't open $filename_full_path to work out which files to block\n";
Note: See TracChangeset for help on using the changeset viewer.