Changeset 28355

Show
Ignore:
Timestamp:
04.10.2013 19:52:18 (6 years ago)
Author:
ak19
Message:

1. gsConvert.pl now calls the new pptextract.vbs VBScript (which creates .item files and ppt slide.txt files in UTF-8) instead of the older VB pptextract.exe executable, which created .item and slide.txt files in the Windows default UTF-16 LE. 2. PagedImagePlugin.pm::tidy_item_file now reads in the .item files in UTF-8 mode, so that its strings are Unicode-aware. Substitutions now operate on Unicode code points instead of byte sequences, since the strings read from the file are now Unicode-aware.

Location:
main/trunk/greenstone2
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/bin/script/gsConvert.pl

    r28166 r28355  
    386386    my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", 
    387387                       $ENV{'GSDLOS'}, "pptextract"); 
    388     $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ m/^windows$/i); 
     388    $vbScript = "CScript //Nologo \"".$vbScript.".vbs\"" if ($ENV{'GSDLOS'} =~ m/^windows$/i); # now we use the .vbs VBScript 
     389    # $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ m/^windows$/i); # back when the pptextract.exe VB executable was used 
    389390             
    390391    my $cmd = ""; 
     
    398399        $cmd .= " 2>\"$output_filestem.err\"" 
    399400        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000); 
     401 
    400402        if (system($cmd) !=0) { 
    401403        print STDERR "Powerpoint VB Scripting convert failed\n"; 
     
    549551                                            # else script launch fails when there are error msgs 
    550552            $vbScript = &FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");  
    551             $vbScript = "CScript //Nologo \"$vbScript\"";   # launche with CScript for error output in STDERR 
     553            $vbScript = "CScript //Nologo \"$vbScript\"";   # launch with CScript for error output in STDERR 
    552554                                    # //Nologo flag avoids Microsoft's opening/logo msgs 
    553555            print STDERR "About to use windows scripting to process docx file $input_filename.\n"; 
  • main/trunk/greenstone2/perllib/plugins/PagedImagePlugin.pm

    r27509 r28355  
    407407    my ($filename) = @_; 
    408408 
    409     open (ITEMFILE, $filename) || die "couldn't open $filename\n"; 
     409    open (ITEMFILE, "<:encoding(UTF-8)", $filename) || die "couldn't open $filename\n"; 
    410410    my $backup_filename = "backup.item"; 
    411411    open (BACKUP,">$backup_filename")|| die "couldn't write to $backup_filename\n"; 
     412    binmode(BACKUP, ":utf8"); 
    412413    my $line = ""; 
    413414    $line = <ITEMFILE>; 
    414     $line =~ s/^\xEF\xBB\xBF//; # strip BOM 
    415     $line =~ s/\x0B+//ig; 
     415    #$line =~ s/^\xEF\xBB\xBF//; # strip BOM in text file read in as a sequence of bytes (not unicode aware strings) 
     416    $line =~ s/^\x{FEFF}//; # strip BOM in file opened *as UTF-8*. Strings in the file just read in are now unicode-aware, 
     417                            # this means the BOM is now a unicode codepoint instead of a byte sequence 
     418                            # See http://en.wikipedia.org/wiki/Byte_order_mark and http://perldoc.perl.org/5.14.0/perlunicode.html  
     419    $line =~ s/\x{0B}+//ig; # removing \vt-vertical tabs using the unicode codepoint for \vt 
    416420    $line =~ s/&/&amp;/g; 
    417421    print BACKUP ($line); 
    418422    #Tidy up the item file some metadata title contains \vt-vertical tab 
    419423    while ($line = <ITEMFILE>) { 
    420     $line =~ s/\x0B+//ig; 
     424    $line =~ s/\x{0B}+//ig; # removing \vt-vertical tabs using the unicode codepoint for \vt 
    421425    $line =~ s/&/&amp;/g; 
    422426    print BACKUP ($line); 
     
    657661    my $self = shift (@_); 
    658662    my ($filename_full_path, $dir, $block_hash) = @_; 
     663 
    659664 
    660665    open (ITEMFILE, $filename_full_path) || die "couldn't open $filename_full_path to work out which files to block\n";