Changeset 30201 for gs2-extensions


Ignore:
Timestamp:
09/04/15 17:29:04 (5 years ago)
Author:
davidb
Message:

Zach's hard work improving OCR in Greenstone

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs2-extensions/ocr/trunk/perllib/plugins/OCRImageConverter.pm

    r30110 r30201  
    44
    55use strict;
     6use warnings;
    67no strict 'refs'; # allow filehandles to be variables and vice versa
    78no strict 'subs'; # allow barewords (eg STDERR) as function arguments
     
    1718    @OCRImageConverter::ISA = ('BaseMediaConverter');
    1819
    19     # Check that PDFBox is installed and available on the path
     20    # Check that Tesseract is installed and available on the path
    2021    $ocrimage_conversion_available = 1;
    2122   
     
    5556            }
    5657        }
    57     }
    58 
    59 }
    60 
    61 my $arguments = [ ];
     58
     59        my $cpan = &FileUtils::filenameConcatenate($gextocr_home, "installed", "cmdline", "cpan", "lib64", "perl5");
     60        &gsprintf(STDERR, "OCRImageConverter: CPAN directory $cpan\n");
     61        if (! -d $cpan) {
     62            $ocrimage_conversion_available = 0;
     63            $no_ocrimage_conversion_reason = "gexthtmltokenotinstalled";
     64        } else {
     65            push(@INC, $cpan);
     66        }
     67    }
     68}
     69
     70use HTML::TokeParser;
     71
     72my $layout_list = [
     73    {
     74        'name' => "none",
     75        'desc' => "{OCRImageConverter.unpaper_layout.none}"
     76    },
     77    {
     78        'name' => "single",
     79        'desc' => "{OCRImageConverter.unpaper_layout.single}"
     80    },
     81    {
     82        'name' => "double",
     83        'desc' => "{OCRImageConverter.unpaper_layout.double}"
     84    }
     85];
     86
     87my $arguments = [
     88    {
     89        'name' => 'use_unpaper',
     90        'desc' => "{OCRImageConverter.use_unpaper}",
     91        'type' => "flag",
     92        'deft' => 0,
     93        'reqd' => "no"
     94    },
     95    {
     96        'name' => 'unpaper_layout',
     97        'desc' => "{OCRImageConverter.unpaper_layout}",
     98        'type' => "enum",
     99        'reqd' => "yes",
     100        'list' => $layout_list,
     101        'deft' => "none"
     102    }
     103];
     104
     105my $opt_diva_args = [
     106    {
     107        'name' => 'diva_support',
     108        'desc' => "{OCRImageConverter.diva_support}",
     109        'type' => "flag",
     110        'deft' => "yes",
     111        'reqd' => "no"
     112    }
     113];
    62114
    63115my $options = {
     
    74126    push(@$pluginlist, $class);
    75127
     128    if (defined $ENV{'GEXTDIVA'}) {
     129        push(@$arguments, @$opt_diva_args);
     130    }
     131
    76132    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    77133    push(@{$hashArgOptLists->{"OptList"}}, $options);
    78 
    79134
    80135    my $self = new BaseMediaConverter($pluginlist, $inputargs, $hashArgOptLists, $auxilary);
     
    121176    my ($source_file_full_path) = @_;
    122177   
    123     print STDERR "***** WE ARE WORKING *****\n";
    124    
    125178    # TODO cache
    126179    if (lc(substr($source_file_full_path, length($source_file_full_path) - 4)) eq ".gif") {
     180        my $desired_extension = "tif";
     181        if ($self->{'use_unpaper'}) {
     182            $desired_extension = "pnm";
     183        }
     184       
    127185        # need to convert to another format first
    128186        #my $target_source = substr($source_file_full_path, length($source_file_full_path) - 4) . ".tif";
    129         my $converted_file = &util::get_tmp_filename("tif");
     187        my $converted_file = &util::get_tmp_filename($desired_extension);
    130188        push(@{$self->{'ocrtmp_file_paths'}}, $converted_file);
    131189       
     
    134192       
    135193        print STDERR "*** Magick command: $imagick_cmd\n";
     194       
     195        if ($self->{'use_unpaper'}) {
     196            my $unpaper_cmd = "unpaper \"$converted_file\" \"$converted_file\" --overwrite --layout " . $self->{'unpaper_layout'};
     197            system($unpaper_cmd);
     198            print STDERR "*** unpaper command: $unpaper_cmd\n";
     199        }
    136200       
    137201        $source_file_full_path = $converted_file;
     
    175239   
    176240    my ($regenerated, $result, $had_error) = $self->autorun_general_cmd($convert_cmd, $source_file_full_path, $target_file_path, $print_info);
     241    $self->autorun_general_cmd($convert_cmd . " hocr", $source_file_full_path, $target_file_path, $print_info);
     242   
     243    # HOCR
     244    my $hocr = substr($target_file_path, 0, length($target_file_path) - 4) . ".hocr";
     245   
     246    my $json = "[";
     247
     248    open(my $fh, $hocr) || die "Can't open HOCR file $hocr!";
     249    #open(my $fh, $hocr);
     250    my $parser = HTML::TokeParser->new($fh);
     251   
     252    my $state = "idle";
     253    my $bbox = "";
     254    my $text = "";
     255   
     256    while (my $token = $parser->get_token()) {
     257        if ($state eq "idle" && $token->[0] eq "S" && $token->[1] eq "span") {
     258            $state = "inSpan";
     259            $bbox = $token->[2]{'title'};
     260        }
     261       
     262        if ($state eq "inSpan" && $token->[0] eq "T") {
     263#            $state = "gotText";
     264            $text = $token->[1];
     265            $state = "gotText";
     266            chomp $text;
     267        }
     268       
     269        if ($state eq "gotText" && $token->[0] eq "E") {
     270            $state = "idle";
     271            #next unless $text =~ /\w/;
     272           
     273            my @numbers = $bbox =~ /(\d+)/g;
     274            $json .= "[\"$text\",[${numbers[0]},${numbers[1]},${numbers[2]},${numbers[3]}]],";
     275        }
     276    }
     277
     278    close($fh);
     279
     280    $json = substr($json, 0, length($json) - 1); # remove trailing comma
     281    $json .= "]";
     282
     283    open(my $oh, '>', $hocr . '.json');
     284    print $oh "$json";
     285    close($oh);
     286   
     287    # Diva.js
     288    #if (defined $EXT{'GEXTDIVA'}) {
     289    #    if ($self->{'diva_support'}) {
     290    #        my $webroot = &FileUtils::filenameConcatenate($ENV{'GSDL3HOME'}, '..', 'packages', 'tomcat', 'webapps', 'iipsrv', 'collect', '<HASH>');
     291    #        my $processpy = &FileUtils::filenameConcatenate($ENV{'GEXTDIVA_INSTALLED'}, 'cmdline', 'python');
     292    #        my $DS = &util::get_dirsep();
     293    #        my $convert = `which convert`;
     294    #       
     295    #        $print_info = {
     296    #            'message_prefix' => "Diva.js processing",
     297    #            'message' => "Now processing images for use with Diva.js..."
     298    #        };
     299    #
     300    #        my $process_cmd = "python \"$processpy\" \"<IN>\" \"$webroot${DS}img\" \"$webroot${DS}data\" -t tiff -i \"$convert\"";
     301    #        $self->autorun_general_cmd($process_cmd, "<IN>", $print_info);
     302    #    }
     303    #}
    177304   
    178305    if ($had_error) {
     
    181308        return (1, $result, $target_file_path);
    182309    }
     310}
     311
     312sub test {
     313    my $self = shift(@_);
     314    print STDERR "**** test working\n";
    183315}
    184316
     
    210342}
    211343
    212 1; 
     3441;
Note: See TracChangeset for help on using the changeset viewer.