Changeset 32280

Show
Ignore:
Timestamp:
17.07.2018 20:40:57 (13 months ago)
Author:
ak19
Message:

Implementing PDFv2paged_text (with pdfbox)

Location:
main/trunk/greenstone2/perllib
Files:
4 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm

    r32277 r32280  
    170170    $self->{'convert_to_plugin'} = "StructuredHTMLPlugin"; 
    171171    $self->{'convert_to_ext'} = "html"; 
    172     } elsif ($convert_to =~ /^pagedimg/) { 
     172    } elsif ($convert_to =~ /^pagedimg/ || $convert_to eq "paged_text") { 
    173173    $self->{'convert_to_plugin'} = "PagedImagePlugin"; 
    174     my ($convert_to_ext) = $convert_to =~ /pagedimg\_(jpg|gif|png)/i; 
    175     $convert_to_ext = 'jpg' unless defined $convert_to_ext; 
    176     $self->{'convert_to_ext'} = $convert_to_ext; 
     174    if($convert_to eq "paged_text") { 
     175        $self->{'convert_to_ext'} = "txt"; 
     176    } else { 
     177        my ($convert_to_ext) = $convert_to =~ /pagedimg(?:txt)?\_(jpg|gif|png)/i; # the ?: prefix makes the optional 'txt' group in 'pagedimgtxt' non-capturing, 
     178        # so that the only portion captured is the one we actually want: the img type 
     179        $convert_to_ext = 'jpg' unless defined $convert_to_ext; 
     180        $self->{'convert_to_ext'} = $convert_to_ext; 
     181    } 
    177182    } 
    178183} 
  • main/trunk/greenstone2/perllib/plugins/PDFv2Plugin.pm

    r32277 r32280  
    174174    # TODO 
    175175    print STDERR "@@@ Conversion to " . $self->{'convert_to'} , " with Xpdf Tools is not yet implemented.\n"; 
    176     print STDERR "@@@ Converting to a text instead.\n"; 
    177     $self->{'convert_to'} = "text"; 
     176    #print STDERR "@@@ Converting to text instead.\n"; 
     177    #$self->{'convert_to'} = "text"; 
    178178    } 
    179179     
  • main/trunk/greenstone2/perllib/plugins/PrintInfo.pm

    r25957 r32280  
    175175    } 
    176176    $self->print_xml($high_level_information_only); 
     177#    print STDERR "\n@@@ BLA\n\n"; 
    177178} 
    178179 
  • main/trunk/greenstone2/perllib/util.pm

    r32193 r32280  
    17291729    sub page_number { 
    17301730    my ($dir) = @_; 
    1731     my ($pagenum) =($dir =~ m/^.*?[-\.]?(\d+)(\.(jpg|gif|png))?$/i); 
     1731    my ($pagenum) =($dir =~ m/^.*?[-\.]?(\d+)(\.(jpg|gif|png|txt))?$/i); 
    17321732#   my ($pagenum) =($dir =~ m/(\d+)(\.(jpg|gif|png))?$/i); # this works but is not as safe/strict about input filepatterns as the above 
    17331733 
     
    17631763    my $hasTxtFile = &FileUtils::fileExists($txtfilename); 
    17641764 
    1765     foreach my $file (@dir_files){ 
    1766     if ($file !~ /\.item/i && $file !~ /\.txt/i){ 
     1765    # Write out the elements of the item file. 
     1766    # We could be dealing with 3 types of conversion output formats: txt only (paged_text), 
     1767    # images only (pagedimg_) and images AND text (pagedimgtxt_). 
     1768    foreach my $file (@dir_files) { 
     1769    if ($file !~ /\.item/i) { 
    17671770        $page_num = page_number($file); 
    17681771        $page_num++ if $starts_at_0; # image numbers start at 0, so add 1 
    1769         if($hasTxtFile) { 
    1770         print $item_fh "   <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"$page_num.txt\"/>\n"; 
    1771         } else { 
    1772         print $item_fh "   <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n"; 
     1772         
     1773        if ($convert_to eq "txt") { # output format is paged_text, which has no images 
     1774        if ($file =~ m/\.txt/i) { # check only txt files (should be all there is, besides the skipped .item file) 
     1775            print $item_fh "   <Page pagenum=\"$page_num\" imgfile=\"\" txtfile=\"$page_num.txt\"/>\n"; 
     1776        } # else, some non-txt file ext, skip 
    17731777        } 
    1774     } 
    1775     } 
     1778        else { # either pagedimg or pagedimgtxt output mode 
     1779        if($file !~ /\.txt/i) { # check only img files, skip any matching txt files 
     1780            if($hasTxtFile) { # if every image has a matching txt file, output txtfile too 
     1781            print $item_fh "   <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"$page_num.txt\"/>\n"; 
     1782            } else { # when it's pagedimg only, txtfile is empty 
     1783            print $item_fh "   <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n"; 
     1784            } 
     1785        } 
     1786        } 
     1787    } 
     1788    } 
     1789     
    17761790 
    17771791    print $item_fh "</PagedDocument>\n";