Ignore:
Timestamp:
07/17/18 20:40:57 (2 years ago)
Author:
ak19
Message:

Implementing PDFv2paged_text (with pdfbox)

Location:
main/trunk/greenstone2/perllib
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm

    r32277 r32280  
    170170    $self->{'convert_to_plugin'} = "StructuredHTMLPlugin";
    171171    $self->{'convert_to_ext'} = "html";
    172     } elsif ($convert_to =~ /^pagedimg/) {
     172    } elsif ($convert_to =~ /^pagedimg/ || $convert_to eq "paged_text") {
    173173    $self->{'convert_to_plugin'} = "PagedImagePlugin";
    174     my ($convert_to_ext) = $convert_to =~ /pagedimg\_(jpg|gif|png)/i;
    175     $convert_to_ext = 'jpg' unless defined $convert_to_ext;
    176     $self->{'convert_to_ext'} = $convert_to_ext;
     174    if($convert_to eq "paged_text") {
     175        $self->{'convert_to_ext'} = "txt";
     176    } else {
     177        my ($convert_to_ext) = $convert_to =~ /pagedimg(?:txt)?\_(jpg|gif|png)/i; # the (?:...) group matches the optional 'txt' in 'pagedimgtxt' without capturing it,
     178        # so that the only captured portion is the one we actually want: the img type
     179        $convert_to_ext = 'jpg' unless defined $convert_to_ext;
     180        $self->{'convert_to_ext'} = $convert_to_ext;
     181    }
    177182    }
    178183}
  • main/trunk/greenstone2/perllib/plugins/PDFv2Plugin.pm

    r32277 r32280  
    174174    # TODO
    175175    print STDERR "@@@ Conversion to " . $self->{'convert_to'} , " with Xpdf Tools is not yet implemented.\n";
    176     print STDERR "@@@ Converting to a text instead.\n";
    177     $self->{'convert_to'} = "text";
     176    #print STDERR "@@@ Converting to text instead.\n";
     177    #$self->{'convert_to'} = "text";
    178178    }
    179179   
  • main/trunk/greenstone2/perllib/plugins/PrintInfo.pm

    r25957 r32280  
    175175    }
    176176    $self->print_xml($high_level_information_only);
     177#    print STDERR "\n@@@ BLA\n\n";
    177178}
    178179
  • main/trunk/greenstone2/perllib/util.pm

    r32193 r32280  
    17291729    sub page_number {
    17301730    my ($dir) = @_;
    1731     my ($pagenum) =($dir =~ m/^.*?[-\.]?(\d+)(\.(jpg|gif|png))?$/i);
     1731    my ($pagenum) =($dir =~ m/^.*?[-\.]?(\d+)(\.(jpg|gif|png|txt))?$/i);
    17321732#   my ($pagenum) =($dir =~ m/(\d+)(\.(jpg|gif|png))?$/i); # this works but is not as safe/strict about input filepatterns as the above
    17331733
     
    17631763    my $hasTxtFile = &FileUtils::fileExists($txtfilename);
    17641764
    1765     foreach my $file (@dir_files){
    1766     if ($file !~ /\.item/i && $file !~ /\.txt/i){
     1765    # Write out the elements of the item file.
     1766    # We could be dealing with 3 types of conversion output formats: txt only (paged_text),
     1767    # images only (pagedimg_) and images AND text (pagedimgtxt_).
     1768    foreach my $file (@dir_files) {
     1769    if ($file !~ /\.item/i) {
    17671770        $page_num = page_number($file);
    17681771        $page_num++ if $starts_at_0; # image numbers start at 0, so add 1
    1769         if($hasTxtFile) {
    1770         print $item_fh "   <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"$page_num.txt\"/>\n";
    1771         } else {
    1772         print $item_fh "   <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n";
     1772       
     1773        if ($convert_to eq "txt") { # output format is paged_text, which has no images
     1774        if ($file =~ m/\.txt/i) { # check only txt files (should be all there is, besides the skipped .item file)
     1775            print $item_fh "   <Page pagenum=\"$page_num\" imgfile=\"\" txtfile=\"$page_num.txt\"/>\n";
     1776        } # else, some non-txt file ext, skip
    17731777        }
    1774     }
    1775     }
     1778        else { # either pagedimg or pagedimgtxt output mode
     1779        if($file !~ /\.txt/i) { # check only img files, skip any matching txt files
     1780            if($hasTxtFile) { # if every image has a matching txt file, output txtfile too
     1781            print $item_fh "   <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"$page_num.txt\"/>\n";
     1782            } else { # when it's pagedimg only, txtfile is empty
     1783            print $item_fh "   <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n";
     1784            }
     1785        }
     1786        }
     1787    }
     1788    }
     1789   
    17761790
    17771791    print $item_fh "</PagedDocument>\n";
Note: See TracChangeset for help on using the changeset viewer.