Ignore:
Timestamp:
2018-07-17T20:40:57+12:00 (6 years ago)
Author:
ak19
Message:

Implementing PDFv2paged_text (with pdfbox)

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/util.pm

    r32193 r32280  
    17291729    sub page_number {
    17301730    my ($dir) = @_;
    1731     my ($pagenum) =($dir =~ m/^.*?[-\.]?(\d+)(\.(jpg|gif|png))?$/i);
     1731    my ($pagenum) =($dir =~ m/^.*?[-\.]?(\d+)(\.(jpg|gif|png|txt))?$/i);
    17321732#   my ($pagenum) =($dir =~ m/(\d+)(\.(jpg|gif|png))?$/i); # this works but is not as safe/strict about input filepatterns as the above
    17331733
     
    17631763    my $hasTxtFile = &FileUtils::fileExists($txtfilename);
    17641764
    1765     foreach my $file (@dir_files){
    1766     if ($file !~ /\.item/i && $file !~ /\.txt/i){
     1765    # Write out the elements of the item file.
     1766    # We could be dealing with 3 types of conversion output formats: txt only (paged_text),
     1767    # images only (pagedimg_) and images AND text (pagedimgtxt_).
     1768    foreach my $file (@dir_files) {
     1769    if ($file !~ /\.item/i) {
    17671770        $page_num = page_number($file);
    17681771        $page_num++ if $starts_at_0; # image numbers start at 0, so add 1
    1769         if($hasTxtFile) {
    1770         print $item_fh "   <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"$page_num.txt\"/>\n";
    1771         } else {
    1772         print $item_fh "   <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n";
     1772       
     1773        if ($convert_to eq "txt") { # output format is paged_text, which has no images
     1774        if ($file =~ m/\.txt/i) { # check only txt files (should be all there is, besides the skipped .item file)
     1775            print $item_fh "   <Page pagenum=\"$page_num\" imgfile=\"\" txtfile=\"$page_num.txt\"/>\n";
     1776        } # else, some non-txt file ext, skip
    17731777        }
    1774     }
    1775     }
     1778        else { # either pagedimg or pagedimgtxt output mode
     1779        if($file !~ /\.txt/i) { # check only img files, skip any matching txt files
     1780            if($hasTxtFile) { # if every image has a matching txt file, output txtfile too
     1781            print $item_fh "   <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"$page_num.txt\"/>\n";
     1782            } else { # when it's pagedimg only, txtfile is empty
     1783            print $item_fh "   <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n";
     1784            }
     1785        }
     1786        }
     1787    }
     1788    }
     1789   
    17761790
    17771791    print $item_fh "</PagedDocument>\n";
Note: See TracChangeset for help on using the changeset viewer.