Context Navigation

← Previous Changeset
Next Changeset →

Changeset 32280

Timestamp:

2018-07-17T20:40:57+12:00 (6 years ago)

Author:

ak19

Message:

Implementing the paged_text output mode for PDFv2Plugin (with PDFBox)

Location:

main/trunk/greenstone2/perllib

Files:

4 edited

plugins/ConvertBinaryFile.pm (modified) (1 diff)
plugins/PDFv2Plugin.pm (modified) (1 diff)
plugins/PrintInfo.pm (modified) (1 diff)
util.pm (modified) (2 diffs)

Legend:

Unmodified
Added
Removed

main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm

-              r32277
+              r32280
     $self->{'convert_to_plugin'} = "StructuredHTMLPlugin";
     $self->{'convert_to_ext'} = "html";
     } elsif ($convert_to =~ /^pagedimg/) {
+    } elsif ($convert_to =~ /^pagedimg/ || $convert_to eq "paged_text") {
     $self->{'convert_to_plugin'} = "PagedImagePlugin";
+    my ($convert_to_ext) = $convert_to =~ /pagedimg\_(jpg|gif|png)/i;
+    $convert_to_ext = 'jpg' unless defined $convert_to_ext;
+    $self->{'convert_to_ext'} = $convert_to_ext;
+    if($convert_to eq "paged_text") {
+        $self->{'convert_to_ext'} = "txt";
+    } else {
+        my ($convert_to_ext) = $convert_to =~ /pagedimg(?:txt)?\_(jpg|gif|png)/i; # (?:txt)? is a non-capturing group: it matches the optional 'txt' in 'pagedimgtxt' without capturing it,
+        # so that the only capture is the portion we actually want: the image type
+        $convert_to_ext = 'jpg' unless defined $convert_to_ext;
+        $self->{'convert_to_ext'} = $convert_to_ext;
+    }
+    }
+}

main/trunk/greenstone2/perllib/plugins/PDFv2Plugin.pm

-              r32277
+              r32280
     # TODO
     print STDERR "@@@ Conversion to " . $self->{'convert_to'} , " with Xpdf Tools is not yet implemented.\n";
     print STDERR "@@@ Converting to a text instead.\n";
     $self->{'convert_to'} = "text";
+    #print STDERR "@@@ Converting to text instead.\n";
+    #$self->{'convert_to'} = "text";
+    }

main/trunk/greenstone2/perllib/plugins/PrintInfo.pm

r25957	r32280
175	175	}
176	176	$self->print_xml($high_level_information_only);
	177	# print STDERR "\n@@@ BLA\n\n";
177	178	}
178	179

main/trunk/greenstone2/perllib/util.pm

-              r32193
+              r32280
     sub page_number {
     my ($dir) = @_;
     my ($pagenum) =($dir =~ m/^.*?[-\.]?(\d+)(\.(jpg|gif|png))?$/i);
+    my ($pagenum) =($dir =~ m/^.*?[-\.]?(\d+)(\.(jpg|gif|png|txt))?$/i);
 #   my ($pagenum) =($dir =~ m/(\d+)(\.(jpg|gif|png))?$/i); # this works but is not as safe/strict about input filepatterns as the above
 …
     my $hasTxtFile = &FileUtils::fileExists($txtfilename);
+    foreach my $file (@dir_files){
+    if ($file !~ /\.item/i && $file !~ /\.txt/i){
+    # Write out the elements of the item file.
+    # We could be dealing with 3 types of conversion output formats: txt only (paged_text),
+    # images only (pagedimg_) and images AND text (pagedimgtxt_).
+    foreach my $file (@dir_files) {
+    if ($file !~ /\.item/i) {
         $page_num = page_number($file);
         $page_num++ if $starts_at_0; # image numbers start at 0, so add 1
+        if($hasTxtFile) {
+        print $item_fh "   <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"$page_num.txt\"/>\n";
+        } else {
+        print $item_fh "   <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n";
+        if ($convert_to eq "txt") { # output format is paged_text, which has no images
+        if ($file =~ m/\.txt/i) { # check only txt files (should be all there is, besides the skipped .item file)
+            print $item_fh "   <Page pagenum=\"$page_num\" imgfile=\"\" txtfile=\"$page_num.txt\"/>\n";
+        } # else, some non-txt file ext, skip
+        }
+    }
+    }
+        else { # either pagedimg or pagedimgtxt output mode
+        if($file !~ /\.txt/i) { # check only img files, skip any matching txt files
+            if($hasTxtFile) { # if every image has a matching txt file, output txtfile too
+            print $item_fh "   <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"$page_num.txt\"/>\n";
+            } else { # when it's pagedimg only, txtfile is empty
+            print $item_fh "   <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n";
+            }
+        }
+        }
+    }
+    }
     print $item_fh "</PagedDocument>\n";

Note: See TracChangeset for help on using the changeset viewer.