Context Navigation

← Previous Change
Next Change →

PDFv2Plugin.pm

Timestamp:

2018-07-18T20:30:14+12:00 (6 years ago)

Author:

ak19

Message:

Cleaning up unused strings, some debug statements and recently commented out code.

File:

1 edited

main/trunk/greenstone2/perllib/plugins/PDFv2Plugin.pm (modified) (11 diffs)

Legend:

Unmodified
Added
Removed

main/trunk/greenstone2/perllib/plugins/PDFv2Plugin.pm

-              r32286
+              r32287
 use Mojo::DOM; # for HTML parsing
-#use AutoLoadConverters;
 use PDFBoxConverter;
 use ConvertBinaryFile;
-#@PDFv2Plugin::ISA = ('ConvertBinaryFile', 'AutoLoadConverters', 'ReadTextFile');
 @PDFv2Plugin::ISA = ('ConvertBinaryFile', 'PDFBoxConverter', 'ReadTextFile');
 …
     push(@$pluginlist, $class);
-#    push(@$inputargs,"-title_sub");
-#    push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?');
     push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
     push(@{$hashArgOptLists->{"OptList"}},$options);
-    #    my $auto_converter_self = new AutoLoadConverters($pluginlist,$inputargs,$hashArgOptLists,["PDFBoxConverter"],1);
     my $pdfbox_converter_self = new PDFBoxConverter($pluginlist, $inputargs, $hashArgOptLists);
     my $cbf_self = new ConvertBinaryFile($pluginlist, $inputargs, $hashArgOptLists);
-    #    my $self = BaseImporter::merge_inheritance($auto_converter_self, $cbf_self);
     my $self = BaseImporter::merge_inheritance($pdfbox_converter_self, $cbf_self);
 …
+    }
+    }
-    # if pdfbox_conversion is not on, check convert_to to make sure that xpdftools can
-    # support the selected output format, or fallback on a sensible default
-    # Not all available conversion output options are possible with xpdftools, as some are
-    # only handled by pdfbox. If a format is unavailable with xpdftools, default to pretty_html
-    # if (!$self->{"pdfbox_conversion"}) {
-    #   my $convert_to = $self->{'convert_to'};
-    #   my $fallback_convert_to = $convert_to;
-    #   if($convert_to =~ /^html$/) {
-    #       $fallback_convert_to = "pretty_html";
-    #   }
-    #   elsif ($self->{'convert_to'} =~ /^pagedimg/) {
-    #       $fallback_convert_to = "paged_pretty_html";
-    #   }
-    #   elsif ($self->{'convert_to'} =~ /^paged_text$/) {
-    #       #   print STDERR "@@@ Conversion to " . $self->{'convert_to'} , " with Xpdf Tools is not yet implemented.\n";
-    #       $fallback_convert_to = "text";
-    #   }
-    #   if($convert_to =~ /^(html|pagedimg|paged_text)/) {
-    #       &gsprintf::gsprintf(STDERR, "{PDFv2Plugin.conversion_needs_pdfbox}\n", ($self->{'convert_to'}, $fallback_convert_to));
-    #       $self->{'convert_to'} = $fallback_convert_to;
-    #   }
-    # }
     # set convert_to_plugin and convert_to_ext
 …
     my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};
-    # following title_sub removes "Page 1" added by pdftohtml, and a leading
-    # "1", which is often the page number at the top of the page. Bad Luck
-    # if your document title actually starts with "1 " - is there a better way?
-#    push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
     my $associate_tail_re = $self->{'associate_tail_re'};
     if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
 …
     # ConvertBinaryFile init
     $self->SUPER::init(@_);
-#   $self->AutoLoadConverters::init(@_);
     $self->PDFBoxConverter::init(@_);
 …
     my $self = shift (@_);
-#   $self->AutoLoadConverters::begin(@_);
     $self->PDFBoxConverter::begin(@_);
     $self->SUPER::begin(@_);
 …
     $self->PDFBoxConverter::deinit(@_);
-#   $self->AutoLoadConverters::deinit(@_);
     $self->SUPER::deinit(@_);
 …
+    }
+    # for all other output formats, use pdfbox:
+    #return $self->AutoLoadConverters::tmp_area_convert_file(@_);
+    # Here, we now do what AutoLoadConverters::tmp_area_convert_file(@_) does:
+    # for all other output formats, use pdfbox:
+    # Here, we now do directly what AutoLoadConverters::tmp_area_convert_file(@_)
+    # does with PDFBoxConverter:
     my ($result, $result_str, $new_filename) = $self->PDFBoxConverter::convert($input_filename, $output_ext);
     if (defined $result && $result != 0) {
 …
     $self->xpdftohtml_convert_post_process($conv_filename);
+    }
     else { # use PDFPlugin's usual post processing
+    else { # use original PDFPlugin's usual post processing
     $self->default_convert_post_process($conv_filename);
+    }
 …
     my $title = $sections[0];
     $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
+    $title =~ s/^\"?\w+\"?>//; # specific for old pdftohtml...
     $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
     $title =~ s/<[^>]*>/ /g;
 …
     $title =~ s/\s+$//;
     $title =~ s/\s+/ /gs;
+    $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
+    $title =~ s/^\s+//s; # in case title_sub introduced any...
+    $title =~ s/^\s+//s; # in case title_sub (of old PDFPlugin's old pdftohtml) introduced any... Generally still useful to remove spaces at the start?
     $title = substr ($title, 0, 100);
     $title =~ s/\s\S*$/.../;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 32287 for main/trunk/greenstone2/perllib/plugins/PDFv2Plugin.pm

Legend:

main/trunk/greenstone2/perllib/plugins/PDFv2Plugin.pm

Download in other formats: