Context Navigation

← Previous Changeset
Next Changeset →

Changeset 10273

Timestamp:

2005-07-25T11:23:27+12:00 (19 years ago)

Author:

chi

Message:

A modification to allow a secondary-plugin setting through ConvertToPlug.

File:

1 edited

trunk/gsdl/perllib/plugins/PDFPlug.pm (modified) (9 diffs)

Legend:

Unmodified
Added
Removed

trunk/gsdl/perllib/plugins/PDFPlug.pm

-              r10218
+              r10273
 sub BEGIN {
 @PDFPlug::ISA = ('ConvertToPlug');
+    @PDFPlug::ISA = ('ConvertToPlug');
+}
 …
     if ($self->{"use_sections"}) {
     $self->{"description_tags"} = 1;
+    }
+    #if ($self->{"use_sections"}) {
+    #$self->{"description_tags"} = 1;
+    #}
     # these are passed through to gsConvert.pl by ConvertToPlug.pm
 …
     $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
+    # pdftohtml will always produce html files encoded as utf-8
+    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
+    if (!defined $secondary_plugin_options->{'HTMLPlug'}) {
+    $secondary_plugin_options->{'HTMLPlug'} = [];
+    }
+    if (!defined $secondary_plugin_options->{'TEXTPlug'}) {
+    $secondary_plugin_options->{'TEXTPlug'} = [];
+    }
+    my $html_options = $secondary_plugin_options->{'HTMLPlug'};
+    my $text_options = $secondary_plugin_options->{'TEXTPlug'};
     if ($self->{'input_encoding'} eq "auto") {
+    # pdftohtml will always produce html files encoded as utf-8
+    # => restrict primary PDFPlug and secondary HTML plugin to use
+    # utf8 and extract language.
     $self->{'input_encoding'} = "utf8";
     $self->{'extract_language'} = 1;
+    }
+    return bless $self, $class;
+    push(@$html_options,"-input_encoding", "utf8");
+    push(@$html_options,"-extract_language");
+    }
+    # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
+    # to extract these metadata fields from the HEAD META fields
+    push(@$html_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
+    if ($self->{'use_sections'}) {
+    $self->{'description_tags'} = 1;
+    push(@$html_options,"-description_tags");
+    }
+    # following title_sub removes "Page 1" added by pdftohtml, and a leading
+    # "1", which is often the page number at the top of the page. Bad Luck
+    # if your document title actually starts with "1 " - is there a better way?
+    push(@$html_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
+    push(@$text_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
+    $self = bless $self, $class;
+    $self->load_secondary_plugins($class,$secondary_plugin_options);
+    return $self;
+}
 …
+}
+# do plugin specific processing of doc_obj for HTML type
+sub process {
+sub convert_post_process
+{
     my $self = shift (@_);
     #my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
+    my ($conv_filename) = @_;
     my $outhandle=$self->{'outhandle'};
+    my $textref=$_[0];
+    my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);
+    # read in file ($text will be in utf8)
+    my $text = "";
+    $self->read_file ($conv_filename, $encoding, $language, \$text);
+    # Calculate number of pages based on <a ...> tags (we have a <a name=1> etc
+    # for each page).  Metadata based on this calculation not set until process()
+    #
+    # Note: this is done even if we are not breaking the document into pages as it might
+    # be useful to give an indication of document length in browser through setting
+    # num_pages as metadata.
+    my @pages = ($text =~ /\<[Aa] name=\"?\w+\"?>/ig);
+    my $num_pages = scalar(@pages);
+    $self->{'num_pages'} = $num_pages;
     if ($self->{'use_sections'}
 …
     # we have "<a name=1></a>" etc for each page
     # it may be <A name=
+    my @sections = split('<[Aa] name=', $$textref);
+    my @sections = split('<[Aa] name=', $text);
+    my $top_section = "";
     if (scalar (@sections) == 1) { #only one section - no split!
         print $outhandle "PDFPlug: warning - no sections found\n";
     } else {
         shift @sections; # don't need HTML header, etc
+        $top_section .= shift @sections; # keep HTML header etc as top_section
+    }
 …
     $title =~ s/\s\S*$/.../;
+    my $top_section;
     if (scalar (@sections) == 1) { # no sections found
         $top_section=$sections[0];
+        $top_section .= $sections[0];
         @sections=();
     } else {
         $top_section = "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
+        $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
+    }
 …
+    }
     $$textref=join('', ($top_section, @sections));
+    $text=join('', ($top_section, @sections));
+    }
     # turn any high bytes that aren't valid utf-8 into utf-8.
+    unicode::ensure_utf8($textref);
+    print $outhandle "PDFPlug: passing $_[3] on to $self->{'converted_to'}Plug\n"
+    if $self->{'verbosity'} > 1;
+    print STDERR "<Processing n='$_[3]' p='PDFPlug'>\n" if ($_[6]);
+    # tell htmlplug to extract these metadata fields from the HEAD META fields
+    $self->{'metadata_fields'} .= ",date,author<Creator>";
+    my $result = ConvertToPlug::process_type($self,"pdf",@_);
+    #my $doc_obj = pop(@_);
+    my $doc_obj = $_[5];
+    unicode::ensure_utf8(\$text);
+    # Write it out again!
+    $self->utf8_write_file (\$text, $conv_filename);
+}
+# do plugin specific processing of doc_obj for HTML type
+sub process {
+    my $self = shift (@_);
+    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
+    my $result = $self->process_type("pdf",$base_dir,$file,$doc_obj);
     # fix up the extracted date metadata to be in Greenstone date format,
     # and fix the capitalisation of 'date'
 …
+    }
+    # Add NumPages metadata (we have "<a name=1>" etc for each page)
+    my @pages = ($$textref =~ /\<[Aa] name=\"?\w+\"?>/ig);
+    $doc_obj->add_utf8_metadata($cursection, "NumPages", scalar(@pages));
+    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'});
     if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
 …
     $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
+    }
     return $result;
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 10273

Legend:

trunk/gsdl/perllib/plugins/PDFPlug.pm

Download in other formats: