Changeset 10273


Ignore:
Timestamp:
2005-07-25T11:23:27+12:00 (19 years ago)
Author:
chi
Message:

A modification to allow a secondary-plugin setting through ConvertToPlug.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/PDFPlug.pm

    r10218 r10273  
    3232
    3333sub BEGIN {
    34 @PDFPlug::ISA = ('ConvertToPlug');
     34    @PDFPlug::ISA = ('ConvertToPlug');
    3535}
    3636
     
    8383
    8484
    85     if ($self->{"use_sections"}) {
    86     $self->{"description_tags"} = 1;
    87     }
     85    #if ($self->{"use_sections"}) {
     86    #$self->{"description_tags"} = 1;
     87    #}
    8888
    8989    # these are passed through to gsConvert.pl by ConvertToPlug.pm
     
    9494    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
    9595
    96     # pdftohtml will always produce html files encoded as utf-8
     96    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
     97
     98    if (!defined $secondary_plugin_options->{'HTMLPlug'}) {
     99    $secondary_plugin_options->{'HTMLPlug'} = [];
     100    }
     101    if (!defined $secondary_plugin_options->{'TEXTPlug'}) {
     102    $secondary_plugin_options->{'TEXTPlug'} = [];
     103    }
     104
     105    my $html_options = $secondary_plugin_options->{'HTMLPlug'};
     106    my $text_options = $secondary_plugin_options->{'TEXTPlug'};
     107
    97108    if ($self->{'input_encoding'} eq "auto") {
     109    # pdftohtml will always produce html files encoded as utf-8
     110    # => restrict primary PDFPlug and secondary HTML plugin to use
     111    # utf8 and extract language.
     112
    98113    $self->{'input_encoding'} = "utf8";
    99114    $self->{'extract_language'} = 1;
    100     }
    101 
    102     return bless $self, $class;
     115
     116    push(@$html_options,"-input_encoding", "utf8");
     117    push(@$html_options,"-extract_language");
     118    }
     119
     120    # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
     121    # to extract these metadata fields from the HEAD META fields
     122    push(@$html_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
     123
     124    if ($self->{'use_sections'}) {
     125    $self->{'description_tags'} = 1;
     126    push(@$html_options,"-description_tags");
     127    }
     128
     129    # following title_sub removes "Page 1" added by pdftohtml, and a leading
     130    # "1", which is often the page number at the top of the page. Bad Luck
     131    # if your document title actually starts with "1 " - is there a better way?
     132    push(@$html_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
     133    push(@$text_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
     134
     135    $self = bless $self, $class;
     136
     137    $self->load_secondary_plugins($class,$secondary_plugin_options);
     138
     139    return $self;
    103140}
    104141
     
    114151}
    115152   
    116 
    117 # do plugin specific processing of doc_obj for HTML type
    118 sub process {
     153sub convert_post_process
     154{
    119155    my $self = shift (@_);
    120     #my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
     156    my ($conv_filename) = @_;
    121157
    122158    my $outhandle=$self->{'outhandle'};
    123159
    124     my $textref=$_[0];
     160    my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);
     161
     162    # read in file ($text will be in utf8)
     163    my $text = "";
     164    $self->read_file ($conv_filename, $encoding, $language, \$text);
     165
     166    # Calculate number of pages based on <a ...> tags (we have an <a name=1> etc
     167    # for each page).  Metadata based on this calculation is not set until process().
     168    #
     169    # Note: this is done even if we are not breaking the document into pages, as it might
     170    # be useful to give an indication of document length in the browser through setting
     171    # num_pages as metadata.
     172
     173    my @pages = ($text =~ /\<[Aa] name=\"?\w+\"?>/ig);
     174    my $num_pages = scalar(@pages);
     175    $self->{'num_pages'} = $num_pages;
    125176
    126177    if ($self->{'use_sections'}
     
    131182    # we have "<a name=1></a>" etc for each page
    132183    # it may be <A name=
    133     my @sections = split('<[Aa] name=', $$textref);
     184    my @sections = split('<[Aa] name=', $text);
     185
     186    my $top_section = "";
    134187
    135188    if (scalar (@sections) == 1) { #only one section - no split!
    136189        print $outhandle "PDFPlug: warning - no sections found\n";
    137190    } else {
    138         shift @sections; # don't need HTML header, etc
     191        $top_section .= shift @sections; # keep HTML header etc as top_section
    139192    }
    140193
     
    154207    $title =~ s/\s\S*$/.../;
    155208
    156     my $top_section;
     209
    157210    if (scalar (@sections) == 1) { # no sections found
    158         $top_section=$sections[0];
     211        $top_section .= $sections[0];
    159212        @sections=();
    160213    } else {
    161         $top_section = "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
     214        $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
    162215    }
    163216
     
    180233    }
    181234
    182     $$textref=join('', ($top_section, @sections));
     235    $text=join('', ($top_section, @sections));
    183236    }
    184237
    185238    # turn any high bytes that aren't valid utf-8 into utf-8.
    186     unicode::ensure_utf8($textref);
    187 
    188     print $outhandle "PDFPlug: passing $_[3] on to $self->{'converted_to'}Plug\n"
    189     if $self->{'verbosity'} > 1;
    190     print STDERR "<Processing n='$_[3]' p='PDFPlug'>\n" if ($_[6]);
    191 
    192     # tell htmlplug to extract these metadata fields from the HEAD META fields
    193     $self->{'metadata_fields'} .= ",date,author<Creator>";
    194 
    195     my $result = ConvertToPlug::process_type($self,"pdf",@_);
    196 
    197     #my $doc_obj = pop(@_);
    198     my $doc_obj = $_[5];
     239    unicode::ensure_utf8(\$text);
     240
     241    # Write it out again!
     242    $self->utf8_write_file (\$text, $conv_filename);
     243}
     244
     245
     246# do plugin specific processing of doc_obj for HTML type
     247sub process {
     248    my $self = shift (@_);
     249    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
     250
     251    my $result = $self->process_type("pdf",$base_dir,$file,$doc_obj);
     252
    199253    # fix up the extracted date metadata to be in Greenstone date format,
    200254    # and fix the capitalisation of 'date'
     
    220274    }
    221275
    222     # Add NumPages metadata (we have "<a name=1>" etc for each page)
    223     my @pages = ($$textref =~ /\<[Aa] name=\"?\w+\"?>/ig);
    224     $doc_obj->add_utf8_metadata($cursection, "NumPages", scalar(@pages));
     276    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'});
     277
    225278   
    226279    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
     
    229282    $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
    230283    }
     284
    231285    return $result;
    232286}
Note: See TracChangeset for help on using the changeset viewer.