Ignore:
Timestamp:
2002-08-29T16:37:54+12:00 (22 years ago)
Author:
jrm21
Message:

PDFPlug now takes a "-use_sections" option, which makes a separate section for each page of the PDF file.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/PDFPlug.pm

    r2979 r3411  
    3939    # if your document title actually starts with "1 " - is there a better way?
    4040
    41     my $self = new ConvertToPlug ($class, @_, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
     41    my @args=();
     42    my $use_sections=0;
     43    while ($#_ > -1) {
     44    my $arg=shift(@_);
     45    if ($arg =~ /^\-use_sections/) {
     46        $use_sections=1;
     47        # in case someone put "-use_sections 1"
     48        $arg=shift(@_);
     49        if (defined($arg) && $arg ne "1") {
     50        push(@args, $arg);
     51        }
     52        push(@args, "-description_tags"); # so HTML uses sections...
     53        push(@args, @_); # any other args to PDFPlug...
     54        @_=();
     55        last;
     56    } else {
     57        push(@args, $arg);
     58    }
     59    }
     60    my $self = new ConvertToPlug ($class, @args, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
     61    if ($use_sections) {
     62    $self->{'use_sections'}=1;
     63    }
    4264   
    4365    return bless $self, $class;
     66}
     67
     68
     # print_usage: write PDFPlug's command-line usage summary to STDERR.
     # Reads no arguments; the text lists the two options described below
     # (-convert_to and -use_sections).
     69sub print_usage {
     70    print STDERR "\n  usage: plugin PDFPlug [options]\n\n";
     71    print STDERR "  options:\n";
           # -convert_to: choice of output format; its description continues
           # on the following line, which states the default.
     72    print STDERR "   -convert_to (html|text) plugin converts to TEXT or HTML\n";
     73    print STDERR "                           (default html)\n";
           # -use_sections: one section per PDF page.
     74    print STDERR "   -use_sections     create a separate section for each page of the PDF file.\n\n";
    4475}
    4576
     
    6293    my $self = shift (@_);
    6394
     95    if ($self->{'use_sections'}
     96    && $self->{'converted_to'} eq "HTML") {
     97
     98    print STDERR "PDFPlug: Calculating sections...\n";
     99    my $textref=$_[0];
     100
     101    # This might be specific to the version of pdftohtml in <=  gsdl-2.38
     102    my @sections = split('<p><a name=\d+>', $$textref);
     103
     104    shift @sections; # don't need HTML header, etc
     105    # handle first section specially for title? Or all use first 100...
     106   
     107    my $title = $sections[0];
     108    $title =~ s/^\d+>//; # specific for pdftohtml...
     109    $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
     110    $title =~ s/<[^>]*>/ /g;
     111    $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
     112    $title =~ s/^\s+//s;
     113    $title =~ s/\s+$//;
     114    $title =~ s/\s+/ /gs;
     115    $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
     116    $title =~ s/^\s+//s; # in case title_sub introduced any...
     117    $title = substr ($title, 0, 100);
     118    $title =~ s/\s\S*$/.../;
     119
     120    my $top_section = "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
     121
     122    # add metadata per section...
     123    foreach my $section (@sections) {
     124        $section =~ m@^<b>Page (\d+)</b>@;
     125        $title = $1; # Greenstone does magic if sections are titled digits
     126        if (! defined($title) ) {
     127        print STDERR "no title: $section\n";
     128        }
     129        my $newsection = "<!-- from PDFPlug -->\n<!-- <Section>\n";
     130        $newsection .= "<Metadata name=\"Title\">" . $title
     131        . "</Metadata>\n--><p>\n";
     132        $newsection .= $section;
     133        $newsection .= "<!--</Section>-->\n";
     134        $section = $newsection;
     135    }
     136
     137    $$textref=join('', ($top_section, @sections));
     138    }
     139
    64140    my $outhandle = $self->{'outhandle'};
    65141    print $outhandle "PDFPlug: passing $_[3] on to $self->{'converted_to'}Plug\n"
Note: See TracChangeset for help on using the changeset viewer.