Changeset 3411 for trunk/gsdl/perllib
- Timestamp:
- 2002-08-29T16:37:54+12:00 (22 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/PDFPlug.pm
r2979 r3411 39 39 # if your document title actually starts with "1 " - is there a better way? 40 40 41 my $self = new ConvertToPlug ($class, @_, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 41 my @args=(); 42 my $use_sections=0; 43 while ($#_ > -1) { 44 my $arg=shift(@_); 45 if ($arg =~ /^\-use_sections/) { 46 $use_sections=1; 47 # in case someone put "-use_sections 1" 48 $arg=shift(@_); 49 if (defined($arg) && $arg ne "1") { 50 push(@args, $arg); 51 } 52 push(@args, "-description_tags"); # so HTML uses sections... 53 push(@args, @_); # any other args to PDFPlug... 54 @_=(); 55 last; 56 } else { 57 push(@args, $arg); 58 } 59 } 60 my $self = new ConvertToPlug ($class, @args, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 61 if ($use_sections) { 62 $self->{'use_sections'}=1; 63 } 42 64 43 65 return bless $self, $class; 66 } 67 68 69 sub print_usage { 70 print STDERR "\n usage: plugin PDFPlug [options]\n\n"; 71 print STDERR " options:\n"; 72 print STDERR " -convert_to (html|text) plugin converts to TEXT or HTML\n"; 73 print STDERR " (default html)\n"; 74 print STDERR " -use_sections create a separate section for each page of the PDF file.\n\n"; 44 75 } 45 76 … … 62 93 my $self = shift (@_); 63 94 95 if ($self->{'use_sections'} 96 && $self->{'converted_to'} eq "HTML") { 97 98 print STDERR "PDFPlug: Calculating sections...\n"; 99 my $textref=$_[0]; 100 101 # This might be specific to the version of pdftohtml in <= gsdl-2.38 102 my @sections = split('<p><a name=\d+>', $$textref); 103 104 shift @sections; # don't need HTML header, etc 105 # handle first section specially for title? Or all use first 100... 106 107 my $title = $sections[0]; 108 $title =~ s/^\d+>//; # specific for pdftohtml... 109 $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space 110 $title =~ s/<[^>]*>/ /g; 111 $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
112 $title =~ s/^\s+//s; 113 $title =~ s/\s+$//; 114 $title =~ s/\s+/ /gs; 115 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'}); 116 $title =~ s/^\s+//s; # in case title_sub introduced any... 117 $title = substr ($title, 0, 100); 118 $title =~ s/\s\S*$/.../; 119 120 my $top_section = "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n"; 121 122 # add metadata per section... 123 foreach my $section (@sections) { 124 $section =~ m@^<b>Page (\d+)</b>@; 125 $title = $1; # Greenstone does magic if sections are titled digits 126 if (! defined($title) ) { 127 print STDERR "no title: $section\n"; 128 } 129 my $newsection = "<!-- from PDFPlug -->\n<!-- <Section>\n"; 130 $newsection .= "<Metadata name=\"Title\">" . $title 131 . "</Metadata>\n--><p>\n"; 132 $newsection .= $section; 133 $newsection .= "<!--</Section>-->\n"; 134 $section = $newsection; 135 } 136 137 $$textref=join('', ($top_section, @sections)); 138 } 139 64 140 my $outhandle = $self->{'outhandle'}; 65 141 print $outhandle "PDFPlug: passing $_[3] on to $self->{'converted_to'}Plug\n"
Note:
See TracChangeset
for help on using the changeset viewer.