Changeset 10273
- Timestamp: 2005-07-25T11:23:27+12:00 (19 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/PDFPlug.pm
r10218 r10273 32 32 33 33 sub BEGIN { 34 @PDFPlug::ISA = ('ConvertToPlug');34 @PDFPlug::ISA = ('ConvertToPlug'); 35 35 } 36 36 … … 83 83 84 84 85 if ($self->{"use_sections"}) {86 $self->{"description_tags"} = 1;87 }85 #if ($self->{"use_sections"}) { 86 #$self->{"description_tags"} = 1; 87 #} 88 88 89 89 # these are passed through to gsConvert.pl by ConvertToPlug.pm … … 94 94 $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"}; 95 95 96 # pdftohtml will always produce html files encoded as utf-8 96 my $secondary_plugin_options = $self->{'secondary_plugin_options'}; 97 98 if (!defined $secondary_plugin_options->{'HTMLPlug'}) { 99 $secondary_plugin_options->{'HTMLPlug'} = []; 100 } 101 if (!defined $secondary_plugin_options->{'TEXTPlug'}) { 102 $secondary_plugin_options->{'TEXTPlug'} = []; 103 } 104 105 my $html_options = $secondary_plugin_options->{'HTMLPlug'}; 106 my $text_options = $secondary_plugin_options->{'TEXTPlug'}; 107 97 108 if ($self->{'input_encoding'} eq "auto") { 109 # pdftohtml will always produce html files encoded as utf-8 110 # => restrict primary PDFPlug and secondary HTML plugin to use 111 # utf8 and extract language. 112 98 113 $self->{'input_encoding'} = "utf8"; 99 114 $self->{'extract_language'} = 1; 100 } 101 102 return bless $self, $class; 115 116 push(@$html_options,"-input_encoding", "utf8"); 117 push(@$html_options,"-extract_language"); 118 } 119 120 # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj) 121 # to extract these metadata fields from the HEAD META fields 122 push(@$html_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>"); 123 124 if ($self->{'use_sections'}) { 125 $self->{'description_tags'} = 1; 126 push(@$html_options,"-description_tags"); 127 } 128 129 # following title_sub removes "Page 1" added by pdftohtml, and a leading 130 # "1", which is often the page number at the top of the page. 
Bad Luck 131 # if your document title actually starts with "1 " - is there a better way? 132 push(@$html_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 133 push(@$text_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 134 135 $self = bless $self, $class; 136 137 $self->load_secondary_plugins($class,$secondary_plugin_options); 138 139 return $self; 103 140 } 104 141 … … 114 151 } 115 152 116 117 # do plugin specific processing of doc_obj for HTML type 118 sub process { 153 sub convert_post_process 154 { 119 155 my $self = shift (@_); 120 #my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;156 my ($conv_filename) = @_; 121 157 122 158 my $outhandle=$self->{'outhandle'}; 123 159 124 my $textref=$_[0]; 160 my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename); 161 162 # read in file ($text will be in utf8) 163 my $text = ""; 164 $self->read_file ($conv_filename, $encoding, $language, \$text); 165 166 # Calculate number of pages based on <a ...> tags (we have a <a name=1> etc 167 # for each page). Metadata based on this calculation not set until process() 168 # 169 # Note: this is done even if we are not breaking to document into pages as it might 170 # be useful to give an indication of document length in browser through setting 171 # num_pages as metadata. 172 173 my @pages = ($text =~ /\<[Aa] name=\"?\w+\"?>/ig); 174 my $num_pages = scalar(@pages); 175 $self->{'num_pages'} = $num_pages; 125 176 126 177 if ($self->{'use_sections'} … … 131 182 # we have "<a name=1></a>" etc for each page 132 183 # it may be <A name= 133 my @sections = split('<[Aa] name=', $$textref); 184 my @sections = split('<[Aa] name=', $text); 185 186 my $top_section = ""; 134 187 135 188 if (scalar (@sections) == 1) { #only one section - no split! 
136 189 print $outhandle "PDFPlug: warning - no sections found\n"; 137 190 } else { 138 shift @sections; # don't need HTML header, etc191 $top_section .= shift @sections; # keep HTML header etc as top_section 139 192 } 140 193 … … 154 207 $title =~ s/\s\S*$/.../; 155 208 156 my $top_section; 209 157 210 if (scalar (@sections) == 1) { # no sections found 158 $top_section =$sections[0];211 $top_section .= $sections[0]; 159 212 @sections=(); 160 213 } else { 161 $top_section = "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";214 $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n"; 162 215 } 163 216 … … 180 233 } 181 234 182 $ $textref=join('', ($top_section, @sections));235 $text=join('', ($top_section, @sections)); 183 236 } 184 237 185 238 # turn any high bytes that aren't valid utf-8 into utf-8. 186 unicode::ensure_utf8($textref); 187 188 print $outhandle "PDFPlug: passing $_[3] on to $self->{'converted_to'}Plug\n" 189 if $self->{'verbosity'} > 1; 190 print STDERR "<Processing n='$_[3]' p='PDFPlug'>\n" if ($_[6]); 191 192 # tell htmlplug to extract these metadata fields from the HEAD META fields 193 $self->{'metadata_fields'} .= ",date,author<Creator>"; 194 195 my $result = ConvertToPlug::process_type($self,"pdf",@_); 196 197 #my $doc_obj = pop(@_); 198 my $doc_obj = $_[5]; 239 unicode::ensure_utf8(\$text); 240 241 # Write it out again! 
242 $self->utf8_write_file (\$text, $conv_filename); 243 } 244 245 246 # do plugin specific processing of doc_obj for HTML type 247 sub process { 248 my $self = shift (@_); 249 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_; 250 251 my $result = $self->process_type("pdf",$base_dir,$file,$doc_obj); 252 199 253 # fix up the extracted date metadata to be in Greenstone date format, 200 254 # and fix the capitalisation of 'date' … … 220 274 } 221 275 222 # Add NumPages metadata (we have "<a name=1>" etc for each page) 223 my @pages = ($$textref =~ /\<[Aa] name=\"?\w+\"?>/ig); 224 $doc_obj->add_utf8_metadata($cursection, "NumPages", scalar(@pages)); 276 $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'}); 277 225 278 226 279 if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") { … … 229 282 $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged"); 230 283 } 284 231 285 return $result; 232 286 }
Note: See TracChangeset for help on using the changeset viewer.