Changeset 10278
- Timestamp:
- 2005-07-25T14:14:57+12:00 (19 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/ConvertToPlug.pm
r10254 r10278 1 1 ########################################################################### 2 2 # 3 # ConvertToPlug.pm -- plugin that inherits from HTML or TEXT Plug, depending 4 # on plugin argument convert_to 3 # ConvertToPlug.pm -- plugin that inherits from BasPlug 5 4 # 6 5 # A component of the Greenstone digital library software … … 26 25 ########################################################################### 27 26 28 # The plugin is inherited by such plugins as WordPlug and PDFPlug. 29 # It facilitates the conversion of these document types to either HTML 30 # or TEXT by setting up variable that instruct ConvertToBasPlug 31 # how to work. 32 33 # It works by dynamically inheriting HTMLPlug or TEXTPlug based on 34 # the plugin argument 'convert_to'. If the argument is not present, 35 # the default is to inherit HTMLPlug. 27 # This plugin is inherited by such plugins as WordPlug, PPTPlug, PSPlug, RTFPlug 28 # and PDFPlug. It facilitates the conversion of these document types to either 29 # HTML, Text or auto (allow user to choose which format to convert to). 30 # It works by dynamically inheriting BasPlug and using the plugin type in 31 # secondary_plugins to decide which format to 'convert_to'. If the argument is 32 # not present, the default is to inherit auto. 
36 33 37 34 … … 39 36 40 37 use BasPlug; 38 use ghtml; 41 39 use HTMLPlug; 42 40 use TEXTPlug; 43 use ghtml;44 45 use strict;46 no strict 'refs'; # allow filehandles to be variables and viceversa41 use PagedImgPlug; 42 43 #use strict; 44 #no strict 'refs'; # allow filehandles to be variables and viceversa 47 45 48 46 sub BEGIN { 49 @ConvertToPlug::ISA = ('HTMLPlug'); 50 # @ISA = ('HTMLPlug', 'TEXTPlug'); 51 # @ISA = ('BasPlug'); #, 'HTMLPlug', 'TEXTPlug'); 47 @ISA = ('BasPlug'); 52 48 } 53 49 54 50 my $convert_to_list = 55 [ { 'name' => "html", 51 [ { 'name' => "auto", 52 'desc' => "{ConvertToPlug.convert_to.auto}" }, 53 { 'name' => "html", 56 54 'desc' => "{ConvertToPlug.convert_to.html}" }, 57 55 { 'name' => "text", 58 'desc' => "{ConvertToPlug.convert_to.text}" } ]; 56 'desc' => "{ConvertToPlug.convert_to.text}" }, 57 { 'name' => "pagedimg-jpg", 58 'desc' => "{ConvertToPlug.convert_to.pagedimg-jpg"}, 59 { 'name' => "pagedimg-gif", 60 'desc' => "{ConvertToPlug.convert_to.pagedimg-gif"}, 61 { 'name' => "pagedimg-png", 62 'desc' => "{ConvertToPlug.convert_to.pagedimg-png"}, 63 ]; 59 64 60 65 my $arguments = … … 95 100 if($inputargs->[$intCounter] eq "-convert_to") 96 101 { 97 if($inputargs->[$intCounter+1] eq "text" || $inputargs->[$intCounter+1] eq "html") 102 #if($inputargs->[$intCounter+1] eq "auto" || $inputargs->[$intCounter+1] =~ /pagedimg.*/i || $inputargs->[$intCounter+1] eq "text" || $inputargs->[$intCounter+1] eq "html") 103 # if the setting is "auto" then refer to html for now 104 if($inputargs->[$intCounter+1] =~ /pagedimg.*/i || $inputargs->[$intCounter+1] eq "text" || $inputargs->[$intCounter+1] eq "html") 98 105 { 99 106 return $inputargs->[$intCounter+1]; … … 103 110 } 104 111 return "html"; 112 } 113 114 sub load_secondary_plugins 115 { 116 my $self = shift (@_); 117 my ($class,$plugin_options) = @_; 118 119 my @convert_to_list = split(",",$self->{'convert_to'}); 120 121 $secondary_plugins = {}; 122 123 foreach my $convert_to (@convert_to_list) { 
124 # load in "convert_to" plugin package 125 my $plugin_class = $convert_to."Plug"; 126 my $plugin_package = $plugin_class.".pm"; 127 require $plugin_package; 128 129 # call its constructor with extra options that we've worked out! 130 my $arglist = $plugin_options->{$plugin_class}; 131 my $secondary_plugin = new $plugin_class([], \@$arglist); 132 $secondary_plugins->{$plugin_class} = $secondary_plugin; 133 } 134 $self->{'secondary_plugins'} = $secondary_plugins; 105 135 } 106 136 … … 127 157 $self->{'convert_to'} = "TEXT"; 128 158 $self->{'convert_to_ext'} = "txt"; 129 } 130 else 131 { 159 my $text_options = []; 160 push(@$text_options,"-metadata_fields","Title,GENERATOR"); 161 $secondary_plugin_options->{'TextPlug'} = $text_options; 162 } 163 elsif ($strConvertTo =~ /pagedimg.*/i){ 164 $self = (defined $hashArgOptLists)? new PagedImgPlug($pluginlist,$inputargs,$hashArgOptLists): new PagedImgPlug($pluginlist,$inputargs); 165 $self->{'convert_to'} = "PagedImg"; 166 #$self->{'convert_to'} = $strConvertTo; 167 my $convert_to_ext = $strConvertTo; 168 $convert_to_ext =~ s/.*\-(.*)/$1/i; 169 if ($convert_to_ext eq "gif"){ 170 $self->{'convert_to_ext'} = "gif"; 171 } elsif ($convert_to_ext eq "jpg"){ 172 $self->{'convert_to_ext'} = "jpg"; 173 } elsif ($convert_to_ext eq "png") { 174 $self->{'convert_to_ext'} = "png"; 175 } 176 my $pagedimg_options = []; 177 push(@$pagedimg_options,"-metadata_fields","Title,GENERATOR"); 178 $secondary_plugin_options->{'PagedImgPlug'} = $pagedimg_options; 179 } else { 180 # HTML or auto 132 181 $self = (defined $hashArgOptLists)? 
new HTMLPlug($pluginlist,$inputargs,$hashArgOptLists): new HTMLPlug($pluginlist,$inputargs); 133 182 $self->{'convert_to'} = "HTML"; 134 183 $self->{'convert_to_ext'} = "html"; 135 136 $self->{'rename_assoc_files'} = 1; 137 $self->{'metadata_fields'} .= ",GENERATOR"; 184 my $html_options = []; 185 push(@$html_options,"-rename_assoc_files","1"); 186 push(@$html_options,"-metadata_fields","Title,GENERATOR"); 187 $secondary_plugin_options->{'HTMLPlug'} = $html_options; 138 188 } 139 189 … … 141 191 } 142 192 143 # we don't need to block anything, so override the one for HTMLPlug 144 # files are converted in a temp dir and extra files not passed down the 145 # plugin list 146 sub get_default_block_exp { 147 my $self = shift (@_); 148 149 return ""; 150 } 151 152 # Go straight to BasPlug and avoid the special case implemented by HTMLPlug 153 sub store_block_files { 154 return BasPlug::store_block_files(@_); 155 } 193 194 sub init { 195 my $self = shift (@_); 196 my ($verbosity, $outhandle, $failhandle) = @_; 197 198 $self->SUPER::init($verbosity,$outhandle,$failhandle); 199 200 my $secondary_plugins = $self->{'secondary_plugins'}; 201 202 foreach my $plug_name (keys %$secondary_plugins) { 203 my $plugin = $secondary_plugins->{$plug_name}; 204 $plugin->init($verbosity,$outhandle,$failhandle); 205 } 206 } 207 208 sub deinit { 209 # called only once, after all plugin passes have been done 210 211 my ($self) = @_; 212 213 my $secondary_plugins = $self->{'secondary_plugins'}; 214 215 foreach my $plug_name (keys %$secondary_plugins) { 216 my $plugin = $secondary_plugins->{$plug_name}; 217 $plugin->deinit(); 218 } 219 } 220 221 222 sub convert_post_process 223 { 224 # by default do no post processing 225 return; 226 } 227 156 228 157 229 # Run conversion utility on the input file. 
… … 171 243 my $convert_to = $self->{'convert_to'}; 172 244 my $failhandle = $self->{'failhandle'}; 245 my $convert_to_ext = $self->{'convert_to_ext'}; 173 246 174 247 # softlink to collection tmp dir … … 198 271 # Execute the conversion command and get the type of the result, 199 272 # making sure the converter gives us the appropriate output type 200 my $output_type = lc($convert_to); 273 my $output_type=""; 274 if ($convert_to =~ m/PagedImg/i) { 275 $output_type = lc($convert_to)."-".lc($convert_to_ext); 276 } else { 277 $output_type = lc($convert_to); 278 } 279 201 280 my $cmd = "perl -S gsConvert.pl -verbose $verbosity "; 202 281 if (defined $self->{'convert_options'}) { … … 211 290 212 291 # remove symbolic link to original file 213 &util::rm($tmp_filename);292 #&util::rm($tmp_filename); 214 293 215 294 # Check STDERR here … … 233 312 # store the *actual* output type and return the output filename 234 313 # it's possible we requested conversion to html, but only to text succeeded 235 236 314 $self->{'convert_to_ext'} = $output_type; 237 315 if ($output_type =~ /html/i) { … … 239 317 } elsif ($output_type =~ /te?xt/i) { 240 318 $self->{'converted_to'} = "TEXT"; 241 } 319 } elsif ($output_type =~ /item/i){ 320 $self->{'converted_to'} = "PagedImg"; 321 } 322 242 323 my $output_filename = $tmp_filename; 243 244 $output_filename =~ s/$suffix$/.$output_type/; 245 324 if ($output_type =~ /item/i) { 325 $output_filename = $tmp_dirname . "\\$tailname\\" . $tailname . ".$output_type"; 326 } else { 327 $output_filename =~ s/$suffix$/.$output_type/; 328 } 246 329 return $output_filename; 247 330 } … … 249 332 250 333 # Remove collection specific tmp directory and all its contents. 
251 252 334 sub cleanup_tmp_area { 253 335 my $self = shift (@_); … … 255 337 my $tmp_dirname 256 338 = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp"); 257 &util::rm_r($tmp_dirname);339 #&util::rm_r($tmp_dirname); 258 340 &util::mk_dir($tmp_dirname); 259 341 } 260 261 262 263 342 264 343 # Override BasPlug read 265 344 # We don't want to get language encoding stuff until after we've converted 266 # our file to either TEXT or HTML .345 # our file to either TEXT or HTML or PagedImage. 267 346 sub read { 268 347 my $self = shift (@_); … … 273 352 274 353 my $outhandle = $self->{'outhandle'}; 275 276 my $filename = $file; 277 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; 278 279 if ($self->associate_with($file,$filename,$metadata)) { 280 # a form of smart block 281 $self->{'num_blocked'} ++; 282 return 0; # blocked 283 } 284 285 if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) { 286 $self->{'num_blocked'} ++; 287 return 0; 288 } 289 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) { 290 return undef; 291 } 292 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up 293 294 # read in file ($text will be in utf8) 295 my $text = ""; 354 355 my ($block_status,$filename) = $self->read_block(@_); 356 return $block_status if ((!defined $block_status) || ($block_status==0)); 357 $file = $self->read_tidy_file($file); 296 358 297 359 my $output_ext = $self->{'convert_to_ext'}; 298 299 360 my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename); 300 301 361 if ("$conv_filename" eq "") {return 0;} # allows continue on errors 302 362 if (! 
-e "$conv_filename") {return 0;} # allows continue on errors 303 363 $self->{'conv_filename'} = $conv_filename; 304 305 # Do encoding stuff 306 my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename); 307 308 &BasPlug::read_file($self, $conv_filename, $encoding, $language, \$text); 309 if (!length ($text)) { 310 my $plugin_name = ref ($self); 311 print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'}; 312 return 0; 313 } 314 315 # if we converted to HTML, convert é and etc to utf-8. 316 # this should really happen before language_extraction, but that means 317 # modifying a file on disk... 318 $text =~ s/&([^;]+);/&ghtml::getcharequiv($1,0)/ge; 319 320 # create a new document 321 #my $doc_obj = new doc ($conv_filename, "indexed_doc"); 322 # now we use the original filename here 323 my $doc_obj = new doc($filename, "indexed_doc"); 324 $doc_obj->set_converted_filename($conv_filename); 325 $doc_obj->set_OIDtype ($processor->{'OIDtype'}); 326 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language); 327 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding); 364 365 $self->convert_post_process($conv_filename); 366 367 my $secondary_plugins = $self->{'secondary_plugins'}; 368 my $num_secondary_plugins = scalar(keys %$secondary_plugins); 369 if ($num_secondary_plugins == 0) { 370 print $outhandle "Warning: No secondary plugin to use in conversion. Skipping $file\n"; 371 return 0; # effectively block it 372 } 373 374 my @plugin_names = keys %$secondary_plugins; 375 my $plugin_name = shift @plugin_names; 376 377 if ($num_secondary_plugins > 1) { 378 print $outhandle "Warning: Multiple secondary plugins not supported yet! 
Choosing $plugin_name\n."; 379 } 380 381 my $secondary_plugin = $secondary_plugins->{$plugin_name}; 382 383 # note: metadata is not carried on to the next level 384 my ($rv,$doc_obj) 385 = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename, 386 $metadata, $processor, $maxdocs, $total_count, 387 $gli); 388 389 if ((!defined $rv) || ($rv<1)) { 390 # wasn't processed 391 return $rv; 392 } 393 394 # Override previous gsdlsourcefilename set by secondary plugin 395 my $collect_file = &util::filename_within_collection($filename); 396 my $collect_conv_file = &util::filename_within_collection($conv_filename); 397 $doc_obj->set_source_filename ($collect_file); 398 $doc_obj->set_converted_filename($collect_conv_file); 399 328 400 my ($filemeta) = $file =~ /([^\\\/]+)$/; 329 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta)); 330 if ($self->{'cover_image'}) { 331 $self->associate_cover_image($doc_obj, $filename); 332 } 333 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); 334 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename)); 335 336 # include any metadata passed in from previous plugins 337 # note that this metadata is associated with the top level section 338 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata); 401 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta)); 402 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); 403 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename)); 404 339 405 # do plugin specific processing of doc_obj 340 unless (defined ($self->process( \$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {406 unless (defined ($self->process(undef, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) { 341 407 print STDERR "<ProcessingError 
n='$file'>\n" if ($gli); 342 408 return -1; … … 348 414 # process the document 349 415 $processor->process($doc_obj); 350 $self->cleanup_tmp_area();416 ## $self->cleanup_tmp_area(); 351 417 352 418 $self->{'num_processed'} ++; … … 356 422 357 423 358 # do plugin specific processing of doc_obj for HTMLtype424 # do plugin specific processing of doc_obj for doc_ext type 359 425 sub process_type { 360 426 my $self = shift (@_); 361 my ($doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_; 362 363 my $conv_filename = $self->{'conv_filename'}; 364 my $tmp_dirname = File::Basename::dirname($conv_filename); 365 my $tmp_tailname = File::Basename::basename($conv_filename); 366 367 my $converted_to = $self->{'converted_to'}; 368 my $ret_val; 369 370 if ($converted_to eq "TEXT") 371 { 372 373 $ret_val = &TEXTPlug::process($self, $textref, $pluginfo, 374 $tmp_dirname, $tmp_tailname, 375 $metadata, $doc_obj); 376 } 377 else 378 { 379 $ret_val = &HTMLPlug::process($self, $textref, $pluginfo, 380 $tmp_dirname, $tmp_tailname, 381 $metadata, $doc_obj); 382 } 383 427 my ($doc_ext, $base_dir, $file, $doc_obj) = @_; 428 384 429 # associate original file with doc object 385 430 my $cursection = $doc_obj->get_top_section(); … … 413 458 $doc_obj->add_utf8_metadata ($cursection, "/srclink", "</a>"); 414 459 415 return $ret_val;460 return 1; 416 461 } 417 462 418 463 1; 464 465 466 467 468 469 470
Note:
See TracChangeset
for help on using the changeset viewer.