Changeset 15871
- Timestamp:
- 2008-06-05T09:26:56+12:00 (16 years ago)
- Location:
- gsdl/trunk/perllib/plugins
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/plugins/ConvertBinaryFile.pm
r15865 r15871 1 1 ########################################################################### 2 2 # 3 # Convert ToPlug.pm -- plugin that inherits from BasPlug3 # ConvertBinaryFile.pm -- plugin that inherits from BasPlug 4 4 # 5 5 # A component of the Greenstone digital library software … … 27 27 # This plugin is inherited by such plugins as WordPlug, PPTPlug, PSPlug, 28 28 # RTFPlug and PDFPlug. It facilitates the conversion of these document types 29 # to either HTML, T EXTor a series of images. It works by dynamically loading29 # to either HTML, Text or a series of images. It works by dynamically loading 30 30 # an appropriate secondary plugin (HTMLPlug, StructuredHTMLPlug, 31 # PagedIm gPlug or TEXTPlug) based on the plugin argument 'convert_to'.32 33 package Convert ToPlug;34 35 use Bas Plug;31 # PagedImagePlugin or TextPlugin) based on the plugin argument 'convert_to'. 32 33 package ConvertBinaryFile; 34 35 use BasePlugin; 36 36 use ghtml; 37 use HTMLPlug ;38 use T EXTPlug;39 use PagedIm gPlug;37 use HTMLPlugin; 38 use TextPlugin; 39 use PagedImagePlugin; 40 40 41 41 use strict; 42 42 no strict 'refs'; # allow filehandles to be variables and viceversa 43 43 no strict 'subs'; 44 44 45 sub BEGIN { 45 @Convert ToPlug::ISA = ('BasPlug');46 @ConvertBinaryFile::ISA = ('BasePlugin'); 46 47 } 47 48 48 49 my $convert_to_list = 49 50 [ { 'name' => "auto", 50 'desc' => "{Convert ToPlug.convert_to.auto}" },51 'desc' => "{ConvertBinaryFile.convert_to.auto}" }, 51 52 { 'name' => "html", 52 'desc' => "{Convert ToPlug.convert_to.html}" },53 'desc' => "{ConvertBinaryFile.convert_to.html}" }, 53 54 { 'name' => "text", 54 'desc' => "{Convert ToPlug.convert_to.text}" }55 'desc' => "{ConvertBinaryFile.convert_to.text}" } 55 56 ]; 56 57 57 58 my $arguments = 58 59 [ { 'name' => "convert_to", 59 'desc' => "{Convert ToPlug.convert_to}",60 'desc' => "{ConvertBinaryFile.convert_to}", 60 61 'type' => "enum", 61 62 'reqd' => "yes", … … 63 64 'deft' => "auto" }, 64 65 { 'name' => "keep_original_filename", 65 'desc' => "{Convert ToPlug.keep_original_filename}",66 'desc' => "{ConvertBinaryFile.keep_original_filename}", 66 67 'type' => "flag" }, 67 68 { 'name' => "title_sub", … … 71 72 'deft' => "" }, 72 73 { 'name' => "apply_fribidi", 73 'desc' => "{Convert ToPlug.apply_fribidi}",74 'desc' => "{ConvertBinaryFile.apply_fribidi}", 74 75 'type' => "flag", 75 76 'reqd' => "no" }, 76 77 { 'name' => "use_strings", 77 'desc' => "{Convert ToPlug.use_strings}",78 'desc' => "{ConvertBinaryFile.use_strings}", 78 79 'type' => "flag", 79 80 'reqd' => "no" }, 80 { 'name' => "extract_keyphrases", 81 'desc' => "{BasPlug.extract_keyphrases}", 82 'type' => "flag", 83 'reqd' => "no", 84 'hiddengli' => "yes" }, 85 { 'name' => "extract_keyphrase_options", 86 'desc' => "{BasPlug.extract_keyphrase_options}", 87 'type' => "string", 88 'reqd' => "no", 89 'hiddengli' => "yes" } ]; 90 91 my $options = { 'name' => "ConvertToPlug", 92 'desc' => "{ConvertToPlug.desc}", 81 # { 'name' => "extract_keyphrases", 82 # 'desc' => "{BasPlug.extract_keyphrases}", 83 # 'type' => "flag", 84 # 'reqd' => "no", 85 # 'hiddengli' => "yes" }, 86 # { 'name' => "extract_keyphrase_options", 87 # 'desc' => "{BasPlug.extract_keyphrase_options}", 88 # 'type' => "string", 89 # 'reqd' => "no", 90 # 'hiddengli' => "yes" } 91 ]; 92 93 my $options = { 'name' => "ConvertBinaryFile", 94 'desc' => "{ConvertBinaryFile.desc}", 93 95 'abstract' => "yes", 94 96 'inherits' => "yes", … … 107 109 foreach my $convert_to (@convert_to_list) { 108 110 # load in "convert_to" plugin package 109 my $plugin_class = $convert_to."Plug ";111 my $plugin_class = $convert_to."Plugin"; 110 112 my $plugin_package = $plugin_class.".pm"; 111 113 … … 145 147 push(@$pluginlist, $class); 146 148 my $classPluginName = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class; 147 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}148 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};149 150 my $self = new Bas Plug($pluginlist, $inputargs, $hashArgOptLists);149 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments}); 150 push(@{$hashArgOptLists->{"OptList"}},$options); 151 152 my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists); 151 153 152 154 if ($self->{'info_only'}) { … … 161 163 my $windows_scripting = $self->{'windows_scripting'}; 162 164 $windows_scripting = 0 unless defined $windows_scripting; 163 if ($classPluginName eq "PDFPlug ") {165 if ($classPluginName eq "PDFPlugin") { 164 166 if ($convert_to_type eq "text" && 165 167 $ENV{'GSDLOS'} =~ /^windows$/i) { … … 167 169 $convert_to_type = "html"; 168 170 } 169 } elsif ($classPluginName eq "WordPlug ") {171 } elsif ($classPluginName eq "WordPlugin") { 170 172 if ($windows_scripting && $ENV{'GSDLOS'} =~ /^windows$/i && $convert_to_type =~ /^(html|auto)$/) { 171 173 # we use structured HTML, not normal html 172 174 $convert_to_type = "structuredhtml"; 173 175 } 174 } elsif ($classPluginName eq "PPTPlug ") {176 } elsif ($classPluginName eq "PPTPlugin") { 175 177 if ($windows_scripting && $ENV{'GSDLOS'} =~ /^windows$/i && $convert_to_type eq "auto") { 176 178 # we use paged img 177 179 $convert_to_type = "pagedimg_jpg"; 178 180 } 179 } elsif ($classPluginName eq "PSPlug ") {181 } elsif ($classPluginName eq "PSPlugin") { 180 182 if ($convert_to_type eq "auto") { 181 183 # we use text … … 193 195 $self->{'convert_to_ext'} = "html"; 194 196 } elsif ($convert_to_type eq "text") { 195 $self->{'convert_to'} = "T EXT";197 $self->{'convert_to'} = "Text"; 196 198 $self->{'convert_to_ext'} = "txt"; 197 199 } elsif ($convert_to_type eq "structuredhtml") { … … 199 201 $self->{'convert_to_ext'} = "html"; 200 202 } elsif ($convert_to_type =~ /^pagedimg/) { 201 $self->{'convert_to'} = "PagedIm g";203 $self->{'convert_to'} = "PagedImage"; 202 204 my ($convert_to_ext) = $convert_to_type =~ /pagedimg\_(jpg|gif|png)/i; 203 205 $convert_to_ext = 'jpg' unless defined $convert_to_ext; … … 305 307 # making sure the converter gives us the appropriate output type 306 308 my $output_type=""; 307 if ($convert_to =~ m/PagedIm g/i) {309 if ($convert_to =~ m/PagedImage/i) { 308 310 $output_type = lc($convert_to)."_".lc($convert_to_ext); 309 311 } else { … … 349 351 $self->{'converted_to'} = "HTML"; 350 352 } elsif ($output_type =~ /te?xt/i) { 351 $self->{'converted_to'} = "T EXT";353 $self->{'converted_to'} = "Text"; 352 354 } elsif ($output_type =~ /item/i){ 353 $self->{'converted_to'} = "PagedIm g";355 $self->{'converted_to'} = "PagedImage"; 354 356 } 355 357 … … 370 372 371 373 372 # Override BasPlug read 373 # We don't want to get language encoding stuff until after we've converted 374 # our file to either TEXT or HTML or PagedImage. 375 sub read { 374 # Override BasPlug read_into_doc_obj - we need to call secondary plugin stuff 375 sub read_into_doc_obj { 376 376 my $self = shift (@_); 377 377 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 378 378 379 379 my $outhandle = $self->{'outhandle'}; 380 381 my ($block_status,$filename) = $self->read_block(@_); 382 return $block_status if ((!defined $block_status) || ($block_status==0)); 383 $file = $self->read_tidy_file($file); 384 380 381 my ($filename_full_path, $filename_no_path) = $self->get_full_filenames($base_dir, $file); 382 385 383 my $output_ext = $self->{'convert_to_ext'}; 386 384 my $conv_filename = ""; 387 $conv_filename = $self->tmp_area_convert_file($output_ext, $filename );385 $conv_filename = $self->tmp_area_convert_file($output_ext, $filename_full_path); 388 386 389 387 if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline … … 394 392 # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file 395 393 # Added for fixing up Persian PDFs after being processed by pdftohtml, but may be useful in other cases too 396 if ($self->{'apply_fribidi'} && $self->{'converted_to'} =~ /(HTML|T EXT)/) {394 if ($self->{'apply_fribidi'} && $self->{'converted_to'} =~ /(HTML|Text)/) { 397 395 my $fribidi_command = "fribidi \"$conv_filename\" >\"${conv_filename}.tmp\""; 398 396 if (system($fribidi_command) != 0) { … … 423 421 # note: metadata is not carried on to the next level 424 422 my ($rv,$doc_obj) 425 = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename, 426 $metadata, $processor, $maxdocs, $total_count, 427 $gli); 423 = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename, $metadata, $processor, $maxdocs, $total_count, $gli); 428 424 429 425 if ((!defined $rv) || ($rv<1)) { … … 433 429 434 430 # Override previous gsdlsourcefilename set by secondary plugin 435 my $collect_file = &util::filename_within_collection($filename );431 my $collect_file = &util::filename_within_collection($filename_full_path); 436 432 my $collect_conv_file = &util::filename_within_collection($conv_filename); 437 433 $doc_obj->set_source_filename ($collect_file); 438 434 $doc_obj->set_converted_filename($collect_conv_file); 439 435 440 my ($filemeta) = $file =~ /([^\\\/]+)$/;441 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));436 $self->set_Source_metadata($doc_obj, $filename_no_path); 437 442 438 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); 443 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename)); 444 445 if ($self->{'cover_image'}) { 446 $self->associate_cover_image($doc_obj, $filename); 447 } 439 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path)); 448 440 449 441 # do plugin specific processing of doc_obj 450 unless (defined ($self->process( undef,$pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {442 unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) { 451 443 print STDERR "<ProcessingError n='$file'>\n" if ($gli); 452 444 return -1; 453 445 } 446 447 my $topsection = $doc_obj->get_top_section(); 448 $self->add_associated_files($doc_obj, $filename_full_path); 449 $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here?? 454 450 # do any automatic metadata extraction 455 451 $self->auto_extract_metadata ($doc_obj); 456 452 457 453 # have we found a Title?? 458 $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$filemeta); 459 460 # # add an OID 461 # $doc_obj->set_OID(); 462 463 # add an OID 464 # see if there is a plugin-specific set_OID function... 465 if (defined ($self->can('set_OID'))) { 466 # it will need $doc_obj to set the Identifier metadata... 467 $self->set_OID($doc_obj); 468 } else { 469 # use the default set_OID() in doc.pm 470 $doc_obj->set_OID(); 471 } 472 473 474 # process the document 475 $processor->process($doc_obj); 476 477 $self->{'num_processed'} ++; 478 479 return 1; 480 } 481 454 $self->title_fallback($doc_obj,$topsection,$filename_no_path); 455 456 $self->add_OID($doc_obj); 457 458 return (1, $doc_obj); 459 460 } 461 462 sub process { 463 my $self = shift (@_); 464 my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_; 465 466 return $self->process_type($base_dir, $file, $doc_obj); 467 } 482 468 483 469 # do plugin specific processing of doc_obj for doc_ext type 484 470 sub process_type { 485 471 my $self = shift (@_); 486 my ($doc_ext, $base_dir, $file, $doc_obj) = @_; 472 my ($base_dir, $file, $doc_obj) = @_; 473 474 # need to check that not empty 475 my $doc_ext = $self->{'filename_extension'}; 476 my $file_type = "unknown"; 477 $file_type = $self->{'file_type'} if defined $self->{'file_type'}; 487 478 488 479 # associate original file with doc object … … 496 487 $doc_obj->associate_file($filename, $assocfilename, undef, $cursection); 497 488 498 my $file_type;499 500 if ($doc_ext eq "doc") {501 $file_type = "Word";502 } elsif ($doc_ext eq "xls") {503 $file_type = "Excel";504 } elsif ($doc_ext eq "ppt") {505 $file_type = "PPT";506 } elsif ($doc_ext eq "pdf") {507 $file_type = "PDF";508 } elsif ($doc_ext eq "rtf") {509 $file_type = "RTF";510 } elsif ($doc_ext eq "ps") {511 $file_type = "PS";512 }513 514 my $file_format = $file_type || "unknown";515 516 489 # We use set instead of add here because we only want one value 517 $doc_obj->set_utf8_metadata_element($cursection, "FileFormat", $file_ format);490 $doc_obj->set_utf8_metadata_element($cursection, "FileFormat", $file_type); 518 491 my $doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/[archivedir]/doc.$doc_ext\">"; 519 492 if ($self->{'keep_original_filename'} == 1) { -
gsdl/trunk/perllib/plugins/ReadXMLFile.pm
r15865 r15871 1 1 ########################################################################### 2 2 # 3 # XMLPlug.pm -- base class for XML plugins3 # ReadXMLFile.pm -- base class for XML plugins 4 4 # A component of the Greenstone digital library software 5 5 # from the New Zealand Digital Library Project at the … … 24 24 ########################################################################### 25 25 26 package XMLPlug;27 28 use Bas Plug;26 package ReadXMLFile; 27 28 use BasePlugin; 29 29 use doc; 30 30 use strict; … … 32 32 33 33 sub BEGIN { 34 @ XMLPlug::ISA = ('BasPlug');34 @ReadXMLFile::ISA = ('BasePlugin'); 35 35 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan"); 36 36 } … … 40 40 my $arguments = 41 41 [ { 'name' => "process_exp", 42 'desc' => "{Bas Plug.process_exp}",42 'desc' => "{BasePlugin.process_exp}", 43 43 'type' => "regexp", 44 44 'deft' => &get_default_process_exp(), 45 45 'reqd' => "no" }, 46 46 { 'name' => "xslt", 47 'desc' => "{ XMLPlug.xslt}",47 'desc' => "{ReadXMLFile.xslt}", 48 48 'type' => "string", 49 49 'deft' => "", 50 50 'reqd' => "no" } ]; 51 51 52 my $options = { 'name' => " XMLPlug",53 'desc' => "{ XMLPlug.desc}",52 my $options = { 'name' => "ReadXMLFile", 53 'desc' => "{ReadXMLFile.desc}", 54 54 'abstract' => "yes", 55 55 'inherits' => "yes", … … 61 61 push(@$pluginlist, $class); 62 62 63 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});} 64 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)}; 65 66 # $self is global for use within subroutines called by XML::Parser 67 my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists); 63 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments}); 64 push(@{$hashArgOptLists->{"OptList"}},$options); 65 66 my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists); 68 67 69 68 if ($self->{'info_only'}) { 70 # don't worry about any options etc 69 # don't worry about creating the XML parser as all we want is the 70 # list of plugin options 71 71 return bless $self, $class; 72 72 } 73 73 74 74 my $parser = new XML::Parser('Style' => 'Stream', 75 'Pkg' => ' XMLPlug',75 'Pkg' => 'ReadXMLFile', 76 76 'PluginObj' => $self, 77 77 'Handlers' => {'Char' => \&Char, … … 198 198 if (defined $result) { 199 199 # we think we are processing this, but check that we actually are 200 my $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;200 my $filename = $self->get_full_filename($base_dir, $file); 201 201 202 202 if ($self->check_doctype($filename)) { … … 207 207 } 208 208 209 # we need to implement read cos we are not just using process_exp to determine 210 # whether to process this or not. 209 211 sub read { 210 212 my $self = shift (@_); … … 213 215 214 216 # Make sure we're processing the correct file, do blocking etc 215 my ($block_status,$filename ) = $self->read_block(@_);217 my ($block_status,$filename_full_path) = $self->read_block(@_); 216 218 return $block_status if ((!defined $block_status) || ($block_status==0)); 217 219 218 220 ## check the doctype to see whether we really want to process the file 219 if (!$self->check_doctype($filename )) {221 if (!$self->check_doctype($filename_full_path)) { 220 222 # this file is not for us 221 223 return undef; … … 225 227 $self->{'base_dir'} = $base_dir; 226 228 $self->{'file'} = $file; 227 $self->{'filename'} = $filename ;229 $self->{'filename'} = $filename_full_path; 228 230 $self->{'processor'} = $processor; 229 231 $self->{'metadata'} = $metadata; … … 233 235 if (defined $xslt && ($xslt ne "")) { 234 236 # perform xslt 235 my $transformed_xml = $self->apply_xslt($xslt,$filename );237 my $transformed_xml = $self->apply_xslt($xslt,$filename_full_path); 236 238 237 239 # feed transformed file (now in memory as string) into XML parser … … 239 241 } 240 242 else { 241 $self->{'parser'}->parsefile($filename );243 $self->{'parser'}->parsefile($filename_full_path); 242 244 } 243 245 }; … … 246 248 247 249 # parsefile may either croak somewhere in XML::Parser (e.g. because 248 # the document is not well formed) or die somewhere in XMLPlugor a250 # the document is not well formed) or die somewhere in ReadXMLFile or a 249 251 # derived plugin (e.g. because we're attempting to process a 250 252 # document whose DOCTYPE is not meant for this plugin). For the … … 271 273 } 272 274 273 # the following two methods are for if you want to do the parsing from a274 # plugin that inherits from this. it seems that you can't call the parse275 # methods directly. WHY???276 #277 # [Stefan 27/5/07] These two methods may not be necessary any more as I've278 # fixed XMLPlug so $self is no longer required to be a global variable279 # (that was why inheritance wasn't working quite right with XMLPlug I280 # think). I don't really know what other plugins rely on these methods281 # though so have left them here for now.282 sub parse_file {283 my $self = shift (@_);284 my ($filename) = @_;285 $self->{'parser'}->parsefile($filename);286 }287 288 sub parse_string {289 my $self = shift (@_);290 my ($xml_string) = @_;291 $self->{'parser'}->parse($xml_string);292 }293 275 294 276 sub get_default_process_exp { … … 344 326 345 327 my ($expat, $name, $sysid, $pubid, $internal) = @_; 346 die " XMLPlugCannot process XML document with DOCTYPE of $name";328 die "ReadXMLFile Cannot process XML document with DOCTYPE of $name"; 347 329 } 348 330 … … 395 377 $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc"); 396 378 $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'}, $self->{'processor'}->{'OIDmetadata'}); 379 $self->{'doc_obj'}->add_utf8_metadata($self->{'doc_obj'}->get_top_section(), "Plugin", "$self->{'plugin_type'}"); 380 381 # do we want other auto metadata here (see BasePlugin.read_into_doc_obj) 397 382 } 398 383 … … 400 385 my $self = shift(@_); 401 386 my $doc_obj = $self->{'doc_obj'}; 387 388 # do we want other auto stuff here, see BasePlugin.read_into_doc_obj 389 402 390 # include any metadata passed in from previous plugins 403 391 # note that this metadata is associated with the top level section … … 410 398 411 399 # add an OID 412 $ doc_obj->set_OID();400 $self->add_OID(); 413 401 414 402 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); … … 419 407 420 408 $self->{'num_processed'} ++; 409 undef $self->{'doc_obj'}; 410 undef $doc_obj; # is this the same as above?? 421 411 } 422 412 -
gsdl/trunk/perllib/plugins/SplitTextFile.pm
r15865 r15871 1 1 ########################################################################### 2 2 # 3 # Split Plug.pm - a plugin for splitting input files into segments that3 # SplitTextFile.pm - a plugin for splitting input files into segments that 4 4 # will then be individually processed. 5 5 # … … 29 29 30 30 31 # Split Plugis a plugin for splitting input files into segments that will31 # SplitTextFile is a plugin for splitting input files into segments that will 32 32 # then be individually processed. 33 33 … … 35 35 # process input files that contain several documents, you should write a 36 36 # plugin with a process function that will handle one of those documents 37 # and have it inherit from Split Plug. See ReferPlug for an example.38 39 40 package Split Plug;41 42 use BasPlug;37 # and have it inherit from SplitTextFile. See ReferPlug for an example. 38 39 40 package SplitTextFile; 41 42 use ReadTextFile; 43 43 use gsprintf 'gsprintf'; 44 44 use util; … … 47 47 no strict 'refs'; # allow filehandles to be variables and viceversa 48 48 49 # Split Plugis a sub-class of BasPlug.49 # SplitTextFile is a sub-class of BasPlug. 50 50 sub BEGIN { 51 @Split Plug::ISA = ('BasPlug');51 @SplitTextFile::ISA = ('ReadTextFile'); 52 52 } 53 53 … … 55 55 my $arguments = 56 56 [ { 'name' => "split_exp", 57 'desc' => "{Split Plug.split_exp}",57 'desc' => "{SplitTextFile.split_exp}", 58 58 'type' => "regexp", 59 59 #'deft' => &get_default_split_exp(), … … 61 61 'reqd' => "no" } ]; 62 62 63 my $options = { 'name' => "Split Plug",64 'desc' => "{Split Plug.desc}",63 my $options = { 'name' => "SplitTextFile", 64 'desc' => "{SplitTextFile.desc}", 65 65 'abstract' => "yes", 66 66 'inherits' => "yes", … … 73 73 push(@$pluginlist, $class); 74 74 75 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}76 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};77 78 my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);75 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments}); 76 push(@{$hashArgOptLists->{"OptList"}},$options); 77 78 my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists); 79 79 80 80 $self->{'textcat_store'} = {}; … … 87 87 my ($verbosity, $outhandle, $failhandle) = @_; 88 88 89 $self->BasPlug::init($verbosity, $outhandle, $failhandle); 90 89 $self->ReadTextFile::init($verbosity, $outhandle, $failhandle); 90 91 # why is this is init and not in new?? 91 92 if ((!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) { 92 93 … … 119 120 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_; 120 121 122 # returns 1 if matches process_exp, and has done blocking in the meantime 121 123 my $matched = $self->SUPER::metadata_read($pluginfo, $base_dir, $file, 122 124 $metadata, $extrametakeys, … … 146 148 147 149 if ($text !~ /\w/) { 148 gsprintf($outhandle, "$plugin_name: { BasPlug.file_has_no_text}\n",150 gsprintf($outhandle, "$plugin_name: {ReadTextFile.file_has_no_text}\n", 149 151 $file) 150 152 if $self->{'verbosity'}; … … 171 173 } 172 174 173 print $outhandle "Split Plugfound " . (scalar @segments) . " documents in $filename\n"175 print $outhandle "SplitTextFile found " . (scalar @segments) . " documents in $filename\n" 174 176 if $self->{'verbosity'}; 175 177 … … 231 233 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding); 232 234 my ($filemeta) = $file =~ /([^\\\/]+)$/; 233 $ doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));235 $self->set_Source_metadata($doc_obj, $filemeta, $encoding); 234 236 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment"); 235 237 if ($self->{'cover_image'}) {
Note:
See TracChangeset
for help on using the changeset viewer.