Changeset 15871


Ignore:
Timestamp:
2008-06-05T09:26:56+12:00 (16 years ago)
Author:
kjdon
Message:

plugin overhaul: SplitPlug renamed to SplitTextFile, XMLPlug renamed to ReadXMLFile, ConvertToPlug renamed to ConvertBinaryFile. With the exception of BasePlugin, only 'real' plugins (top level ones) are named xxPlugin.

Location:
gsdl/trunk/perllib/plugins
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/ConvertBinaryFile.pm

    r15865 r15871  
    11###########################################################################
    22#
    3 # ConvertToPlug.pm -- plugin that inherits from BasPlug
     3# ConvertBinaryFile.pm -- plugin that inherits from BasePlugin
    44#
    55# A component of the Greenstone digital library software
     
    2727# This plugin is inherited by such plugins as WordPlug, PPTPlug, PSPlug,
    2828# RTFPlug and PDFPlug. It facilitates the conversion of these document types
    29 # to either HTML, TEXT or a series of images. It works by dynamically loading
     29# to either HTML, Text or a series of images. It works by dynamically loading
    3030# an appropriate secondary plugin (HTMLPlug, StructuredHTMLPlug,
    31 # PagedImgPlug or TEXTPlug) based on the plugin argument 'convert_to'.
    32 
    33 package ConvertToPlug;
    34 
    35 use BasPlug;
     31# PagedImagePlugin or TextPlugin) based on the plugin argument 'convert_to'.
     32
     33package ConvertBinaryFile;
     34
     35use BasePlugin;
    3636use ghtml;
    37 use HTMLPlug;
    38 use TEXTPlug;
    39 use PagedImgPlug;
     37use HTMLPlugin;
     38use TextPlugin;
     39use PagedImagePlugin;
    4040
    4141use strict;
    4242no strict 'refs'; # allow filehandles to be variables and viceversa
    4343no strict 'subs';
     44
    4445sub BEGIN {
    45     @ConvertToPlug::ISA = ('BasPlug');
     46    @ConvertBinaryFile::ISA = ('BasePlugin');
    4647}
    4748
    4849my $convert_to_list =
    4950    [ { 'name' => "auto",
    50     'desc' => "{ConvertToPlug.convert_to.auto}" },
     51    'desc' => "{ConvertBinaryFile.convert_to.auto}" },
    5152      { 'name' => "html",
    52     'desc' => "{ConvertToPlug.convert_to.html}" },
     53    'desc' => "{ConvertBinaryFile.convert_to.html}" },
    5354      { 'name' => "text",
    54     'desc' => "{ConvertToPlug.convert_to.text}" }
     55    'desc' => "{ConvertBinaryFile.convert_to.text}" }
    5556      ];
    5657
    5758my $arguments =
    5859    [ { 'name' => "convert_to",
    59     'desc' => "{ConvertToPlug.convert_to}",
     60    'desc' => "{ConvertBinaryFile.convert_to}",
    6061    'type' => "enum",
    6162    'reqd' => "yes",
     
    6364    'deft' => "auto" },
    6465      { 'name' => "keep_original_filename",
    65     'desc' => "{ConvertToPlug.keep_original_filename}",
     66    'desc' => "{ConvertBinaryFile.keep_original_filename}",
    6667    'type' => "flag" },
    6768      { 'name' => "title_sub",
     
    7172    'deft' => "" },
    7273      { 'name' => "apply_fribidi",
    73     'desc' => "{ConvertToPlug.apply_fribidi}",
     74    'desc' => "{ConvertBinaryFile.apply_fribidi}",
    7475    'type' => "flag",
    7576    'reqd' => "no" },
    7677      { 'name' => "use_strings",
    77     'desc' => "{ConvertToPlug.use_strings}",
     78    'desc' => "{ConvertBinaryFile.use_strings}",
    7879    'type' => "flag",
    7980    'reqd' => "no" },
    80       { 'name' => "extract_keyphrases",
    81     'desc' => "{BasPlug.extract_keyphrases}",
    82     'type' => "flag",
    83     'reqd' => "no",
    84     'hiddengli' => "yes" },
    85       { 'name' => "extract_keyphrase_options",
    86     'desc' => "{BasPlug.extract_keyphrase_options}",
    87     'type' => "string",
    88     'reqd' => "no",
    89     'hiddengli' => "yes" } ];
    90 
    91 my $options = { 'name'     => "ConvertToPlug",
    92         'desc'     => "{ConvertToPlug.desc}",
     81#      { 'name' => "extract_keyphrases",
     82#   'desc' => "{BasPlug.extract_keyphrases}",
     83#   'type' => "flag",
     84#   'reqd' => "no",
     85#   'hiddengli' => "yes" },
     86#      { 'name' => "extract_keyphrase_options",
     87#   'desc' => "{BasPlug.extract_keyphrase_options}",
     88#   'type' => "string",
     89#   'reqd' => "no",
     90#   'hiddengli' => "yes" }
     91      ];
     92
     93my $options = { 'name'     => "ConvertBinaryFile",
     94        'desc'     => "{ConvertBinaryFile.desc}",
    9395        'abstract' => "yes",
    9496        'inherits' => "yes",
     
    107109    foreach my $convert_to (@convert_to_list) {
    108110    # load in "convert_to" plugin package
    109     my $plugin_class = $convert_to."Plug";
     111    my $plugin_class = $convert_to."Plugin";
    110112    my $plugin_package = $plugin_class.".pm";
    111113
     
    145147    push(@$pluginlist, $class);
    146148    my $classPluginName = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
    147     if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
    148     if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
    149 
    150     my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);
     149    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
     150    push(@{$hashArgOptLists->{"OptList"}},$options);
     151
     152    my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
    151153   
    152154    if ($self->{'info_only'}) {
     
    161163    my $windows_scripting = $self->{'windows_scripting'};
    162164    $windows_scripting = 0 unless defined $windows_scripting;
    163     if ($classPluginName eq "PDFPlug") {
     165    if ($classPluginName eq "PDFPlugin") {
    164166    if ($convert_to_type eq "text" &&
    165167        $ENV{'GSDLOS'} =~ /^windows$/i) {
     
    167169        $convert_to_type = "html";
    168170    }
    169     } elsif ($classPluginName eq "WordPlug") {
     171    } elsif ($classPluginName eq "WordPlugin") {
    170172    if ($windows_scripting && $ENV{'GSDLOS'} =~ /^windows$/i && $convert_to_type =~ /^(html|auto)$/) {
    171173        # we use structured HTML, not normal html
    172174        $convert_to_type = "structuredhtml";
    173175    }
    174     } elsif ($classPluginName eq "PPTPlug") {
     176    } elsif ($classPluginName eq "PPTPlugin") {
    175177    if ($windows_scripting && $ENV{'GSDLOS'} =~ /^windows$/i && $convert_to_type eq "auto") {
    176178        # we use paged img
    177179        $convert_to_type = "pagedimg_jpg";
    178180    }
    179     } elsif ($classPluginName eq "PSPlug") {
     181    } elsif ($classPluginName eq "PSPlugin") {
    180182    if ($convert_to_type eq "auto") {
    181183        # we use text
     
    193195    $self->{'convert_to_ext'} = "html";
    194196    } elsif ($convert_to_type eq "text") {
    195     $self->{'convert_to'} = "TEXT";
     197    $self->{'convert_to'} = "Text";
    196198    $self->{'convert_to_ext'} = "txt";
    197199    } elsif ($convert_to_type eq "structuredhtml") {
     
    199201    $self->{'convert_to_ext'} = "html";
    200202    } elsif ($convert_to_type =~ /^pagedimg/) {
    201     $self->{'convert_to'} = "PagedImg";
     203    $self->{'convert_to'} = "PagedImage";
    202204    my ($convert_to_ext) = $convert_to_type =~ /pagedimg\_(jpg|gif|png)/i;
    203205    $convert_to_ext = 'jpg' unless defined $convert_to_ext;
     
    305307    # making sure the converter gives us the appropriate output type
    306308    my $output_type="";
    307     if ($convert_to =~ m/PagedImg/i) {
     309    if ($convert_to =~ m/PagedImage/i) {
    308310    $output_type = lc($convert_to)."_".lc($convert_to_ext);
    309311    } else {
     
    349351    $self->{'converted_to'} = "HTML";
    350352    } elsif ($output_type =~ /te?xt/i) {
    351     $self->{'converted_to'} = "TEXT";
     353    $self->{'converted_to'} = "Text";
    352354    } elsif ($output_type =~ /item/i){
    353     $self->{'converted_to'} = "PagedImg";
     355    $self->{'converted_to'} = "PagedImage";
    354356    }
    355357   
     
    370372
    371373
    372 # Override BasPlug read
    373 # We don't want to get language encoding stuff until after we've converted
    374 # our file to either TEXT or HTML or PagedImage.
    375 sub read {
     374# Override BasePlugin read_into_doc_obj - we need to call secondary plugin stuff
     375sub read_into_doc_obj {
    376376    my $self = shift (@_);
    377377    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    378378
    379379    my $outhandle = $self->{'outhandle'};
    380    
    381     my ($block_status,$filename) = $self->read_block(@_);
    382     return $block_status if ((!defined $block_status) || ($block_status==0));
    383     $file = $self->read_tidy_file($file);
    384    
     380
     381    my ($filename_full_path, $filename_no_path) = $self->get_full_filenames($base_dir, $file);
     382
    385383    my $output_ext = $self->{'convert_to_ext'};
    386384    my $conv_filename = "";
    387     $conv_filename = $self->tmp_area_convert_file($output_ext, $filename);
     385    $conv_filename = $self->tmp_area_convert_file($output_ext, $filename_full_path);
    388386   
    389387    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
     
    394392    # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file
    395393    # Added for fixing up Persian PDFs after being processed by pdftohtml, but may be useful in other cases too
    396     if ($self->{'apply_fribidi'} && $self->{'converted_to'} =~ /(HTML|TEXT)/) {
     394    if ($self->{'apply_fribidi'} && $self->{'converted_to'} =~ /(HTML|Text)/) {
    397395    my $fribidi_command = "fribidi \"$conv_filename\" >\"${conv_filename}.tmp\"";
    398396    if (system($fribidi_command) != 0) {
     
    423421    # note: metadata is not carried on to the next level
    424422    my ($rv,$doc_obj)
    425     = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename,
    426                         $metadata, $processor, $maxdocs, $total_count,
    427                         $gli);
     423    = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename, $metadata, $processor, $maxdocs, $total_count, $gli);
    428424
    429425    if ((!defined $rv) || ($rv<1)) {
     
    433429   
    434430    # Override previous gsdlsourcefilename set by secondary plugin
    435     my $collect_file = &util::filename_within_collection($filename);
     431    my $collect_file = &util::filename_within_collection($filename_full_path);
    436432    my $collect_conv_file = &util::filename_within_collection($conv_filename);
    437433    $doc_obj->set_source_filename ($collect_file);
    438434    $doc_obj->set_converted_filename($collect_conv_file);
    439435
    440     my ($filemeta) = $file =~ /([^\\\/]+)$/;
    441     $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
     436    $self->set_Source_metadata($doc_obj, $filename_no_path);
     437       
    442438    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    443     $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename));
    444 
    445     if ($self->{'cover_image'}) {
    446     $self->associate_cover_image($doc_obj, $filename);
    447     }
     439    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));
    448440
    449441    # do plugin specific processing of doc_obj
    450     unless (defined ($self->process(undef, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
     442    unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
    451443    print STDERR "<ProcessingError n='$file'>\n" if ($gli);
    452444    return -1;
    453445    }
     446
     447    my $topsection = $doc_obj->get_top_section();
     448    $self->add_associated_files($doc_obj, $filename_full_path);
     449    $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
    454450    # do any automatic metadata extraction
    455451    $self->auto_extract_metadata ($doc_obj);
    456452
    457453    # have we found a Title??
    458     $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$filemeta);
    459 
    460 #    # add an OID
    461 #    $doc_obj->set_OID();
    462 
    463     # add an OID
    464     # see if there is a plugin-specific set_OID function...
    465     if (defined ($self->can('set_OID'))) {
    466     # it will need $doc_obj to set the Identifier metadata...
    467     $self->set_OID($doc_obj);
    468     } else {
    469     # use the default set_OID() in doc.pm
    470     $doc_obj->set_OID();
    471     }
    472 
    473 
    474     # process the document
    475     $processor->process($doc_obj);
    476 
    477     $self->{'num_processed'} ++;
    478 
    479     return 1;
    480 }
    481 
     454    $self->title_fallback($doc_obj,$topsection,$filename_no_path);
     455
     456    $self->add_OID($doc_obj);
     457
     458    return (1, $doc_obj);
     459
     460}
     461
     462sub process {
     463    my $self = shift (@_);
     464    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
     465
     466    return $self->process_type($base_dir, $file, $doc_obj);
     467}
    482468
    483469# do plugin specific processing of doc_obj for doc_ext type
    484470sub process_type {
    485471    my $self = shift (@_);
    486     my ($doc_ext, $base_dir, $file, $doc_obj) = @_;
     472    my ($base_dir, $file, $doc_obj) = @_;
     473   
     474    # need to check that not empty
     475    my $doc_ext = $self->{'filename_extension'};
     476    my $file_type = "unknown";
     477    $file_type = $self->{'file_type'} if defined $self->{'file_type'};
    487478   
    488479    # associate original file with doc object
     
    496487    $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);
    497488
    498     my $file_type;
    499 
    500     if ($doc_ext eq "doc") {
    501         $file_type = "Word";
    502     } elsif ($doc_ext eq "xls") {
    503     $file_type = "Excel";
    504     } elsif ($doc_ext eq "ppt") {
    505     $file_type = "PPT";
    506     } elsif ($doc_ext eq "pdf") {
    507     $file_type = "PDF";
    508     } elsif ($doc_ext eq "rtf") {
    509     $file_type = "RTF";
    510     } elsif ($doc_ext eq "ps") {
    511     $file_type = "PS";
    512     }
    513 
    514     my $file_format = $file_type || "unknown";
    515 
    516489    # We use set instead of add here because we only want one value
    517     $doc_obj->set_utf8_metadata_element($cursection, "FileFormat", $file_format);
     490    $doc_obj->set_utf8_metadata_element($cursection, "FileFormat", $file_type);
    518491    my $doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/[archivedir]/doc.$doc_ext\">";
    519492    if ($self->{'keep_original_filename'} == 1) {
  • gsdl/trunk/perllib/plugins/ReadXMLFile.pm

    r15865 r15871  
    11###########################################################################
    22#
    3 # XMLPlug.pm -- base class for XML plugins
     3# ReadXMLFile.pm -- base class for XML plugins
    44# A component of the Greenstone digital library software
    55# from the New Zealand Digital Library Project at the
     
    2424###########################################################################
    2525
    26 package XMLPlug;
    27 
    28 use BasPlug;
     26package ReadXMLFile;
     27
     28use BasePlugin;
    2929use doc;
    3030use strict;
     
    3232
    3333sub BEGIN {
    34     @XMLPlug::ISA = ('BasPlug');
     34    @ReadXMLFile::ISA = ('BasePlugin');
    3535    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
    3636}
     
    4040my $arguments =
    4141    [ { 'name' => "process_exp",
    42     'desc' => "{BasPlug.process_exp}",
     42    'desc' => "{BasePlugin.process_exp}",
    4343    'type' => "regexp",
    4444    'deft' => &get_default_process_exp(),
    4545    'reqd' => "no" },
    4646      { 'name' => "xslt",
    47     'desc' => "{XMLPlug.xslt}",
     47    'desc' => "{ReadXMLFile.xslt}",
    4848    'type' => "string",
    4949    'deft' => "",
    5050    'reqd' => "no" } ];
    5151
    52 my $options = { 'name'     => "XMLPlug",
    53         'desc'     => "{XMLPlug.desc}",
     52my $options = { 'name'     => "ReadXMLFile",
     53        'desc'     => "{ReadXMLFile.desc}",
    5454        'abstract' => "yes",
    5555        'inherits' => "yes",
     
    6161    push(@$pluginlist, $class);
    6262
    63     if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
    64     if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
    65    
    66     # $self is global for use within subroutines called by XML::Parser
    67     my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);
     63    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
     64    push(@{$hashArgOptLists->{"OptList"}},$options);
     65   
     66    my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
    6867
    6968    if ($self->{'info_only'}) {
    70     # don't worry about any options etc
     69    # don't worry about creating the XML parser as all we want is the
     70    # list of plugin options
    7171    return bless $self, $class;
    7272    }
    7373
    7474    my $parser = new XML::Parser('Style' => 'Stream',
    75                                  'Pkg' => 'XMLPlug',
     75                                 'Pkg' => 'ReadXMLFile',
    7676                                 'PluginObj' => $self,
    7777                 'Handlers' => {'Char' => \&Char,
     
    198198    if (defined $result) {
    199199    # we think we are processing this, but check that we actually are
    200     my $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
     200    my $filename = $self->get_full_filename($base_dir, $file);
    201201
    202202    if ($self->check_doctype($filename)) {
     
    207207}
    208208
     209# we need to implement read because we are not just using process_exp to determine
     210# whether to process this or not.
    209211sub read {
    210212    my $self = shift (@_); 
     
    213215
    214216    # Make sure we're processing the correct file, do blocking etc
    215     my ($block_status,$filename) = $self->read_block(@_);   
     217    my ($block_status,$filename_full_path) = $self->read_block(@_);   
    216218    return $block_status if ((!defined $block_status) || ($block_status==0));
    217219
    218220    ## check the doctype to see whether we really want to process the file
    219     if (!$self->check_doctype($filename)) {
     221    if (!$self->check_doctype($filename_full_path)) {
    220222    # this file is not for us
    221223    return undef;
     
    225227    $self->{'base_dir'} = $base_dir;
    226228    $self->{'file'} = $file;
    227     $self->{'filename'} = $filename;
     229    $self->{'filename'} = $filename_full_path;
    228230    $self->{'processor'} = $processor;
    229231    $self->{'metadata'} = $metadata;
     
    233235    if (defined $xslt && ($xslt ne "")) {
    234236        # perform xslt
    235         my $transformed_xml = $self->apply_xslt($xslt,$filename);
     237        my $transformed_xml = $self->apply_xslt($xslt,$filename_full_path);
    236238
    237239        # feed transformed file (now in memory as string) into XML parser
     
    239241    }
    240242    else {
    241         $self->{'parser'}->parsefile($filename);
     243        $self->{'parser'}->parsefile($filename_full_path);
    242244    }
    243245    };
     
    246248
    247249    # parsefile may either croak somewhere in XML::Parser (e.g. because
    248     # the document is not well formed) or die somewhere in XMLPlug or a
     250    # the document is not well formed) or die somewhere in ReadXMLFile or a
    249251    # derived plugin (e.g. because we're attempting to process a
    250252    # document whose DOCTYPE is not meant for this plugin). For the
     
    271273}
    272274
    273 # the following two methods are for if you want to do the parsing from a
    274 # plugin that inherits from this. it seems that you can't call the parse
    275 # methods directly. WHY???
    276 #
    277 # [Stefan 27/5/07] These two methods may not be necessary any more as I've
    278 # fixed XMLPlug so $self is no longer required to be a global variable
    279 # (that was why inheritance wasn't working quite right with XMLPlug I
    280 # think). I don't really know what other plugins rely on these methods
    281 # though so have left them here for now.
    282 sub parse_file {
    283     my $self = shift (@_);
    284     my ($filename) = @_;
    285     $self->{'parser'}->parsefile($filename);
    286 }
    287 
    288 sub parse_string {
    289     my $self = shift (@_);
    290     my ($xml_string) = @_;
    291     $self->{'parser'}->parse($xml_string);
    292 }
    293275
    294276sub get_default_process_exp {
     
    344326
    345327    my ($expat, $name, $sysid, $pubid, $internal) = @_;
    346     die "XMLPlug Cannot process XML document with DOCTYPE of $name";
     328    die "ReadXMLFile Cannot process XML document with DOCTYPE of $name";
    347329}
    348330
     
    395377    $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc");
    396378    $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'}, $self->{'processor'}->{'OIDmetadata'});
     379    $self->{'doc_obj'}->add_utf8_metadata($self->{'doc_obj'}->get_top_section(), "Plugin", "$self->{'plugin_type'}");
     380
     381    # do we want other auto metadata here (see BasePlugin.read_into_doc_obj)
    397382}
    398383
     
    400385    my $self = shift(@_);
    401386    my $doc_obj = $self->{'doc_obj'};
     387
     388    # do we want other auto stuff here, see BasePlugin.read_into_doc_obj
     389
    402390    # include any metadata passed in from previous plugins
    403391    # note that this metadata is associated with the top level section
     
    410398   
    411399    # add an OID
    412     $doc_obj->set_OID();
     400    $self->add_OID();
    413401   
    414402    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
     
    419407   
    420408    $self->{'num_processed'} ++;
     409    undef $self->{'doc_obj'};
     410    undef $doc_obj; # is this the same as above??
    421411}
    422412
  • gsdl/trunk/perllib/plugins/SplitTextFile.pm

    r15865 r15871  
    11###########################################################################
    22#
    3 # SplitPlug.pm - a plugin for splitting input files into segments that
     3# SplitTextFile.pm - a plugin for splitting input files into segments that
    44#                will then be individually processed.
    55#
     
    2929
    3030
    31 # SplitPlug is a plugin for splitting input files into segments that will
     31# SplitTextFile is a plugin for splitting input files into segments that will
    3232# then be individually processed. 
    3333
     
    3535# process input files that contain several documents, you should write a
    3636# plugin with a process function that will handle one of those documents
    37 # and have it inherit from SplitPlug.  See ReferPlug for an example.
    38 
    39 
    40 package SplitPlug;
    41 
    42 use BasPlug;
     37# and have it inherit from SplitTextFile.  See ReferPlug for an example.
     38
     39
     40package SplitTextFile;
     41
     42use ReadTextFile;
    4343use gsprintf 'gsprintf';
    4444use util;
     
    4747no strict 'refs'; # allow filehandles to be variables and viceversa
    4848
    49 # SplitPlug is a sub-class of BasPlug.
     49# SplitTextFile is a sub-class of ReadTextFile.
    5050sub BEGIN {
    51     @SplitPlug::ISA = ('BasPlug');
     51    @SplitTextFile::ISA = ('ReadTextFile');
    5252}
    5353
     
    5555my $arguments =
    5656    [ { 'name' => "split_exp",
    57     'desc' => "{SplitPlug.split_exp}",
     57    'desc' => "{SplitTextFile.split_exp}",
    5858    'type' => "regexp",
    5959    #'deft' => &get_default_split_exp(),
     
    6161    'reqd' => "no" } ];
    6262
    63 my $options = { 'name'     => "SplitPlug",
    64         'desc'     => "{SplitPlug.desc}",
     63my $options = { 'name'     => "SplitTextFile",
     64        'desc'     => "{SplitTextFile.desc}",
    6565        'abstract' => "yes",
    6666        'inherits' => "yes",
     
    7373    push(@$pluginlist, $class);
    7474
    75     if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
    76     if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
    77 
    78     my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);
     75    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
     76    push(@{$hashArgOptLists->{"OptList"}},$options);
     77
     78    my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
    7979
    8080    $self->{'textcat_store'} = {};
     
    8787    my ($verbosity, $outhandle, $failhandle) = @_;
    8888
    89     $self->BasPlug::init($verbosity, $outhandle, $failhandle);
    90 
     89    $self->ReadTextFile::init($verbosity, $outhandle, $failhandle);
     90
     91    # why is this in init and not in new??
    9192    if ((!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
    9293
     
    119120    my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
    120121
     122    # returns 1 if matches process_exp, and has done blocking in the meantime
    121123    my $matched = $self->SUPER::metadata_read($pluginfo, $base_dir, $file,
    122124                          $metadata, $extrametakeys,
     
    146148 
    147149    if ($text !~ /\w/) {
    148         gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n",
     150        gsprintf($outhandle, "$plugin_name: {ReadTextFile.file_has_no_text}\n",
    149151             $file)
    150152        if $self->{'verbosity'};
     
    171173    }
    172174
    173     print $outhandle "SplitPlug found " . (scalar @segments) . " documents in $filename\n"
     175    print $outhandle "SplitTextFile found " . (scalar @segments) . " documents in $filename\n"
    174176        if $self->{'verbosity'};
    175177   
     
    231233    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
    232234    my ($filemeta) = $file =~ /([^\\\/]+)$/;
    233     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
     235    $self->set_Source_metadata($doc_obj, $filemeta, $encoding);
    234236    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");
    235237    if ($self->{'cover_image'}) {
Note: See TracChangeset for help on using the changeset viewer.