Changeset 15871

Show
Ignore:
Timestamp:
05.06.2008 09:26:56 (11 years ago)
Author:
kjdon
Message:

plugin overhaul: SplitPlug renamed to SplitTextFile, XMLPlug renamed to ReadXMLFile, ConvertToPlug renamed to ConvertBinaryFile. With the exception of BasePlugin, only 'real' plugins (top-level ones) are named xxPlugin.

Location:
gsdl/trunk/perllib/plugins
Files:
3 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/ConvertBinaryFile.pm

    r15865 r15871  
    11########################################################################### 
    22# 
    3 # ConvertToPlug.pm -- plugin that inherits from BasPlug 
     3# ConvertBinaryFile.pm -- plugin that inherits from BasPlug 
    44# 
    55# A component of the Greenstone digital library software 
     
    2727# This plugin is inherited by such plugins as WordPlug, PPTPlug, PSPlug,  
    2828# RTFPlug and PDFPlug. It facilitates the conversion of these document types  
    29 # to either HTML, TEXT or a series of images. It works by dynamically loading  
     29# to either HTML, Text or a series of images. It works by dynamically loading  
    3030# an appropriate secondary plugin (HTMLPlug, StructuredHTMLPlug,  
    31 # PagedImgPlug or TEXTPlug) based on the plugin argument 'convert_to'.  
    32  
    33 package ConvertToPlug; 
    34  
    35 use BasPlug; 
     31# PagedImagePlugin or TextPlugin) based on the plugin argument 'convert_to'.  
     32 
     33package ConvertBinaryFile; 
     34 
     35use BasePlugin; 
    3636use ghtml; 
    37 use HTMLPlug; 
    38 use TEXTPlug; 
    39 use PagedImgPlug; 
     37use HTMLPlugin; 
     38use TextPlugin; 
     39use PagedImagePlugin; 
    4040 
    4141use strict; 
    4242no strict 'refs'; # allow filehandles to be variables and viceversa 
    4343no strict 'subs'; 
     44 
    4445sub BEGIN { 
    45     @ConvertToPlug::ISA = ('BasPlug'); 
     46    @ConvertBinaryFile::ISA = ('BasePlugin'); 
    4647} 
    4748 
    4849my $convert_to_list = 
    4950    [ { 'name' => "auto", 
    50     'desc' => "{ConvertToPlug.convert_to.auto}" }, 
     51    'desc' => "{ConvertBinaryFile.convert_to.auto}" }, 
    5152      { 'name' => "html", 
    52     'desc' => "{ConvertToPlug.convert_to.html}" }, 
     53    'desc' => "{ConvertBinaryFile.convert_to.html}" }, 
    5354      { 'name' => "text", 
    54     'desc' => "{ConvertToPlug.convert_to.text}" } 
     55    'desc' => "{ConvertBinaryFile.convert_to.text}" } 
    5556      ]; 
    5657 
    5758my $arguments = 
    5859    [ { 'name' => "convert_to", 
    59     'desc' => "{ConvertToPlug.convert_to}", 
     60    'desc' => "{ConvertBinaryFile.convert_to}", 
    6061    'type' => "enum", 
    6162    'reqd' => "yes", 
     
    6364    'deft' => "auto" }, 
    6465      { 'name' => "keep_original_filename", 
    65     'desc' => "{ConvertToPlug.keep_original_filename}", 
     66    'desc' => "{ConvertBinaryFile.keep_original_filename}", 
    6667    'type' => "flag" }, 
    6768      { 'name' => "title_sub", 
     
    7172    'deft' => "" }, 
    7273      { 'name' => "apply_fribidi", 
    73     'desc' => "{ConvertToPlug.apply_fribidi}", 
     74    'desc' => "{ConvertBinaryFile.apply_fribidi}", 
    7475    'type' => "flag", 
    7576    'reqd' => "no" }, 
    7677      { 'name' => "use_strings", 
    77     'desc' => "{ConvertToPlug.use_strings}", 
     78    'desc' => "{ConvertBinaryFile.use_strings}", 
    7879    'type' => "flag", 
    7980    'reqd' => "no" }, 
    80       { 'name' => "extract_keyphrases", 
    81     'desc' => "{BasPlug.extract_keyphrases}", 
    82     'type' => "flag", 
    83     'reqd' => "no", 
    84     'hiddengli' => "yes" }, 
    85       { 'name' => "extract_keyphrase_options", 
    86     'desc' => "{BasPlug.extract_keyphrase_options}", 
    87     'type' => "string", 
    88     'reqd' => "no", 
    89     'hiddengli' => "yes" } ]; 
    90  
    91 my $options = { 'name'     => "ConvertToPlug", 
    92         'desc'     => "{ConvertToPlug.desc}", 
     81#      { 'name' => "extract_keyphrases", 
     82#   'desc' => "{BasPlug.extract_keyphrases}", 
     83#   'type' => "flag", 
     84#   'reqd' => "no", 
     85#   'hiddengli' => "yes" }, 
     86#      { 'name' => "extract_keyphrase_options", 
     87#   'desc' => "{BasPlug.extract_keyphrase_options}", 
     88#   'type' => "string", 
     89#   'reqd' => "no", 
     90#   'hiddengli' => "yes" }  
     91      ]; 
     92 
     93my $options = { 'name'     => "ConvertBinaryFile", 
     94        'desc'     => "{ConvertBinaryFile.desc}", 
    9395        'abstract' => "yes", 
    9496        'inherits' => "yes", 
     
    107109    foreach my $convert_to (@convert_to_list) { 
    108110    # load in "convert_to" plugin package 
    109     my $plugin_class = $convert_to."Plug"; 
     111    my $plugin_class = $convert_to."Plugin"; 
    110112    my $plugin_package = $plugin_class.".pm"; 
    111113 
     
    145147    push(@$pluginlist, $class); 
    146148    my $classPluginName = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class; 
    147     if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});} 
    148     if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)}; 
    149  
    150     my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists); 
     149    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments}); 
     150    push(@{$hashArgOptLists->{"OptList"}},$options); 
     151 
     152    my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists); 
    151153     
    152154    if ($self->{'info_only'}) { 
     
    161163    my $windows_scripting = $self->{'windows_scripting'}; 
    162164    $windows_scripting = 0 unless defined $windows_scripting; 
    163     if ($classPluginName eq "PDFPlug") { 
     165    if ($classPluginName eq "PDFPlugin") { 
    164166    if ($convert_to_type eq "text" &&  
    165167        $ENV{'GSDLOS'} =~ /^windows$/i) { 
     
    167169        $convert_to_type = "html"; 
    168170    } 
    169     } elsif ($classPluginName eq "WordPlug") { 
     171    } elsif ($classPluginName eq "WordPlugin") { 
    170172    if ($windows_scripting && $ENV{'GSDLOS'} =~ /^windows$/i && $convert_to_type =~ /^(html|auto)$/) { 
    171173        # we use structured HTML, not normal html 
    172174        $convert_to_type = "structuredhtml"; 
    173175    }  
    174     } elsif ($classPluginName eq "PPTPlug") { 
     176    } elsif ($classPluginName eq "PPTPlugin") { 
    175177    if ($windows_scripting && $ENV{'GSDLOS'} =~ /^windows$/i && $convert_to_type eq "auto") { 
    176178        # we use paged img 
    177179        $convert_to_type = "pagedimg_jpg"; 
    178180    }  
    179     } elsif ($classPluginName eq "PSPlug") { 
     181    } elsif ($classPluginName eq "PSPlugin") { 
    180182    if ($convert_to_type eq "auto") { 
    181183        # we use text 
     
    193195    $self->{'convert_to_ext'} = "html"; 
    194196    } elsif ($convert_to_type eq "text") { 
    195     $self->{'convert_to'} = "TEXT"; 
     197    $self->{'convert_to'} = "Text"; 
    196198    $self->{'convert_to_ext'} = "txt"; 
    197199    } elsif ($convert_to_type eq "structuredhtml") { 
     
    199201    $self->{'convert_to_ext'} = "html"; 
    200202    } elsif ($convert_to_type =~ /^pagedimg/) { 
    201     $self->{'convert_to'} = "PagedImg"; 
     203    $self->{'convert_to'} = "PagedImage"; 
    202204    my ($convert_to_ext) = $convert_to_type =~ /pagedimg\_(jpg|gif|png)/i; 
    203205    $convert_to_ext = 'jpg' unless defined $convert_to_ext; 
     
    305307    # making sure the converter gives us the appropriate output type 
    306308    my $output_type=""; 
    307     if ($convert_to =~ m/PagedImg/i) { 
     309    if ($convert_to =~ m/PagedImage/i) { 
    308310    $output_type = lc($convert_to)."_".lc($convert_to_ext); 
    309311    } else { 
     
    349351    $self->{'converted_to'} = "HTML"; 
    350352    } elsif ($output_type =~ /te?xt/i) { 
    351     $self->{'converted_to'} = "TEXT"; 
     353    $self->{'converted_to'} = "Text"; 
    352354    } elsif ($output_type =~ /item/i){ 
    353     $self->{'converted_to'} = "PagedImg"; 
     355    $self->{'converted_to'} = "PagedImage"; 
    354356    } 
    355357     
     
    370372 
    371373 
    372 # Override BasPlug read 
    373 # We don't want to get language encoding stuff until after we've converted 
    374 # our file to either TEXT or HTML or PagedImage. 
    375 sub read { 
     374# Override BasPlug read_into_doc_obj - we need to call secondary plugin stuff 
     375sub read_into_doc_obj { 
    376376    my $self = shift (@_); 
    377377    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 
    378378 
    379379    my $outhandle = $self->{'outhandle'}; 
    380      
    381     my ($block_status,$filename) = $self->read_block(@_); 
    382     return $block_status if ((!defined $block_status) || ($block_status==0)); 
    383     $file = $self->read_tidy_file($file); 
    384      
     380 
     381    my ($filename_full_path, $filename_no_path) = $self->get_full_filenames($base_dir, $file); 
     382 
    385383    my $output_ext = $self->{'convert_to_ext'}; 
    386384    my $conv_filename = ""; 
    387     $conv_filename = $self->tmp_area_convert_file($output_ext, $filename); 
     385    $conv_filename = $self->tmp_area_convert_file($output_ext, $filename_full_path); 
    388386     
    389387    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline  
     
    394392    # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file 
    395393    # Added for fixing up Persian PDFs after being processed by pdftohtml, but may be useful in other cases too 
    396     if ($self->{'apply_fribidi'} && $self->{'converted_to'} =~ /(HTML|TEXT)/) { 
     394    if ($self->{'apply_fribidi'} && $self->{'converted_to'} =~ /(HTML|Text)/) { 
    397395    my $fribidi_command = "fribidi \"$conv_filename\" >\"${conv_filename}.tmp\""; 
    398396    if (system($fribidi_command) != 0) { 
     
    423421    # note: metadata is not carried on to the next level 
    424422    my ($rv,$doc_obj)  
    425     = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename,  
    426                         $metadata, $processor, $maxdocs, $total_count, 
    427                         $gli); 
     423    = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename, $metadata, $processor, $maxdocs, $total_count, $gli); 
    428424 
    429425    if ((!defined $rv) || ($rv<1)) { 
     
    433429     
    434430    # Override previous gsdlsourcefilename set by secondary plugin 
    435     my $collect_file = &util::filename_within_collection($filename); 
     431    my $collect_file = &util::filename_within_collection($filename_full_path); 
    436432    my $collect_conv_file = &util::filename_within_collection($conv_filename); 
    437433    $doc_obj->set_source_filename ($collect_file);  
    438434    $doc_obj->set_converted_filename($collect_conv_file); 
    439435 
    440     my ($filemeta) = $file =~ /([^\\\/]+)$/; 
    441     $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta)); 
     436    $self->set_Source_metadata($doc_obj, $filename_no_path); 
     437         
    442438    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); 
    443     $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename)); 
    444  
    445     if ($self->{'cover_image'}) { 
    446     $self->associate_cover_image($doc_obj, $filename); 
    447     } 
     439    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path)); 
    448440 
    449441    # do plugin specific processing of doc_obj 
    450     unless (defined ($self->process(undef, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) { 
     442    unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) { 
    451443    print STDERR "<ProcessingError n='$file'>\n" if ($gli); 
    452444    return -1; 
    453445    } 
     446 
     447    my $topsection = $doc_obj->get_top_section(); 
     448    $self->add_associated_files($doc_obj, $filename_full_path); 
     449    $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here?? 
    454450    # do any automatic metadata extraction 
    455451    $self->auto_extract_metadata ($doc_obj); 
    456452 
    457453    # have we found a Title?? 
    458     $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$filemeta); 
    459  
    460 #    # add an OID 
    461 #    $doc_obj->set_OID(); 
    462  
    463     # add an OID 
    464     # see if there is a plugin-specific set_OID function... 
    465     if (defined ($self->can('set_OID'))) { 
    466     # it will need $doc_obj to set the Identifier metadata... 
    467     $self->set_OID($doc_obj); 
    468     } else { 
    469     # use the default set_OID() in doc.pm 
    470     $doc_obj->set_OID(); 
    471     } 
    472  
    473  
    474     # process the document 
    475     $processor->process($doc_obj); 
    476  
    477     $self->{'num_processed'} ++; 
    478  
    479     return 1; 
    480 } 
    481  
     454    $self->title_fallback($doc_obj,$topsection,$filename_no_path); 
     455 
     456    $self->add_OID($doc_obj); 
     457 
     458    return (1, $doc_obj); 
     459 
     460} 
     461 
     462sub process { 
     463    my $self = shift (@_); 
     464    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_; 
     465 
     466    return $self->process_type($base_dir, $file, $doc_obj); 
     467} 
    482468 
    483469# do plugin specific processing of doc_obj for doc_ext type 
    484470sub process_type { 
    485471    my $self = shift (@_); 
    486     my ($doc_ext, $base_dir, $file, $doc_obj) = @_; 
     472    my ($base_dir, $file, $doc_obj) = @_; 
     473     
     474    # need to check that not empty 
     475    my $doc_ext = $self->{'filename_extension'}; 
     476    my $file_type = "unknown"; 
     477    $file_type = $self->{'file_type'} if defined $self->{'file_type'}; 
    487478     
    488479    # associate original file with doc object 
     
    496487    $doc_obj->associate_file($filename, $assocfilename, undef, $cursection); 
    497488 
    498     my $file_type; 
    499  
    500     if ($doc_ext eq "doc") { 
    501         $file_type = "Word"; 
    502     } elsif ($doc_ext eq "xls") { 
    503     $file_type = "Excel"; 
    504     } elsif ($doc_ext eq "ppt") { 
    505     $file_type = "PPT";  
    506     } elsif ($doc_ext eq "pdf") { 
    507     $file_type = "PDF";  
    508     } elsif ($doc_ext eq "rtf") { 
    509     $file_type = "RTF"; 
    510     } elsif ($doc_ext eq "ps") { 
    511     $file_type = "PS"; 
    512     } 
    513  
    514     my $file_format = $file_type || "unknown"; 
    515  
    516489    # We use set instead of add here because we only want one value 
    517     $doc_obj->set_utf8_metadata_element($cursection, "FileFormat", $file_format); 
     490    $doc_obj->set_utf8_metadata_element($cursection, "FileFormat", $file_type); 
    518491    my $doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/[archivedir]/doc.$doc_ext\">"; 
    519492    if ($self->{'keep_original_filename'} == 1) { 
  • gsdl/trunk/perllib/plugins/ReadXMLFile.pm

    r15865 r15871  
    11########################################################################### 
    22# 
    3 # XMLPlug.pm -- base class for XML plugins 
     3# ReadXMLFile.pm -- base class for XML plugins 
    44# A component of the Greenstone digital library software 
    55# from the New Zealand Digital Library Project at the  
     
    2424########################################################################### 
    2525 
    26 package XMLPlug; 
    27  
    28 use BasPlug; 
     26package ReadXMLFile; 
     27 
     28use BasePlugin; 
    2929use doc; 
    3030use strict; 
     
    3232 
    3333sub BEGIN { 
    34     @XMLPlug::ISA = ('BasPlug'); 
     34    @ReadXMLFile::ISA = ('BasePlugin'); 
    3535    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan"); 
    3636} 
     
    4040my $arguments = 
    4141    [ { 'name' => "process_exp", 
    42     'desc' => "{BasPlug.process_exp}", 
     42    'desc' => "{BasePlugin.process_exp}", 
    4343    'type' => "regexp", 
    4444    'deft' => &get_default_process_exp(), 
    4545    'reqd' => "no" }, 
    4646      { 'name' => "xslt", 
    47     'desc' => "{XMLPlug.xslt}", 
     47    'desc' => "{ReadXMLFile.xslt}", 
    4848    'type' => "string", 
    4949    'deft' => "", 
    5050    'reqd' => "no" } ]; 
    5151 
    52 my $options = { 'name'     => "XMLPlug", 
    53         'desc'     => "{XMLPlug.desc}", 
     52my $options = { 'name'     => "ReadXMLFile", 
     53        'desc'     => "{ReadXMLFile.desc}", 
    5454        'abstract' => "yes", 
    5555        'inherits' => "yes", 
     
    6161    push(@$pluginlist, $class); 
    6262 
    63     if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});} 
    64     if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)}; 
    65      
    66     # $self is global for use within subroutines called by XML::Parser 
    67     my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists); 
     63    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments}); 
     64    push(@{$hashArgOptLists->{"OptList"}},$options); 
     65     
     66    my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists); 
    6867 
    6968    if ($self->{'info_only'}) { 
    70     # don't worry about any options etc 
     69    # don't worry about creating the XML parser as all we want is the  
     70    # list of plugin options 
    7171    return bless $self, $class; 
    7272    } 
    7373 
    7474    my $parser = new XML::Parser('Style' => 'Stream', 
    75                                  'Pkg' => 'XMLPlug', 
     75                                 'Pkg' => 'ReadXMLFile', 
    7676                                 'PluginObj' => $self, 
    7777                 'Handlers' => {'Char' => \&Char, 
     
    198198    if (defined $result) { 
    199199    # we think we are processing this, but check that we actually are 
    200     my $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; 
     200    my $filename = $self->get_full_filename($base_dir, $file); 
    201201 
    202202    if ($self->check_doctype($filename)) { 
     
    207207} 
    208208 
     209# we need to implement read cos we are not just using process_exp to determine 
     210# whether to process this or not. 
    209211sub read { 
    210212    my $self = shift (@_);   
     
    213215 
    214216    # Make sure we're processing the correct file, do blocking etc 
    215     my ($block_status,$filename) = $self->read_block(@_);     
     217    my ($block_status,$filename_full_path) = $self->read_block(@_);     
    216218    return $block_status if ((!defined $block_status) || ($block_status==0)); 
    217219 
    218220    ## check the doctype to see whether we really want to process the file 
    219     if (!$self->check_doctype($filename)) { 
     221    if (!$self->check_doctype($filename_full_path)) { 
    220222    # this file is not for us 
    221223    return undef; 
     
    225227    $self->{'base_dir'} = $base_dir; 
    226228    $self->{'file'} = $file; 
    227     $self->{'filename'} = $filename; 
     229    $self->{'filename'} = $filename_full_path; 
    228230    $self->{'processor'} = $processor; 
    229231    $self->{'metadata'} = $metadata; 
     
    233235    if (defined $xslt && ($xslt ne "")) { 
    234236        # perform xslt 
    235         my $transformed_xml = $self->apply_xslt($xslt,$filename); 
     237        my $transformed_xml = $self->apply_xslt($xslt,$filename_full_path); 
    236238 
    237239        # feed transformed file (now in memory as string) into XML parser 
     
    239241    } 
    240242    else { 
    241         $self->{'parser'}->parsefile($filename); 
     243        $self->{'parser'}->parsefile($filename_full_path); 
    242244    } 
    243245    }; 
     
    246248 
    247249    # parsefile may either croak somewhere in XML::Parser (e.g. because 
    248     # the document is not well formed) or die somewhere in XMLPlug or a 
     250    # the document is not well formed) or die somewhere in ReadXMLFile or a 
    249251    # derived plugin (e.g. because we're attempting to process a 
    250252    # document whose DOCTYPE is not meant for this plugin). For the 
     
    271273} 
    272274 
    273 # the following two methods are for if you want to do the parsing from a 
    274 # plugin that inherits from this. it seems that you can't call the parse  
    275 # methods directly. WHY??? 
    276 # 
    277 # [Stefan 27/5/07] These two methods may not be necessary any more as I've 
    278 # fixed XMLPlug so $self is no longer required to be a global variable 
    279 # (that was why inheritance wasn't working quite right with XMLPlug I 
    280 # think). I don't really know what other plugins rely on these methods 
    281 # though so have left them here for now. 
    282 sub parse_file { 
    283     my $self = shift (@_);  
    284     my ($filename) = @_; 
    285     $self->{'parser'}->parsefile($filename); 
    286 } 
    287  
    288 sub parse_string { 
    289     my $self = shift (@_);  
    290     my ($xml_string) = @_; 
    291     $self->{'parser'}->parse($xml_string); 
    292 } 
    293275 
    294276sub get_default_process_exp { 
     
    344326 
    345327    my ($expat, $name, $sysid, $pubid, $internal) = @_; 
    346     die "XMLPlug Cannot process XML document with DOCTYPE of $name"; 
     328    die "ReadXMLFile Cannot process XML document with DOCTYPE of $name"; 
    347329} 
    348330 
     
    395377    $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc"); 
    396378    $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'}, $self->{'processor'}->{'OIDmetadata'}); 
     379    $self->{'doc_obj'}->add_utf8_metadata($self->{'doc_obj'}->get_top_section(), "Plugin", "$self->{'plugin_type'}"); 
     380 
     381    # do we want other auto metadata here (see BasePlugin.read_into_doc_obj) 
    397382} 
    398383 
     
    400385    my $self = shift(@_); 
    401386    my $doc_obj = $self->{'doc_obj'}; 
     387 
     388    # do we want other auto stuff here, see BasePlugin.read_into_doc_obj 
     389 
    402390    # include any metadata passed in from previous plugins  
    403391    # note that this metadata is associated with the top level section 
     
    410398    
    411399    # add an OID 
    412     $doc_obj->set_OID(); 
     400    $self->add_OID(); 
    413401     
    414402    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); 
     
    419407     
    420408    $self->{'num_processed'} ++; 
     409    undef $self->{'doc_obj'}; 
     410    undef $doc_obj; # is this the same as above?? 
    421411} 
    422412 
  • gsdl/trunk/perllib/plugins/SplitTextFile.pm

    r15865 r15871  
    11########################################################################### 
    22# 
    3 # SplitPlug.pm - a plugin for splitting input files into segments that 
     3# SplitTextFile.pm - a plugin for splitting input files into segments that 
    44#                will then be individually processed. 
    55# 
     
    2929 
    3030 
    31 # SplitPlug is a plugin for splitting input files into segments that will 
     31# SplitTextFile is a plugin for splitting input files into segments that will 
    3232# then be individually processed.   
    3333 
     
    3535# process input files that contain several documents, you should write a 
    3636# plugin with a process function that will handle one of those documents 
    37 # and have it inherit from SplitPlug.  See ReferPlug for an example. 
    38  
    39  
    40 package SplitPlug; 
    41  
    42 use BasPlug; 
     37# and have it inherit from SplitTextFile.  See ReferPlug for an example. 
     38 
     39 
     40package SplitTextFile; 
     41 
     42use ReadTextFile; 
    4343use gsprintf 'gsprintf'; 
    4444use util; 
     
    4747no strict 'refs'; # allow filehandles to be variables and viceversa 
    4848 
    49 # SplitPlug is a sub-class of BasPlug. 
     49# SplitTextFile is a sub-class of BasPlug. 
    5050sub BEGIN { 
    51     @SplitPlug::ISA = ('BasPlug'); 
     51    @SplitTextFile::ISA = ('ReadTextFile'); 
    5252} 
    5353 
     
    5555my $arguments = 
    5656    [ { 'name' => "split_exp", 
    57     'desc' => "{SplitPlug.split_exp}", 
     57    'desc' => "{SplitTextFile.split_exp}", 
    5858    'type' => "regexp", 
    5959    #'deft' => &get_default_split_exp(), 
     
    6161    'reqd' => "no" } ]; 
    6262 
    63 my $options = { 'name'     => "SplitPlug", 
    64         'desc'     => "{SplitPlug.desc}", 
     63my $options = { 'name'     => "SplitTextFile", 
     64        'desc'     => "{SplitTextFile.desc}", 
    6565        'abstract' => "yes", 
    6666        'inherits' => "yes", 
     
    7373    push(@$pluginlist, $class); 
    7474 
    75     if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});} 
    76     if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)}; 
    77  
    78     my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists); 
     75    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments}); 
     76    push(@{$hashArgOptLists->{"OptList"}},$options); 
     77 
     78    my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists); 
    7979 
    8080    $self->{'textcat_store'} = {}; 
     
    8787    my ($verbosity, $outhandle, $failhandle) = @_; 
    8888 
    89     $self->BasPlug::init($verbosity, $outhandle, $failhandle); 
    90  
     89    $self->ReadTextFile::init($verbosity, $outhandle, $failhandle); 
     90 
     91    # why is this is init and not in new?? 
    9192    if ((!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) { 
    9293 
     
    119120    my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_; 
    120121 
     122    # returns 1 if matches process_exp, and has done blocking in the meantime 
    121123    my $matched = $self->SUPER::metadata_read($pluginfo, $base_dir, $file,  
    122124                          $metadata, $extrametakeys,  
     
    146148  
    147149    if ($text !~ /\w/) { 
    148         gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", 
     150        gsprintf($outhandle, "$plugin_name: {ReadTextFile.file_has_no_text}\n", 
    149151             $file) 
    150152        if $self->{'verbosity'}; 
     
    171173    } 
    172174 
    173     print $outhandle "SplitPlug found " . (scalar @segments) . " documents in $filename\n"  
     175    print $outhandle "SplitTextFile found " . (scalar @segments) . " documents in $filename\n"  
    174176        if $self->{'verbosity'}; 
    175177     
     
    231233    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding); 
    232234    my ($filemeta) = $file =~ /([^\\\/]+)$/; 
    233     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta)); 
     235    $self->set_Source_metadata($doc_obj, $filemeta, $encoding); 
    234236    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment"); 
    235237    if ($self->{'cover_image'}) {