Context Navigation

← Previous Changeset
Next Changeset →

Changeset 15871

Timestamp:

2008-06-05T09:26:56+12:00 (16 years ago)

Author:

kjdon

Message:

plugin overhaul: SplitPlug renamed to SplitTextFile, XMLPlug renamed to ReadXMLFile, ConvertToPlug renamed to ConvertBinaryFile. With the exception of BasePlugin, only 'real' plugins (top level ones) are named xxPlugin.

Location:

gsdl/trunk/perllib/plugins

Files:

: 3 edited

ConvertBinaryFile.pm (modified) (17 diffs)
ReadXMLFile.pm (modified) (18 diffs)
SplitTextFile.pm (modified) (12 diffs)

Legend:

: Unmodified
: Added
: Removed

gsdl/trunk/perllib/plugins/ConvertBinaryFile.pm

-              r15865
+              r15871
 ###########################################################################
+#
 # ConvertToPlug.pm -- plugin that inherits from BasPlug
+# ConvertBinaryFile.pm -- plugin that inherits from BasePlugin
+#
 # A component of the Greenstone digital library software
 …
 # This plugin is inherited by such plugins as WordPlug, PPTPlug, PSPlug,
 # RTFPlug and PDFPlug. It facilitates the conversion of these document types
 # to either HTML, TEXT or a series of images. It works by dynamically loading
+# to either HTML, Text or a series of images. It works by dynamically loading
 # an appropriate secondary plugin (HTMLPlug, StructuredHTMLPlug,
 # PagedImgPlug or TEXTPlug) based on the plugin argument 'convert_to'.
 package ConvertToPlug;
 use BasPlug;
+# PagedImagePlugin or TextPlugin) based on the plugin argument 'convert_to'.
+package ConvertBinaryFile;
+use BasePlugin;
 use ghtml;
 use HTMLPlug;
 use TEXTPlug;
 use PagedImgPlug;
+use HTMLPlugin;
+use TextPlugin;
+use PagedImagePlugin;
 use strict;
 no strict 'refs'; # allow filehandles to be variables and viceversa
 no strict 'subs';
 sub BEGIN {
     @ConvertToPlug::ISA = ('BasPlug');
+    @ConvertBinaryFile::ISA = ('BasePlugin');
+}
 my $convert_to_list =
     [ { 'name' => "auto",
     'desc' => "{ConvertToPlug.convert_to.auto}" },
+    'desc' => "{ConvertBinaryFile.convert_to.auto}" },
       { 'name' => "html",
     'desc' => "{ConvertToPlug.convert_to.html}" },
+    'desc' => "{ConvertBinaryFile.convert_to.html}" },
       { 'name' => "text",
     'desc' => "{ConvertToPlug.convert_to.text}" }
+    'desc' => "{ConvertBinaryFile.convert_to.text}" }
       ];
 my $arguments =
     [ { 'name' => "convert_to",
     'desc' => "{ConvertToPlug.convert_to}",
+    'desc' => "{ConvertBinaryFile.convert_to}",
     'type' => "enum",
     'reqd' => "yes",
 …
     'deft' => "auto" },
       { 'name' => "keep_original_filename",
     'desc' => "{ConvertToPlug.keep_original_filename}",
+    'desc' => "{ConvertBinaryFile.keep_original_filename}",
     'type' => "flag" },
       { 'name' => "title_sub",
 …
     'deft' => "" },
       { 'name' => "apply_fribidi",
     'desc' => "{ConvertToPlug.apply_fribidi}",
+    'desc' => "{ConvertBinaryFile.apply_fribidi}",
     'type' => "flag",
     'reqd' => "no" },
       { 'name' => "use_strings",
     'desc' => "{ConvertToPlug.use_strings}",
+    'desc' => "{ConvertBinaryFile.use_strings}",
     'type' => "flag",
     'reqd' => "no" },
+      { 'name' => "extract_keyphrases",
+    'desc' => "{BasPlug.extract_keyphrases}",
+    'type' => "flag",
+    'reqd' => "no",
+    'hiddengli' => "yes" },
+      { 'name' => "extract_keyphrase_options",
+    'desc' => "{BasPlug.extract_keyphrase_options}",
+    'type' => "string",
+    'reqd' => "no",
+    'hiddengli' => "yes" } ];
+my $options = { 'name'     => "ConvertToPlug",
+        'desc'     => "{ConvertToPlug.desc}",
+#      { 'name' => "extract_keyphrases",
+#   'desc' => "{BasPlug.extract_keyphrases}",
+#   'type' => "flag",
+#   'reqd' => "no",
+#   'hiddengli' => "yes" },
+#      { 'name' => "extract_keyphrase_options",
+#   'desc' => "{BasPlug.extract_keyphrase_options}",
+#   'type' => "string",
+#   'reqd' => "no",
+#   'hiddengli' => "yes" }
+      ];
+my $options = { 'name'     => "ConvertBinaryFile",
+        'desc'     => "{ConvertBinaryFile.desc}",
         'abstract' => "yes",
         'inherits' => "yes",
 …
     foreach my $convert_to (@convert_to_list) {
     # load in "convert_to" plugin package
     my $plugin_class = $convert_to."Plug";
+    my $plugin_class = $convert_to."Plugin";
     my $plugin_package = $plugin_class.".pm";
 …
     push(@$pluginlist, $class);
     my $classPluginName = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
     if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
     if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
     my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);
+    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
+    push(@{$hashArgOptLists->{"OptList"}},$options);
+    my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
     if ($self->{'info_only'}) {
 …
     my $windows_scripting = $self->{'windows_scripting'};
     $windows_scripting = 0 unless defined $windows_scripting;
     if ($classPluginName eq "PDFPlug") {
+    if ($classPluginName eq "PDFPlugin") {
     if ($convert_to_type eq "text" &&
         $ENV{'GSDLOS'} =~ /^windows$/i) {
 …
         $convert_to_type = "html";
+    }
     } elsif ($classPluginName eq "WordPlug") {
+    } elsif ($classPluginName eq "WordPlugin") {
     if ($windows_scripting && $ENV{'GSDLOS'} =~ /^windows$/i && $convert_to_type =~ /^(html|auto)$/) {
         # we use structured HTML, not normal html
         $convert_to_type = "structuredhtml";
+    }
     } elsif ($classPluginName eq "PPTPlug") {
+    } elsif ($classPluginName eq "PPTPlugin") {
     if ($windows_scripting && $ENV{'GSDLOS'} =~ /^windows$/i && $convert_to_type eq "auto") {
         # we use paged img
         $convert_to_type = "pagedimg_jpg";
+    }
     } elsif ($classPluginName eq "PSPlug") {
+    } elsif ($classPluginName eq "PSPlugin") {
     if ($convert_to_type eq "auto") {
         # we use text
 …
     $self->{'convert_to_ext'} = "html";
     } elsif ($convert_to_type eq "text") {
     $self->{'convert_to'} = "TEXT";
+    $self->{'convert_to'} = "Text";
     $self->{'convert_to_ext'} = "txt";
     } elsif ($convert_to_type eq "structuredhtml") {
 …
     $self->{'convert_to_ext'} = "html";
     } elsif ($convert_to_type =~ /^pagedimg/) {
     $self->{'convert_to'} = "PagedImg";
+    $self->{'convert_to'} = "PagedImage";
     my ($convert_to_ext) = $convert_to_type =~ /pagedimg\_(jpg|gif|png)/i;
     $convert_to_ext = 'jpg' unless defined $convert_to_ext;
 …
     # making sure the converter gives us the appropriate output type
     my $output_type="";
     if ($convert_to =~ m/PagedImg/i) {
+    if ($convert_to =~ m/PagedImage/i) {
     $output_type = lc($convert_to)."_".lc($convert_to_ext);
     } else {
 …
     $self->{'converted_to'} = "HTML";
     } elsif ($output_type =~ /te?xt/i) {
     $self->{'converted_to'} = "TEXT";
+    $self->{'converted_to'} = "Text";
     } elsif ($output_type =~ /item/i){
     $self->{'converted_to'} = "PagedImg";
+    $self->{'converted_to'} = "PagedImage";
+    }
 …
+# Override BasPlug read
+# We don't want to get language encoding stuff until after we've converted
+# our file to either TEXT or HTML or PagedImage.
+sub read {
+# Override BasePlugin read_into_doc_obj - we need to call secondary plugin stuff
+sub read_into_doc_obj {
     my $self = shift (@_);
     my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
     my $outhandle = $self->{'outhandle'};
+    my ($block_status,$filename) = $self->read_block(@_);
+    return $block_status if ((!defined $block_status) || ($block_status==0));
+    $file = $self->read_tidy_file($file);
+    my ($filename_full_path, $filename_no_path) = $self->get_full_filenames($base_dir, $file);
     my $output_ext = $self->{'convert_to_ext'};
     my $conv_filename = "";
     $conv_filename = $self->tmp_area_convert_file($output_ext, $filename);
+    $conv_filename = $self->tmp_area_convert_file($output_ext, $filename_full_path);
     if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
 …
     # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file
     # Added for fixing up Persian PDFs after being processed by pdftohtml, but may be useful in other cases too
     if ($self->{'apply_fribidi'} && $self->{'converted_to'} =~ /(HTML|TEXT)/) {
+    if ($self->{'apply_fribidi'} && $self->{'converted_to'} =~ /(HTML|Text)/) {
     my $fribidi_command = "fribidi \"$conv_filename\" >\"${conv_filename}.tmp\"";
     if (system($fribidi_command) != 0) {
 …
     # note: metadata is not carried on to the next level
     my ($rv,$doc_obj)
+    = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename,
+                        $metadata, $processor, $maxdocs, $total_count,
+                        $gli);
+    = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename, $metadata, $processor, $maxdocs, $total_count, $gli);
     if ((!defined $rv) || ($rv<1)) {
 …
     # Override previous gsdlsourcefilename set by secondary plugin
     my $collect_file = &util::filename_within_collection($filename);
+    my $collect_file = &util::filename_within_collection($filename_full_path);
     my $collect_conv_file = &util::filename_within_collection($conv_filename);
     $doc_obj->set_source_filename ($collect_file);
     $doc_obj->set_converted_filename($collect_conv_file);
     my ($filemeta) = $file =~ /([^\\\/]+)$/;
     $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
+    $self->set_Source_metadata($doc_obj, $filename_no_path);
     $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
+    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename));
+    if ($self->{'cover_image'}) {
+    $self->associate_cover_image($doc_obj, $filename);
+    }
+    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));
     # do plugin specific processing of doc_obj
     unless (defined ($self->process(undef, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
+    unless (defined ($self->process($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
     print STDERR "<ProcessingError n='$file'>\n" if ($gli);
     return -1;
+    }
+    my $topsection = $doc_obj->get_top_section();
+    $self->add_associated_files($doc_obj, $filename_full_path);
+    $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
     # do any automatic metadata extraction
     $self->auto_extract_metadata ($doc_obj);
     # have we found a Title??
+    $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$filemeta);
+#    # add an OID
+#    $doc_obj->set_OID();
+    # add an OID
+    # see if there is a plugin-specific set_OID function...
+    if (defined ($self->can('set_OID'))) {
+    # it will need $doc_obj to set the Identifier metadata...
+    $self->set_OID($doc_obj);
+    } else {
+    # use the default set_OID() in doc.pm
+    $doc_obj->set_OID();
+    }
+    # process the document
+    $processor->process($doc_obj);
+    $self->{'num_processed'} ++;
+    return 1;
+}
+    $self->title_fallback($doc_obj,$topsection,$filename_no_path);
+    $self->add_OID($doc_obj);
+    return (1, $doc_obj);
+}
+sub process {
+    my $self = shift (@_);
+    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
+    return $self->process_type($base_dir, $file, $doc_obj);
+}
 # do plugin specific processing of doc_obj for doc_ext type
 sub process_type {
     my $self = shift (@_);
+    my ($doc_ext, $base_dir, $file, $doc_obj) = @_;
+    my ($base_dir, $file, $doc_obj) = @_;
+    # need to check that not empty
+    my $doc_ext = $self->{'filename_extension'};
+    my $file_type = "unknown";
+    $file_type = $self->{'file_type'} if defined $self->{'file_type'};
     # associate original file with doc object
 …
     $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);
-    my $file_type;
-    if ($doc_ext eq "doc") {
-        $file_type = "Word";
-    } elsif ($doc_ext eq "xls") {
-    $file_type = "Excel";
-    } elsif ($doc_ext eq "ppt") {
-    $file_type = "PPT";
-    } elsif ($doc_ext eq "pdf") {
-    $file_type = "PDF";
-    } elsif ($doc_ext eq "rtf") {
-    $file_type = "RTF";
-    } elsif ($doc_ext eq "ps") {
-    $file_type = "PS";
+    }
-    my $file_format = $file_type || "unknown";
     # We use set instead of add here because we only want one value
     $doc_obj->set_utf8_metadata_element($cursection, "FileFormat", $file_format);
+    $doc_obj->set_utf8_metadata_element($cursection, "FileFormat", $file_type);
     my $doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/[archivedir]/doc.$doc_ext\">";
     if ($self->{'keep_original_filename'} == 1) {

gsdl/trunk/perllib/plugins/ReadXMLFile.pm

-              r15865
+              r15871
 ###########################################################################
+#
 # XMLPlug.pm -- base class for XML plugins
+# ReadXMLFile.pm -- base class for XML plugins
 # A component of the Greenstone digital library software
 # from the New Zealand Digital Library Project at the
 …
 ###########################################################################
 package XMLPlug;
 use BasPlug;
+package ReadXMLFile;
+use BasePlugin;
 use doc;
 use strict;
 …
 sub BEGIN {
     @XMLPlug::ISA = ('BasPlug');
+    @ReadXMLFile::ISA = ('BasePlugin');
     unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
+}
 …
 my $arguments =
     [ { 'name' => "process_exp",
     'desc' => "{BasPlug.process_exp}",
+    'desc' => "{BasePlugin.process_exp}",
     'type' => "regexp",
     'deft' => &get_default_process_exp(),
     'reqd' => "no" },
       { 'name' => "xslt",
     'desc' => "{XMLPlug.xslt}",
+    'desc' => "{ReadXMLFile.xslt}",
     'type' => "string",
     'deft' => "",
     'reqd' => "no" } ];
 my $options = { 'name'     => "XMLPlug",
         'desc'     => "{XMLPlug.desc}",
+my $options = { 'name'     => "ReadXMLFile",
+        'desc'     => "{ReadXMLFile.desc}",
         'abstract' => "yes",
         'inherits' => "yes",
 …
     push(@$pluginlist, $class);
+    if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
+    if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
+    # $self is global for use within subroutines called by XML::Parser
+    my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);
+    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
+    push(@{$hashArgOptLists->{"OptList"}},$options);
+    my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
     if ($self->{'info_only'}) {
+    # don't worry about any options etc
+    # don't worry about creating the XML parser as all we want is the
+    # list of plugin options
     return bless $self, $class;
+    }
     my $parser = new XML::Parser('Style' => 'Stream',
                                  'Pkg' => 'XMLPlug',
+                                 'Pkg' => 'ReadXMLFile',
                                  'PluginObj' => $self,
                  'Handlers' => {'Char' => \&Char,
 …
     if (defined $result) {
     # we think we are processing this, but check that we actually are
     my $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
+    my $filename = $self->get_full_filename($base_dir, $file);
     if ($self->check_doctype($filename)) {
 …
+}
+# we need to implement read because we are not just using process_exp to determine
+# whether to process this file or not.
 sub read {
     my $self = shift (@_);
 …
     # Make sure we're processing the correct file, do blocking etc
     my ($block_status,$filename) = $self->read_block(@_);
+    my ($block_status,$filename_full_path) = $self->read_block(@_);
     return $block_status if ((!defined $block_status) || ($block_status==0));
     ## check the doctype to see whether we really want to process the file
     if (!$self->check_doctype($filename)) {
+    if (!$self->check_doctype($filename_full_path)) {
     # this file is not for us
     return undef;
 …
     $self->{'base_dir'} = $base_dir;
     $self->{'file'} = $file;
     $self->{'filename'} = $filename;
+    $self->{'filename'} = $filename_full_path;
     $self->{'processor'} = $processor;
     $self->{'metadata'} = $metadata;
 …
     if (defined $xslt && ($xslt ne "")) {
         # perform xslt
         my $transformed_xml = $self->apply_xslt($xslt,$filename);
+        my $transformed_xml = $self->apply_xslt($xslt,$filename_full_path);
         # feed transformed file (now in memory as string) into XML parser
 …
+    }
     else {
         $self->{'parser'}->parsefile($filename);
+        $self->{'parser'}->parsefile($filename_full_path);
+    }
     };
 …
     # parsefile may either croak somewhere in XML::Parser (e.g. because
     # the document is not well formed) or die somewhere in XMLPlug or a
+    # the document is not well formed) or die somewhere in ReadXMLFile or a
     # derived plugin (e.g. because we're attempting to process a
     # document whose DOCTYPE is not meant for this plugin). For the
 …
+}
-# the following two methods are for if you want to do the parsing from a
-# plugin that inherits from this. it seems that you can't call the parse
-# methods directly. WHY???
+#
-# [Stefan 27/5/07] These two methods may not be necessary any more as I've
-# fixed XMLPlug so $self is no longer required to be a global variable
-# (that was why inheritance wasn't working quite right with XMLPlug I
-# think). I don't really know what other plugins rely on these methods
-# though so have left them here for now.
-sub parse_file {
-    my $self = shift (@_);
-    my ($filename) = @_;
-    $self->{'parser'}->parsefile($filename);
+}
-sub parse_string {
-    my $self = shift (@_);
-    my ($xml_string) = @_;
-    $self->{'parser'}->parse($xml_string);
+}
 sub get_default_process_exp {
 …
     my ($expat, $name, $sysid, $pubid, $internal) = @_;
     die "XMLPlug Cannot process XML document with DOCTYPE of $name";
+    die "ReadXMLFile Cannot process XML document with DOCTYPE of $name";
+}
 …
     $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc");
     $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'}, $self->{'processor'}->{'OIDmetadata'});
+    $self->{'doc_obj'}->add_utf8_metadata($self->{'doc_obj'}->get_top_section(), "Plugin", "$self->{'plugin_type'}");
+    # do we want other auto metadata here (see BasePlugin.read_into_doc_obj)
+}
 …
     my $self = shift(@_);
     my $doc_obj = $self->{'doc_obj'};
+    # do we want other auto stuff here, see BasePlugin.read_into_doc_obj
     # include any metadata passed in from previous plugins
     # note that this metadata is associated with the top level section
 …
     # add an OID
     $doc_obj->set_OID();
+    $self->add_OID();
     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
 …
     $self->{'num_processed'} ++;
+    undef $self->{'doc_obj'};
+    undef $doc_obj; # is this the same as above??
+}

gsdl/trunk/perllib/plugins/SplitTextFile.pm

-              r15865
+              r15871
 ###########################################################################
+#
 # SplitPlug.pm - a plugin for splitting input files into segments that
+# SplitTextFile.pm - a plugin for splitting input files into segments that
 #                will then be individually processed.
+#
 …
 # SplitPlug is a plugin for splitting input files into segments that will
+# SplitTextFile is a plugin for splitting input files into segments that will
 # then be individually processed.
 …
 # process input files that contain several documents, you should write a
 # plugin with a process function that will handle one of those documents
 # and have it inherit from SplitPlug.  See ReferPlug for an example.
 package SplitPlug;
 use BasPlug;
+# and have it inherit from SplitTextFile.  See ReferPlug for an example.
+package SplitTextFile;
+use ReadTextFile;
 use gsprintf 'gsprintf';
 use util;
 …
 no strict 'refs'; # allow filehandles to be variables and viceversa
 # SplitPlug is a sub-class of BasPlug.
+# SplitTextFile is a sub-class of ReadTextFile.
 sub BEGIN {
     @SplitPlug::ISA = ('BasPlug');
+    @SplitTextFile::ISA = ('ReadTextFile');
+}
 …
 my $arguments =
     [ { 'name' => "split_exp",
     'desc' => "{SplitPlug.split_exp}",
+    'desc' => "{SplitTextFile.split_exp}",
     'type' => "regexp",
     #'deft' => &get_default_split_exp(),
 …
     'reqd' => "no" } ];
 my $options = { 'name'     => "SplitPlug",
         'desc'     => "{SplitPlug.desc}",
+my $options = { 'name'     => "SplitTextFile",
+        'desc'     => "{SplitTextFile.desc}",
         'abstract' => "yes",
         'inherits' => "yes",
 …
     push(@$pluginlist, $class);
     if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
     if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
     my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);
+    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
+    push(@{$hashArgOptLists->{"OptList"}},$options);
+    my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
     $self->{'textcat_store'} = {};
 …
     my ($verbosity, $outhandle, $failhandle) = @_;
+    $self->BasPlug::init($verbosity, $outhandle, $failhandle);
+    $self->ReadTextFile::init($verbosity, $outhandle, $failhandle);
+    # why is this in init and not in new??
     if ((!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
 …
     my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
+    # returns 1 if matches process_exp, and has done blocking in the meantime
     my $matched = $self->SUPER::metadata_read($pluginfo, $base_dir, $file,
                           $metadata, $extrametakeys,
 …
     if ($text !~ /\w/) {
         gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n",
+        gsprintf($outhandle, "$plugin_name: {ReadTextFile.file_has_no_text}\n",
              $file)
         if $self->{'verbosity'};
 …
+    }
     print $outhandle "SplitPlug found " . (scalar @segments) . " documents in $filename\n"
+    print $outhandle "SplitTextFile found " . (scalar @segments) . " documents in $filename\n"
         if $self->{'verbosity'};
 …
     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
     my ($filemeta) = $file =~ /([^\\\/]+)$/;
     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
+    $self->set_Source_metadata($doc_obj, $filemeta, $encoding);
     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");
     if ($self->{'cover_image'}) {

Note: See TracChangeset for help on using the changeset viewer.

Download in other formats: