Changeset 9357


Ignore:
Timestamp:
2005-03-10T12:14:13+13:00 (19 years ago)
Author:
davidb
Message:

SplitPlug enhanced to support the two-pass algorithm: metadata_read, then read.
This is largely done with the support of MetadataPass.pm, a new "plugin" module
that contains much of the necessary code.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/SplitPlug.pm

    r8716 r9357  
    8282    }
    8383
     84    $self->{'textcat_store'} = {};
     85    $self->{'metapass_srcdoc'} = {}; # which segments have valid metadata_srcdoc
    8486    return bless $self, $class;
    8587}
     
    116118    return q^\n\s*\n^;
    117119}
     120
     121sub metadata_read {
     122    my $self = shift (@_); 
     123    my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
     124
     125    my $matched = $self->SUPER::metadata_read($pluginfo, $base_dir, $file,
     126                          $metadata, $extrametakeys,
     127                          $extrametadata, $processor,
     128                          $maxdocs, $gli);
     129    $split_matched = undef;
     130
     131    if ($matched) {
     132
     133    my $outhandle = $self->{'outhandle'};
     134    my $filename = &util::filename_cat($base_dir, $file);
     135
     136    my $plugin_name = ref ($self);
     137    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
     138
     139    $self->{'metapass_srcdoc'}->{$file} = {};
     140
     141    # Do encoding stuff
     142    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
     143    my $le_rec = { 'language' => $language, 'encoding' => $encoding };
     144    $self->{'textcat_store'}->{$file} = $le_rec;
     145
     146    # Read in file ($text will be in utf8)
     147    my $text = "";
     148    $self->read_file ($filename, $encoding, $language, \$text);
     149
     150    if ($text !~ /\w/) {
     151        gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n",
     152             $file)
     153        if $self->{'verbosity'};
     154       
     155        my $failhandle = $self->{'failhandle'};
     156        print $failhandle "$file: " . ref($self) . ": file contains no text\n";
     157        $self->{'num_not_processed'} ++;
     158
     159        $self->{'textcat_store'}->{$file} = undef;
     160
     161        return 0;
     162    }
     163   
     164   
     165    # Split the text into several smaller segments
     166    my $split_exp = $self->{'split_exp'};
     167    my @segments = split(/$split_exp/, $text);
     168    print $outhandle "SplitPlug found " . (scalar @segments) . " documents in $filename\n"
     169        if $self->{'verbosity'};
     170   
     171    $self->{'split_segments'} = \@segments;
     172    $split_matched = scalar(@segments);
     173    }
     174   
     175    return $split_matched;
     176}
     177
    118178
    119179
     
    136196    return undef;
    137197    }
    138     my $plugin_name = ref ($self);
    139198    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
    140199
    141     # Do encoding stuff
    142     my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
    143 
    144     # Read in file ($text will be in utf8)
    145     my $text = "";
    146     $self->read_file ($filename, $encoding, $language, \$text);
    147 
    148     if ($text !~ /\w/) {
    149     my $outhandle = $self->{'outhandle'};
    150     gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n",
    151             $file)
    152         if $self->{'verbosity'};
    153 
    154     my $failhandle = $self->{'failhandle'};
    155     print $failhandle "$file: " . ref($self) . ": file contains no text\n";
    156     $self->{'num_not_processed'} ++;
    157 
     200    my $le_rec = $self->{'textcat_store'}->{$file};
     201    if (!defined $le_rec) {
     202    # means no text was found;
    158203    return 0; # not processed but no point in passing it on
    159204    }
    160    
    161    
    162     # Split the text into several smaller segments
    163     my $split_exp = $self->{'split_exp'};
    164     my @segments = split(/$split_exp/, $text);
    165     print $outhandle "SplitPlug found " . (scalar @segments) . " documents in $filename\n"
    166     if $self->{'verbosity'};
     205
     206    my $language = $le_rec->{'language'};
     207    my $encoding = $le_rec->{'encoding'};
     208    $self->{'textcat_store'}->{$file} = undef;
     209
     210    my $segments = $self->{'split_segments'};
     211
     212    $self->{'split_segments'} = undef;
    167213
    168214    # Process each segment in turn
     
    170216    $segment = 0;
    171217    $count = 0;
    172     foreach $segtext (@segments) {
     218    foreach $segtext (@$segments) {
    173219    $segment++;
     220
     221    if (defined $self->{'metapass_srcdoc'}->{$file}->{$segment}) {
     222        # metadata is attached to a srcdoc
     223        next;
     224    }
    174225
    175226    # create a new document
     
    223274    }
    224275
     276    delete $self->{'metapass_srcdoc'}->{$file};
     277
    225278    # Return number of document objects produced
    226279    return $count;
Note: See TracChangeset for help on using the changeset viewer.