Changeset 9357
- Timestamp: 2005-03-10T12:14:13+13:00 (19 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/SplitPlug.pm
r8716 r9357 82 82 } 83 83 84 $self->{'textcat_store'} = {}; 85 $self->{'metapass_srcdoc'} = {}; # which segments have valid metadata_srcdoc 84 86 return bless $self, $class; 85 87 } … … 116 118 return q^\n\s*\n^; 117 119 } 120 121 sub metadata_read { 122 my $self = shift (@_); 123 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_; 124 125 my $matched = $self->SUPER::metadata_read($pluginfo, $base_dir, $file, 126 $metadata, $extrametakeys, 127 $extrametadata, $processor, 128 $maxdocs, $gli); 129 $split_matched = undef; 130 131 if ($matched) { 132 133 my $outhandle = $self->{'outhandle'}; 134 my $filename = &util::filename_cat($base_dir, $file); 135 136 my $plugin_name = ref ($self); 137 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up 138 139 $self->{'metapass_srcdoc'}->{$file} = {}; 140 141 # Do encoding stuff 142 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename); 143 my $le_rec = { 'language' => $language, 'encoding' => $encoding }; 144 $self->{'textcat_store'}->{$file} = $le_rec; 145 146 # Read in file ($text will be in utf8) 147 my $text = ""; 148 $self->read_file ($filename, $encoding, $language, \$text); 149 150 if ($text !~ /\w/) { 151 gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", 152 $file) 153 if $self->{'verbosity'}; 154 155 my $failhandle = $self->{'failhandle'}; 156 print $failhandle "$file: " . ref($self) . ": file contains no text\n"; 157 $self->{'num_not_processed'} ++; 158 159 $self->{'textcat_store'}->{$file} = undef; 160 161 return 0; 162 } 163 164 165 # Split the text into several smaller segments 166 my $split_exp = $self->{'split_exp'}; 167 my @segments = split(/$split_exp/, $text); 168 print $outhandle "SplitPlug found " . (scalar @segments) . 
" documents in $filename\n" 169 if $self->{'verbosity'}; 170 171 $self->{'split_segments'} = \@segments; 172 $split_matched = scalar(@segments); 173 } 174 175 return $split_matched; 176 } 177 118 178 119 179 … … 136 196 return undef; 137 197 } 138 my $plugin_name = ref ($self);139 198 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up 140 199 141 # Do encoding stuff 142 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename); 143 144 # Read in file ($text will be in utf8) 145 my $text = ""; 146 $self->read_file ($filename, $encoding, $language, \$text); 147 148 if ($text !~ /\w/) { 149 my $outhandle = $self->{'outhandle'}; 150 gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", 151 $file) 152 if $self->{'verbosity'}; 153 154 my $failhandle = $self->{'failhandle'}; 155 print $failhandle "$file: " . ref($self) . ": file contains no text\n"; 156 $self->{'num_not_processed'} ++; 157 200 my $le_rec = $self->{'textcat_store'}->{$file}; 201 if (!defined $le_rec) { 202 # means no text was found; 158 203 return 0; # not processed but no point in passing it on 159 204 } 160 161 162 # Split the text into several smaller segments 163 my $split_exp = $self->{'split_exp'}; 164 my @segments = split(/$split_exp/, $text); 165 print $outhandle "SplitPlug found " . (scalar @segments) . 
" documents in $filename\n" 166 if $self->{'verbosity'}; 205 206 my $language = $le_rec->{'language'}; 207 my $encoding = $le_rec->{'encoding'}; 208 $self->{'textcat_store'}->{$file} = undef; 209 210 my $segments = $self->{'split_segments'}; 211 212 $self->{'split_segments'} = undef; 167 213 168 214 # Process each segment in turn … … 170 216 $segment = 0; 171 217 $count = 0; 172 foreach $segtext (@ segments) {218 foreach $segtext (@$segments) { 173 219 $segment++; 220 221 if (defined $self->{'metapass_srcdoc'}->{$file}->{$segment}) { 222 # metadata is attached to a srcdoc 223 next; 224 } 174 225 175 226 # create a new document … … 223 274 } 224 275 276 delete $self->{'metapass_srcdoc'}->{$file}; 277 225 278 # Return number of document objects produced 226 279 return $count;
Note: See TracChangeset for help on using the changeset viewer.