Changeset 34840
- Timestamp: 2021-02-13T23:37:22+13:00 (3 years ago)
- File: 1 copied
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/SplitTextFile.pm--for-gs311
--- r34646
+++ r34840
@@ -114,4 +114,24 @@
 sub get_default_split_exp {
     return q^\n\s*\n^;
+}
+
+sub split_text_into_segments {
+    my $self = shift (@_);
+    my ($textref) = @_;
+
+
+    # Split the text into several smaller segments
+    my $split_exp = $self->{'split_exp'};
+    my @tmp = split(/$split_exp/i, $$textref);
+
+    my @segments =();
+    ## get rid of empty segments
+    foreach my $seg (@tmp){
+        if ($seg ne ""){
+            push @segments, $seg;
+        }
+    }
+
+    return \@segments;
 }
 
@@ -160,20 +180,30 @@
 
     # Split the text into several smaller segments
-    my $split_exp = $self->{'split_exp'};
-    my @tmp = split(/$split_exp/i, $text);
-    my @segments =();
-    ## get rid of empty segments
-    foreach my $seg (@tmp){
-        if ($seg ne ""){
-            push @segments, $seg;
-        }
-    }
-
-    print $outhandle "SplitTextFile found " . (scalar @segments) . " documents in $filename\n"
+#    my $split_exp = $self->{'split_exp'};
+#    my @tmp = split(/$split_exp/i, $text);
+#    my @segments =();
+#    ## get rid of empty segments
+#    foreach my $seg (@tmp){
+#        if ($seg ne ""){
+#            push @segments, $seg;
+#        }
+#    }
+#
+#    print $outhandle "SplitTextFile found " . (scalar @segments) . " documents in $filename\n"
+#        if $self->{'verbosity'};
+#
+#    $self->{'split_segments'}->{$file} = \@segments;
+#    return scalar(@segments);
+
+    my $segments = $self->split_text_into_segments(\$text);
+
+    my $num_segments = scalar(@$segments);
+
+    print $outhandle "SplitTextFile found $num_segments documents in $filename\n"
         if $self->{'verbosity'};
 
-    $self->{'split_segments'}->{$file} = \@segments;
-
-    return scalar(@segments);
+    $self->{'split_segments'}->{$file} = $segments;
+
+    return $num_segments;
 }
 
@@ -243,4 +273,8 @@
     #$doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "Split");
 
+    # include any metadata passed in from previous plugins
+    # note that this metadata is associated with the top level section
+    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
+
     # Calculate a "base" document ID.
     if (!defined $id) {
@@ -248,7 +282,7 @@
     }
 
-    # include any metadata passed in from previous plugins
-    # note that this metadata is associated with the top level section
-    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
+#    # include any metadata passed in from previous plugins
+#    # note that this metadata is associated with the top level section
+#    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
 
     # do plugin specific processing of doc_obj
Note: See TracChangeset for help on using the changeset viewer.