Changeset 36482
- Timestamp:
- 2022-08-25T11:23:53+12:00 (20 months ago)
- Location:
- main/trunk/greenstone2/perllib
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/CSVPlugin.pm
r36481 r36482 57 57 'type' => "string", 58 58 'reqd' => "no", 59 'deft' => "Filename" } 60 ]; 59 'deft' => "Filename" }, 60 { 'name' => "store_raw_values_as_text", 61 'desc' => "{CSVPlugin.store_raw_values_as_text}", 62 'type' => "flag", 63 'reqd' => "no"}, 64 { 'name' => "no_document_if_source_unspecified", 65 'desc' => "{CSVPlugin.no_document_if_source_unspecified}", 66 'type' => "flag", 67 'reqd' => "no"}, 68 { 'name' => "no_document_if_source_missing", 69 'desc' => "{CSVPlugin.no_document_if_source_missing}", 70 'type' => "flag", 71 'reqd' => "no"}, 72 { 'name' => "store_field_values_as_document_text", 73 'desc' => "{CSVPlugin.store_field_values_as_document_text}", 74 'type' => "flag", 75 'reqd' => "no"}, 76 77 78 ]; 61 79 62 80 … … 186 204 my $found_filename_field = 0; 187 205 my $filename_field = $self->{'filename_field'}; 188 print STDERR "looking for $filename_field field\n";189 206 for (my $i = 0; $i < scalar(@csv_file_fields); $i++) { 190 207 # Remove any spaces from the field names, and surrounding quotes too … … 200 217 201 218 if (!$found_filename_field) { 202 $self->print_ error($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field field in CSV file, metadata cannot be assigned to documents, will use metadata only dummy documents");219 $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field field in CSV file, metadata cannot be assigned to documents, will use metadata only dummy documents"); 203 220 204 221 } … … 216 233 for (my $i=0; $i<$md_vals_len; $i++) { 217 234 my $md_val = $md_vals[$i]; 218 print STDERR "$count: md val = $md_val\n";219 235 # Only bother with non-empty values 220 236 if ($md_val ne "" && defined($csv_file_fields[$i])) { … … 266 282 # We can't associate any metadata without knowing the file to associate it with 267 283 my $has_srcdoc = 0; 284 my $missing_srcdoc = 0; 268 285 my $csv_line_filename="";; 269 286 if ($found_filename_field) { … … 271 288 my $csv_line_filename_array 
= $csv_line_metadata{$filename_field}; 272 289 if (!defined $csv_line_filename_array) { 273 $self->print_ error($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field metadata in CSV line num $count");290 $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field metadata in CSV line num $count"); 274 291 } else { 275 292 $csv_line_filename = shift(@$csv_line_filename_array); … … 279 296 280 297 delete $csv_line_metadata{$filename_field}; 298 } else { 299 $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "$csv_line_filename in $filename_field metadata in CSV line num $count is not found"); 300 $missing_srcdoc = 1; # there was one mentioned but its not found 281 301 } 282 302 } … … 284 304 } 285 305 if ($has_srcdoc) { 286 print STDERR "storing meta in extra meta for $csv_line_filename\n";306 print $outhandle "Storing metadata, segment $count, for document $csv_line_filename\n" if ($verbosity > 2); 287 307 $self->store_meta_in_extrametadata($csv_line_filename, \%csv_line_metadata, $file, $filename_full_path, $extrametakeys, $extrametadata, $extrametafile); 288 308 } else { 289 print STDERR "storing meta for $count, $csv_line_filename\n"; 290 $metadata_store->{$count} = \%csv_line_metadata; 309 my $store_for_dummy = 1; 310 if ($missing_srcdoc && $self->{'no_document_if_source_missing'}) { 311 $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path,"Not storing metadata for line $count as source doc is missing"); 312 $store_for_dummy = 0; 313 } elsif(!$missing_srcdoc && $self->{'no_document_if_source_unspecified'}) { 314 $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path,"Not storing metadata for line $count as source doc is unspecified"); 315 $store_for_dummy = 0; 316 } 317 if ($store_for_dummy) { 318 319 print $outhandle "Storing metadata for dummy document, segment $count\n" if ($verbosity > 2); 320 $metadata_store->{$count} = \%csv_line_metadata; 321 } 291 
322 } 292 323 } # while csv_line = csv->getline … … 333 364 334 365 my $id; 335 print STDERR "num keys = ".scalar(keys (%$metadata_store))."\n"; 336 foreach $segment (sort keys (%$metadata_store)) {337 print $outhandle "processing segment $segment \n"366 367 foreach $segment (sort { $a <=> $b } keys (%$metadata_store)) { 368 print $outhandle "processing segment $segment as its own document\n" 338 369 if $self->{'verbosity'} > 1; 339 370 $count++; 340 371 # create a new document 341 372 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'}); 342 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language); 343 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding); 373 my $cursection = $doc_obj->get_top_section(); 374 $doc_obj->add_utf8_metadata($cursection, "Language", $language); 375 $doc_obj->add_utf8_metadata($cursection, "Encoding", $encoding); 344 376 345 377 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding); 346 378 347 $doc_obj->add_utf8_metadata($ doc_obj->get_top_section(), "SourceSegment", "$segment");379 $doc_obj->add_utf8_metadata($cursection, "SourceSegment", "$segment"); 348 380 if ($self->{'cover_image'}) { 349 381 $self->associate_cover_image($doc_obj, $filename_full_path); 350 382 } 351 $doc_obj->add_utf8_metadata($ doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");383 $doc_obj->add_utf8_metadata($cursection, "Plugin", "$self->{'plugin_type'}"); 352 384 353 385 # include any metadata passed in from previous plugins 354 386 # note that this metadata is associated with the top level section 355 $self->extra_metadata ($doc_obj, $ doc_obj->get_top_section(), $metadata);387 $self->extra_metadata ($doc_obj, $cursection, $metadata); 356 388 357 389 # add our stored metadata from metadata_read pass 358 390 my $segment_metadata = $metadata_store->{$segment}; 359 $self->extra_metadata($doc_obj, $doc_obj->get_top_section(), 
$segment_metadata); 360 391 $self->extra_metadata($doc_obj, $cursection, $segment_metadata); 392 if ($self->{'store_field_values_as_document_text'}) { 393 my $new_text = ""; 394 foreach my $f (keys %$segment_metadata) { 395 my $values = $segment_metadata->{$f}; 396 $new_text .= join (", ", @$values).", "; 397 } 398 399 $doc_obj->add_utf8_text($cursection, $new_text); 400 } 361 401 # do any automatic metadata extraction - does this make sense?? 362 402 #$self->auto_extract_metadata ($doc_obj); … … 385 425 } 386 426 427 sub print_warning { 428 my $self = shift(@_); 429 my ($outhandle, $failhandle, $gli, $file, $error) = @_; 430 431 print $outhandle "CSVPlugin Warning: $file: $error\n"; 432 print $failhandle "CSVPlugin Warning: $file: $error\n"; 433 print STDERR "<ProcessingError n='$file' r='$error'/>\n" if ($gli); 434 435 } 387 436 sub print_error 388 437 { -
main/trunk/greenstone2/perllib/strings.properties
r36481 r36482 884 884 CSVPlugin.desc:A plugin for files in comma-separated value format. Metadata can be assigned to source documents (specified in the Filename field), or new documents created for each line of the file. 885 885 886 CSVPlugin.filename_field:Which field in the CSV file to use for specifying source documents. 887 888 CSVPlugin.store_field_values_as_document_text:Store all the metadata values as the text of the document. Only applies if there is no source document specified. Useful for searching. 889 890 CSVPlugin.no_document_if_source_unspecified:If there is no source document specified, don't create a dummy document. 891 892 CSVPlugin.no_document_if_source_missing:If there is a specified source document, but it is not there, don't create a dummy document. 893 886 894 CSVDeprecatedPlugin.desc:An old plugin for files in comma-separated value format. A new document will be created for each line of the file. 887 895
Note:
See TracChangeset
for help on using the changeset viewer.