Ignore:
Timestamp:
2022-08-25T11:23:53+12:00 (20 months ago)
Author:
kjdon
Message:

added some new options for CSVPlugin

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/CSVPlugin.pm

    r36481 r36482  
    5757    'type' => "string",
    5858    'reqd' => "no",
    59     'deft' => "Filename" }
    60       ];
     59    'deft' => "Filename" },
     60      { 'name' => "store_raw_values_as_text",
     61    'desc' => "{CSVPlugin.store_raw_values_as_text}",
     62    'type' => "flag",
     63    'reqd' => "no"},
     64       { 'name' => "no_document_if_source_unspecified",
     65    'desc' => "{CSVPlugin.no_document_if_source_unspecified}",
     66    'type' => "flag",
     67    'reqd' => "no"},
     68       { 'name' => "no_document_if_source_missing",
     69    'desc' => "{CSVPlugin.no_document_if_source_missing}",
     70    'type' => "flag",
     71    'reqd' => "no"},
     72       { 'name' => "store_field_values_as_document_text",
     73    'desc' => "{CSVPlugin.store_field_values_as_document_text}",
     74    'type' => "flag",
     75    'reqd' => "no"},
     76
     77
     78     ];
    6179
    6280
     
    186204    my $found_filename_field = 0;
    187205    my $filename_field = $self->{'filename_field'};
    188     print STDERR "looking for $filename_field field\n";
    189206    for (my $i = 0; $i < scalar(@csv_file_fields); $i++) {
    190207    # Remove any spaces from the field names, and surrounding quotes too
     
    200217   
    201218    if (!$found_filename_field) {
    202     $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field field in CSV file, metadata cannot be assigned to documents, will use metadata only dummy documents");
     219    $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field field in CSV file, metadata cannot be assigned to documents, will use metadata only dummy documents");
    203220   
    204221    }
     
    216233    for (my $i=0; $i<$md_vals_len; $i++) {
    217234        my $md_val = $md_vals[$i];
    218         print STDERR "$count: md val = $md_val\n";
    219235        # Only bother with non-empty values
    220236        if ($md_val ne "" && defined($csv_file_fields[$i])) {
     
    266282    # We can't associate any metadata without knowing the file to associate it with
    267283    my $has_srcdoc = 0;
     284    my $missing_srcdoc = 0;
    268285    my $csv_line_filename="";;
    269286    if ($found_filename_field) {
     
    271288        my $csv_line_filename_array = $csv_line_metadata{$filename_field};
    272289        if (!defined $csv_line_filename_array) {
    273         $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field metadata in CSV line num $count");
     290        $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field metadata in CSV line num $count");
    274291        } else {
    275292        $csv_line_filename = shift(@$csv_line_filename_array);
     
    279296           
    280297            delete $csv_line_metadata{$filename_field};
     298        } else {
     299            $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "$csv_line_filename in $filename_field metadata in CSV line num $count is not found");
     300            $missing_srcdoc = 1; # there was one mentioned but its not found
    281301        }
    282302        }
     
    284304    }
    285305    if ($has_srcdoc) {
    286         print STDERR "storing meta in extra meta for $csv_line_filename\n";
     306        print $outhandle "Storing metadata, segment $count, for document $csv_line_filename\n" if ($verbosity > 2);
    287307        $self->store_meta_in_extrametadata($csv_line_filename, \%csv_line_metadata, $file, $filename_full_path, $extrametakeys, $extrametadata, $extrametafile);
    288308    } else {
    289         print STDERR "storing meta for $count, $csv_line_filename\n";
    290         $metadata_store->{$count} = \%csv_line_metadata;
     309        my $store_for_dummy = 1;
     310        if ($missing_srcdoc && $self->{'no_document_if_source_missing'}) {
     311        $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path,"Not storing metadata for line $count as source doc is missing");
     312        $store_for_dummy = 0;
     313        } elsif(!$missing_srcdoc && $self->{'no_document_if_source_unspecified'}) {
     314        $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path,"Not storing metadata for line $count as source doc is unspecified");
     315        $store_for_dummy = 0;
     316        }
     317        if ($store_for_dummy) {
     318       
     319        print $outhandle "Storing metadata for dummy document, segment $count\n" if ($verbosity > 2);
     320        $metadata_store->{$count} = \%csv_line_metadata;
     321        }
    291322    }
    292323    } # while csv_line = csv->getline
     
    333364
    334365    my $id;
    335     print STDERR "num keys  = ".scalar(keys (%$metadata_store))."\n";
    336     foreach $segment (sort keys (%$metadata_store)) {
    337     print $outhandle "processing segment $segment\n"
     366
     367    foreach $segment (sort { $a <=> $b } keys (%$metadata_store)) {
     368    print $outhandle "processing segment $segment as its own document\n"
    338369        if $self->{'verbosity'} > 1;   
    339370    $count++;
    340371    # create a new document
    341372    my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
    342     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
    343     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
     373    my $cursection = $doc_obj->get_top_section();
     374    $doc_obj->add_utf8_metadata($cursection, "Language", $language);
     375    $doc_obj->add_utf8_metadata($cursection, "Encoding", $encoding);
    344376
    345377    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
    346378
    347     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");
     379    $doc_obj->add_utf8_metadata($cursection, "SourceSegment", "$segment");
    348380    if ($self->{'cover_image'}) {
    349381        $self->associate_cover_image($doc_obj, $filename_full_path);
    350382    }
    351     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
     383    $doc_obj->add_utf8_metadata($cursection, "Plugin", "$self->{'plugin_type'}");
    352384
    353385    # include any metadata passed in from previous plugins
    354386    # note that this metadata is associated with the top level section
    355     $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
     387    $self->extra_metadata ($doc_obj, $cursection, $metadata);
    356388
    357389    # add our stored metadata from metadata_read pass
    358390    my $segment_metadata = $metadata_store->{$segment};
    359     $self->extra_metadata($doc_obj, $doc_obj->get_top_section(), $segment_metadata);
    360 
     391    $self->extra_metadata($doc_obj, $cursection, $segment_metadata);
     392    if ($self->{'store_field_values_as_document_text'}) {
     393        my $new_text = "";
     394        foreach my $f (keys %$segment_metadata) {
     395        my $values = $segment_metadata->{$f};
     396        $new_text .= join (", ", @$values).", ";
     397        }
     398
     399        $doc_obj->add_utf8_text($cursection, $new_text);
     400    }
    361401    # do any automatic metadata extraction - does this make sense??
    362402    #$self->auto_extract_metadata ($doc_obj);
     
    385425}
    386426
     # print_warning($outhandle, $failhandle, $gli, $file, $error)
     #
     # Reports a non-fatal problem found while handling $file.
     # The same "CSVPlugin Warning: <file>: <message>" line is written to both
     # $outhandle and $failhandle.  When $gli is true, an additional
     # <ProcessingError .../> element carrying the file and message is
     # printed to STDERR.
     #
     # NOTE(review): the element emitted for GLI is named ProcessingError even
     # though this routine reports warnings, and $file / $error are
     # interpolated into the single-quoted attributes as-is (no escaping is
     # visible here) -- confirm both are intended.
     427sub print_warning {
     428    my $self = shift(@_);
     429    my ($outhandle, $failhandle, $gli, $file, $error) = @_;
     430
     # Human-readable warning, sent to both handles.
     431    print $outhandle "CSVPlugin Warning: $file: $error\n";
     432    print $failhandle "CSVPlugin Warning: $file: $error\n";
     # Tagged line on STDERR, only when the $gli flag is set.
     433    print STDERR "<ProcessingError n='$file' r='$error'/>\n" if ($gli);
     434
     435}
    387436sub print_error
    388437{
Note: See TracChangeset for help on using the changeset viewer.