Changeset 37183 for gs3-installations


Ignore:
Timestamp:
2023-01-25T23:14:25+13:00 (15 months ago)
Author:
davidb
Message:

Further refinement of the idea, with emphasis on using plugin arguments rather than having things hardwired

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-installations/whakatohea-dl/trunk/sites/wmtb/collect/tipple-waiata/perllib/plugins/TippleExportJSONPlugin.pm

    r37180 r37183  
    4444
    4545
     46
    4647my $arguments = [
    47 ];
    48 
    49 # my $arguments = [
    5048#       { 'name' => "process_exp",
    5149#   'desc' => "{BaseImporter.process_exp}",
    5250#   'type' => "regexp",
    5351#   'reqd' => "no",
    54 #   'deft' => &get_default_process_exp() }
    55 
    56 # ];
     52#   'deft' => &get_default_process_exp() },
     53    { 'name' => "split_exp",
     54      'desc' => "{SplitJSONFile.split_exp}",
     55      'type' => "string",
     56#      'deft' => "contentGroups,contentItems",
     57      'deft' => "contentItems",
     58      'reqd' => "no" },
     59    { 'name' => "metadata_exp",
     60      'desc' => "{SplitJSONFile.metadata_exp}",
     61      'type' => "string",
     62      'deft' => "WAIATA",
     63      'deft' => "",
     64      'reqd' => "no" },   
     65];
     66
     67# Other document-level metadata types to consider:
     68#
     69# .contentGroups:   
     70#    COMPOSER
     71#    GENRE
     72#    HAPU
     73#    OCCASION
     74#    TOPIC
     75#    WRITER
     76#
     77# .contentItems:
     78#   CONTENT_PAGE
     79#   TK_LABEL
     80
     81# =>
     82#    'deft' => "COMPOSER,GENRE,HAPU,OCCASION,TOPIC,WRITER , WAIATA,CONTENT_PAGE,TK_LABEL",
    5783
    5884
     
    75101    my $self = new SplitJSONFile($pluginlist, $inputargs, $hashArgOptLists);
    76102
    77     return bless $self, $class;
     103    my $blessed_self = bless $self, $class;
     104
     105    my $metadata_exp = $self->{'metadata_exp'};
     106    my @metadata_exps = split(/\s*,\s*/,$metadata_exp);
     107
     108    $self->{'metadata_exp_lookup'} = {};
     109    foreach my $md_exp_and_opt_mapping (@metadata_exps) {
     110    my ($md_exp,$opt_mapping) = ($md_exp_and_opt_mapping =~ m/^(.+?)(?:->(.+))$/);
     111    $blessed_self->{'metadata_exp_lookup'}->{$md_exp} = { 'exists' => 1, 'gs_metadata_name' => $opt_mapping }; # note: $opt_mapping might be undef
     112    }
     113   
     114    return $blessed_self;
    78115}
    79116
     
    83120#    return q^(?i)\.json$^;
    84121#}
    85 
    86 sub file_block_readXXXXXX {
    87     my $self = shift (@_);
    88     my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
    89 
    90     my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    91 
    92     if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
    93     return undef; # can't recognise
    94     }
    95 
    96     # set this so we know this is a metadata file - needed for incremental
    97     # build
    98     # if this file changes, then we need to reimport everything
    99     $block_hash->{'metadata_files'}->{$filename_full_path} = 1;
    100 
    101     return 1;
    102 }
    103122
    104123
     
    241260    my $outhandle = $self->{'outhandle'};
    242261    my $verbosity = $self->{'verbosity'};
    243    
     262
     263    my $metadata_exp = $self->{'metadata_exp'};
     264    my $metadata_exp_lookup = $self->{'metadata_exp_lookup'};
     265       
    244266    my $cursection = $doc_obj->get_top_section();
    245267
     
    250272    my $json_unicode_str = $json_pretty->encode($json_rec); # expects unicode string
    251273   
    252     if ($verbosity>2) {
     274    if ($verbosity>=4) {
    253275
    254276    my $json_utf8_printable_str = Encode::encode("utf8",$json_unicode_str);
     
    266288    my $tipple_type = $json_rec->{'type'};
    267289
    268     $tipple_type = ucfirst(lc($tipple_type));
    269 
    270     $doc_obj->add_utf8_metadata($cursection, "Title",$tipple_name);
    271     $doc_obj->add_utf8_metadata($cursection, "Type",$tipple_type);
    272 
    273     # .documents
    274     #   .locale
    275     #     .code
    276     #   .roles
    277     #     .type
    278     #   .sections
    279     #     .caption + .content
    280     #
    281    
    282     my $tipple_documents = $json_rec->{'documents'};
    283     foreach my $tipple_document (@$tipple_documents) {
    284     my $tipple_locale   = $tipple_document->{'locale'};
    285     my $tipple_roles    = $tipple_document->{'roles'};
    286     my $tipple_sections = $tipple_document->{'sections'};
    287 
    288     my $md_name_prefix = $tipple_locale->{'code'};
    289     $md_name_prefix .= "_".$tipple_roles->[0]->{'type'} if defined $tipple_roles->[0]->{'type'};
    290 
    291     foreach my $tipple_section (@$tipple_sections) {
    292         my $md_val_caption = $tipple_section->{'caption'};
    293         my $md_val_content = $tipple_section->{'content'};
    294 
    295         if (defined $md_val_caption) {
    296         my $md_name_caption = "${md_name_prefix}_caption";
    297         $doc_obj->add_utf8_metadata($cursection,$md_name_caption,$md_val_caption);
     290    my $tipple_type_formatted = ucfirst(lc($tipple_type));
     291   
     292    my $is_metadata_name_match = 0;
     293    my $gs_metadata_name;
     294   
     295    if ($metadata_exp eq "") {
     296    $is_metadata_name_match = 1;
     297    $gs_metadata_name= $tipple_type_formatted;
     298    }
     299    elsif (defined $metadata_exp_lookup->{$tipple_type}) {
     300    $is_metadata_name_match = 1;
     301    if (defined $metadata_exp_lookup->{$tipple_type}->{'gs_metadata_name'}) {
     302        $gs_metadata_name = $metadata_exp_lookup->{$tipple_type}->{'gs_metadata_name'};
     303    }
     304    else {
     305        $gs_metadata_name= $tipple_type_formatted;
     306    }
     307    }
     308   
     309    if ($is_metadata_name_match) {
     310   
     311    $doc_obj->add_utf8_metadata($cursection, "Title",$tipple_name);
     312    $doc_obj->add_utf8_metadata($cursection, "Type", $tipple_type_formatted);
     313   
     314    # .documents
     315    #   .locale
     316    #     .code
     317    #   .roles
     318    #     .type
     319    #   .sections
     320    #     .caption + .content
     321    #
     322   
     323    my $tipple_documents = $json_rec->{'documents'};
     324    foreach my $tipple_document (@$tipple_documents) {
     325
     326        # 'documents' in tipple corresponds to 'section of document' in greenstone
     327       
     328        my $tipple_locale   = $tipple_document->{'locale'};
     329        my $tipple_roles    = $tipple_document->{'roles'};
     330        my $tipple_sections = $tipple_document->{'sections'};
     331       
     332        my $md_name_prefix = $tipple_locale->{'code'};
     333        $md_name_prefix .= "_".$tipple_roles->[0]->{'type'} if defined $tipple_roles->[0]->{'type'};
     334       
     335        foreach my $tipple_section (@$tipple_sections) {
     336        my $md_val_caption = $tipple_section->{'caption'};
     337        my $md_val_content = $tipple_section->{'content'};
     338       
     339        if (defined $md_val_caption) {
     340            my $md_name_caption = "${md_name_prefix}_caption";
     341            $doc_obj->add_utf8_metadata($cursection,$md_name_caption,$md_val_caption);
     342        }
     343       
     344        if (defined $md_val_content) {
     345            my $md_name_content = "${md_name_prefix}_content";
     346            $doc_obj->add_utf8_metadata($cursection,$md_name_content,$md_val_content);
     347        }
    298348        }
    299 
    300         if (defined $md_val_content) {
    301         my $md_name_content = "${md_name_prefix}_content";
    302         $doc_obj->add_utf8_metadata($cursection,$md_name_content,$md_val_content);
     349       
     350    }
     351
     352    # .mediaItems
     353    #   .file
     354    #     .sourceUri
     355    #     .contentType
     356   
     357
     358    my $tipple_media_items = $json_rec->{'mediaItems'};
     359    foreach my $tipple_media_item (@$tipple_media_items) {
     360        my $tipple_file = $tipple_media_item->{'file'};
     361        if (defined $tipple_file) {
     362       
     363        my $tipple_source_uri   = $tipple_file->{'sourceUri'};
     364        my $tipple_content_type = $tipple_file->{'contentType'};
     365       
     366        $doc_obj->add_utf8_metadata($cursection,"sourceUri",  $tipple_source_uri);
     367        $doc_obj->add_utf8_metadata($cursection,"contentType",$tipple_content_type);
     368       
    303369        }
    304370    }
    305    
    306     }
    307 
    308     # .mediaItems
    309     #   .file
    310     #     .sourceUri
    311     #     .contentType
    312    
    313 
    314     my $tipple_media_items = $json_rec->{'mediaItems'};
    315     foreach my $tipple_media_item (@$tipple_media_items) {
    316     my $tipple_file = $tipple_media_item->{'file'};
    317     if (defined $tipple_file) {
    318        
    319         my $tipple_source_uri   = $tipple_file->{'sourceUri'};
    320         my $tipple_content_type = $tipple_file->{'contentType'};
    321        
    322         $doc_obj->add_utf8_metadata($cursection,"sourceUri",  $tipple_source_uri);
    323         $doc_obj->add_utf8_metadata($cursection,"contentType",$tipple_content_type);
    324        
    325     }
    326371    }
    327372
     
    332377}
    333378
    334 
    335 # The following is strongly based on 'read()' in SplitTextFile:
    336 #
    337 # (1) Changed to break up a JSON file into segments rather than splitting a text file with a regex
    338 #
    339 # (2) Removed $self->{'metapass_srcdoc'}, which relates to content in the file
    340 #     being processed (the JSON file in this case) being attached as metadata to a different file
    341 #
    342 # (3) Removed reliance on $self->{'split_segments'}, again because this plugin has no
    343 #     ability to set up content in the JSON file as metadata to attach to a different file
    344 
    345 sub readXXXXX {
    346     my $self = shift (@_);
    347     my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    348     my $outhandle = $self->{'outhandle'};
    349     my $verbosity = $self->{'verbosity'};
    350 
    351     # can we process this file??
    352     my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    353     return undef unless $self->can_process_this_file($filename_full_path);
    354 
    355     $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
    356 
    357     my $le_rec = $self->{'textcat_store'}->{$file};
    358     if (!defined $le_rec) {
    359     # means no text was found;
    360     return 0; # not processed but no point in passing it on
    361     }
    362 
    363     print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
    364     print $outhandle "$self->{'plugin_type'} processing $file\n"
    365         if $self->{'verbosity'} > 1;   
    366 
    367     my $language = $le_rec->{'language'};
    368     my $encoding = $le_rec->{'encoding'};
    369     $self->{'textcat_store'}->{$file} = undef;
    370 
    371 
    372     ## my $tipple_hashmap = decode_json($$textref);
    373     my $tipple_hashmap = {};
    374    
    375     # Tipple JSON Structure
    376     # .contentGroup
    377     #   .id
    378     #   .documents
    379     #   .mediaItems
    380 
    381     my ($count, $segment, $segtext, $status, $id);
    382     $segment = 0;
    383     $count = 0;
    384 
    385     # Process each contentGroup item (equivalent to segment in SplitTextPlugin) in turn
    386     foreach my $gs_doc (@{$tipple_hashmap->{'contentGroup'}}) {
    387     $segment++;
    388 
    389     my $gs_id = $gs_doc->{'id'};
    390     my $gs_doc_parts = $gs_doc->{'documents'};
    391     print STDERR "**** tipple id = $gs_id]\n";
    392    
    393     # create a new document
    394     my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
    395     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
    396     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
    397 
    398     my ($filemeta) = $file =~ /([^\\\/]+)$/;
    399     my $plugin_filename_encoding = $self->{'filename_encoding'};
    400     my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
    401     $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
    402 
    403     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");
    404     if ($self->{'cover_image'}) {
    405         $self->associate_cover_image($doc_obj, $filename_full_path);
    406     }
    407     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    408 
    409 
    410     # include any metadata passed in from previous plugins
    411     # note that this metadata is associated with the top level section
    412     $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
    413 
    414     # do plugin specific processing of doc_obj
    415     print $outhandle "segment $segment\n" if ($self->{'verbosity'});
    416     print STDERR "<Processing s='$segment' n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
    417     $status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
    418     if (!defined $status) {
    419         print $outhandle "WARNING: no plugin could process segment $segment of $file\n"
    420         if ($verbosity >= 2);
    421         print STDERR "<ProcessingError s='$segment' n='$file'>\n" if $gli;
    422         next;
    423     }
    424     # If the plugin returned 0, it threw away this part
    425     if ($status == 0) {
    426         next;
    427     }
    428     $count += $status;
    429 
    430     # do any automatic metadata extraction
    431     $self->auto_extract_metadata ($doc_obj);
    432 
    433     # This used to be done earlier on in routine, however $id generated
    434     # isn't used until here!
    435     # Calculate a "base" document ID.
    436     if (!defined $id) {
    437         $id = $self->get_base_OID($doc_obj);
    438     }
    439    
    440     # add an OID
    441     $self->add_segment_OID($doc_obj, $id, $segment);
    442 
    443     # process the document
    444     $processor->process($doc_obj);
    445 
    446     $self->{'num_processed'} ++;
    447 
    448     if ($maxdocs != -1 && $self->{'num_processed'} >= $maxdocs) {
    449         last;
    450     }
    451     }
    452 
    453     # Return number of document objects produced
    454     return $count; 
    455 }
    456 
    457    
    458 
    459 sub print_error
    460 {
    461 
    462     my $self = shift(@_);
    463     my ($outhandle, $failhandle, $gli, $file, $error) = @_;
    464 
    465     print $outhandle "TippleExportJSONPlugin Error: $file: $error\n";
    466     print $failhandle "TippleExportJSONPlugin Error: $file: $error\n";
    467     print STDERR "<ProcessingError n='$file' r='$error'/>\n" if ($gli);
    468 }
     379   
     380
     381# sub print_error
     382# {
     383
     384#     my $self = shift(@_);
     385#     my ($outhandle, $failhandle, $gli, $file, $error) = @_;
     386
     387#     print $outhandle "TippleExportJSONPlugin Error: $file: $error\n";
     388#     print $failhandle "TippleExportJSONPlugin Error: $file: $error\n";
     389#     print STDERR "<ProcessingError n='$file' r='$error'/>\n" if ($gli);
     390# }
    469391
    4703921;
Note: See TracChangeset for help on using the changeset viewer.