Context Navigation

← Previous Change
Next Change →

Changeset 37183 for gs3-installations

Timestamp:

2023-01-25T23:14:25+13:00 (15 months ago)

Author:

davidb

Message:

Further refinement of idea, with emphasis on using plugins arguments rather than having things hardwired

File:

: 1 edited

gs3-installations/whakatohea-dl/trunk/sites/wmtb/collect/tipple-waiata/perllib/plugins/TippleExportJSONPlugin.pm (modified) (7 diffs)

Legend:

: Unmodified
: Added
: Removed

gs3-installations/whakatohea-dl/trunk/sites/wmtb/collect/tipple-waiata/perllib/plugins/TippleExportJSONPlugin.pm

-              r37180
+              r37183
 my $arguments = [
-];
-# my $arguments = [
 #       { 'name' => "process_exp",
 #   'desc' => "{BaseImporter.process_exp}",
 #   'type' => "regexp",
 #   'reqd' => "no",
+#   'deft' => &get_default_process_exp() }
+# ];
+#   'deft' => &get_default_process_exp() },
+    { 'name' => "split_exp",
+      'desc' => "{SplitJSONFile.split_exp}",
+      'type' => "string",
+#      'deft' => "contentGroups,contentItems",
+      'deft' => "contentItems",
+      'reqd' => "no" },
+    { 'name' => "metadata_exp",
+      'desc' => "{SplitJSONFile.metadata_exp}",
+      'type' => "string",
+      'deft' => "WAIATA",
+      'deft' => "",
+      'reqd' => "no" },
+];
+# Other document-level metadata types to consider:
+#
+# .contentGroups:
+#    COMPOSER
+#    GENRE
+#    HAPU
+#    OCCASION
+#    TOPIC
+#    WRITER
+#
+# .contentItems:
+#   CONTENT_PAGE
+#   TK_LABEL
+# =>
+#    'deft' => "COMPOSER,GENRE,HAPU,OCCASION,TOPIC,WRITER , WAIATA,CONTENT_PAGE,TK_LABEL",
 …
     my $self = new SplitJSONFile($pluginlist, $inputargs, $hashArgOptLists);
+    return bless $self, $class;
+    my $blessed_self = bless $self, $class;
+    my $metadata_exp = $self->{'metadata_exp'};
+    my @metadata_exps = split(/\s*,\s*/,$metadata_exp);
+    $self->{'metadata_exp_lookup'} = {};
+    # Populate the lookup table from the comma-separated 'metadata_exp' entries.
+    # Each entry is either "NAME" or "NAME->gs_metadata_name".
+    foreach my $md_exp_and_opt_mapping (@metadata_exps) {
+    # The "->mapping" suffix must be optional in the pattern; otherwise a plain
+    # "NAME" entry fails to match and both captures come back undef.
+    my ($md_exp,$opt_mapping) = ($md_exp_and_opt_mapping =~ m/^(.+?)(?:->(.+))?$/);
+    next unless defined $md_exp; # skip entries that did not match (e.g. empty string from a stray comma)
+    $blessed_self->{'metadata_exp_lookup'}->{$md_exp} = { 'exists' => 1, 'gs_metadata_name' => $opt_mapping }; # note: $opt_mapping is undef when no "->mapping" was given
+    }
+    return $blessed_self;
+}
 …
 #    return q^(?i)\.json$^;
 #}
-sub file_block_readXXXXXX {
-    my $self = shift (@_);
-    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
-    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
-    if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
-    return undef; # can't recognise
+    }
-    # set this so we know this is a metadata file - needed for incremental
-    # build
-    # if this file changes, then we need to reimport everything
-    $block_hash->{'metadata_files'}->{$filename_full_path} = 1;
-    return 1;
+}
 …
     my $outhandle = $self->{'outhandle'};
     my $verbosity = $self->{'verbosity'};
+    my $metadata_exp = $self->{'metadata_exp'};
+    my $metadata_exp_lookup = $self->{'metadata_exp_lookup'};
     my $cursection = $doc_obj->get_top_section();
 …
     my $json_unicode_str = $json_pretty->encode($json_rec); # expects unicode string
     if ($verbosity>2) {
+    if ($verbosity>=4) {
     my $json_utf8_printable_str = Encode::encode("utf8",$json_unicode_str);
 …
     my $tipple_type = $json_rec->{'type'};
+    $tipple_type = ucfirst(lc($tipple_type));
+    $doc_obj->add_utf8_metadata($cursection, "Title",$tipple_name);
+    $doc_obj->add_utf8_metadata($cursection, "Type",$tipple_type);
+    # .documents
+    #   .locale
+    #     .code
+    #   .roles
+    #     .type
+    #   .sections
+    #     .caption + .content
+    #
+    my $tipple_documents = $json_rec->{'documents'};
+    foreach my $tipple_document (@$tipple_documents) {
+    my $tipple_locale   = $tipple_document->{'locale'};
+    my $tipple_roles    = $tipple_document->{'roles'};
+    my $tipple_sections = $tipple_document->{'sections'};
+    my $md_name_prefix = $tipple_locale->{'code'};
+    $md_name_prefix .= "_".$tipple_roles->[0]->{'type'} if defined $tipple_roles->[0]->{'type'};
+    foreach my $tipple_section (@$tipple_sections) {
+        my $md_val_caption = $tipple_section->{'caption'};
+        my $md_val_content = $tipple_section->{'content'};
+        if (defined $md_val_caption) {
+        my $md_name_caption = "${md_name_prefix}_caption";
+        $doc_obj->add_utf8_metadata($cursection,$md_name_caption,$md_val_caption);
+    my $tipple_type_formatted = ucfirst(lc($tipple_type));
+    my $is_metadata_name_match = 0;
+    my $gs_metadata_name;
+    if ($metadata_exp eq "") {
+    $is_metadata_name_match = 1;
+    $gs_metadata_name= $tipple_type_formatted;
+    }
+    elsif (defined $metadata_exp_lookup->{$tipple_type}) {
+    $is_metadata_name_match = 1;
+    if (defined $metadata_exp_lookup->{$tipple_type}->{'gs_metadata_name'}) {
+        $gs_metadata_name = $metadata_exp_lookup->{$tipple_type}->{'gs_metadata_name'};
+    }
+    else {
+        $gs_metadata_name= $tipple_type_formatted;
+    }
+    }
+    if ($is_metadata_name_match) {
+    $doc_obj->add_utf8_metadata($cursection, "Title",$tipple_name);
+    $doc_obj->add_utf8_metadata($cursection, "Type", $tipple_type_formatted);
+    # .documents
+    #   .locale
+    #     .code
+    #   .roles
+    #     .type
+    #   .sections
+    #     .caption + .content
+    #
+    my $tipple_documents = $json_rec->{'documents'};
+    foreach my $tipple_document (@$tipple_documents) {
+        # 'documents' in tipple corresponds to 'section of document' in greenstone
+        my $tipple_locale   = $tipple_document->{'locale'};
+        my $tipple_roles    = $tipple_document->{'roles'};
+        my $tipple_sections = $tipple_document->{'sections'};
+        my $md_name_prefix = $tipple_locale->{'code'};
+        $md_name_prefix .= "_".$tipple_roles->[0]->{'type'} if defined $tipple_roles->[0]->{'type'};
+        foreach my $tipple_section (@$tipple_sections) {
+        my $md_val_caption = $tipple_section->{'caption'};
+        my $md_val_content = $tipple_section->{'content'};
+        if (defined $md_val_caption) {
+            my $md_name_caption = "${md_name_prefix}_caption";
+            $doc_obj->add_utf8_metadata($cursection,$md_name_caption,$md_val_caption);
+        }
+        if (defined $md_val_content) {
+            my $md_name_content = "${md_name_prefix}_content";
+            $doc_obj->add_utf8_metadata($cursection,$md_name_content,$md_val_content);
+        }
+        }
+        if (defined $md_val_content) {
+        my $md_name_content = "${md_name_prefix}_content";
+        $doc_obj->add_utf8_metadata($cursection,$md_name_content,$md_val_content);
+    }
+    # .mediaItems
+    #   .file
+    #     .sourceUri
+    #     .contentType
+    my $tipple_media_items = $json_rec->{'mediaItems'};
+    foreach my $tipple_media_item (@$tipple_media_items) {
+        my $tipple_file = $tipple_media_item->{'file'};
+        if (defined $tipple_file) {
+        my $tipple_source_uri   = $tipple_file->{'sourceUri'};
+        my $tipple_content_type = $tipple_file->{'contentType'};
+        $doc_obj->add_utf8_metadata($cursection,"sourceUri",  $tipple_source_uri);
+        $doc_obj->add_utf8_metadata($cursection,"contentType",$tipple_content_type);
+        }
+    }
+    }
-    # .mediaItems
-    #   .file
-    #     .sourceUri
-    #     .contentType
-    my $tipple_media_items = $json_rec->{'mediaItems'};
-    foreach my $tipple_media_item (@$tipple_media_items) {
-    my $tipple_file = $tipple_media_item->{'file'};
-    if (defined $tipple_file) {
-        my $tipple_source_uri   = $tipple_file->{'sourceUri'};
-        my $tipple_content_type = $tipple_file->{'contentType'};
-        $doc_obj->add_utf8_metadata($cursection,"sourceUri",  $tipple_source_uri);
-        $doc_obj->add_utf8_metadata($cursection,"contentType",$tipple_content_type);
+    }
+    }
 …
+}
+# readXXXXX: builds one doc object per 'contentGroup' entry of a Tipple JSON export and passes
+# each to $processor->process(). Strongly based on 'read()' in SplitTextFile, with these changes:
+# (1) Changed to break up a JSON file into segments, rather than splitting a text file with a regex
+# (2) Removed $self->{'metapass_srcdoc'}, which relates to content in the file being processed
+#     (the JSON file in this case) being attached as metadata to a different file
+# (3) Removed reliance on $self->{'split_segments'}, again because this plugin has no
+#     ability to set up content in the JSON file as metadata to attach to a different file
+# Returns undef if this plugin cannot process the file, 0 if no 'textcat_store' record exists
+# for it, otherwise the sum of the statuses returned by process() across the segments.
+sub readXXXXX {
+    my $self = shift (@_);
+    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
+    my $outhandle = $self->{'outhandle'};
+    my $verbosity = $self->{'verbosity'};
+    # Can we process this file? If not, return undef.
+    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
+    return undef unless $self->can_process_this_file($filename_full_path);
+    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
+    my $le_rec = $self->{'textcat_store'}->{$file};
+    if (!defined $le_rec) {
+    # No stored record for this file, which means no text was found
+    return 0; # not processed but no point in passing it on
+    }
+    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
+    print $outhandle "$self->{'plugin_type'} processing $file\n"
+        if $self->{'verbosity'} > 1;
+    my $language = $le_rec->{'language'};
+    my $encoding = $le_rec->{'encoding'};
+    $self->{'textcat_store'}->{$file} = undef; # clear the stored record now that language/encoding have been read
+    ## my $tipple_hashmap = decode_json($$textref);  # NOTE(review): decoding is disabled, so the hash below stays empty and the loop further down never iterates
+    my $tipple_hashmap = {};
+    # Tipple JSON structure read by the loop below:
+    # .contentGroup
+    #   .id
+    #   .documents
+    #   .mediaItems   (NOTE(review): listed here but not read anywhere in this sub)
+    my ($count, $segment, $segtext, $status, $id);
+    $segment = 0;
+    $count = 0;
+    # Process each contentGroup item (equivalent to a segment in SplitTextPlugin) in turn
+    foreach my $gs_doc (@{$tipple_hashmap->{'contentGroup'}}) {
+    $segment++;
+    my $gs_id = $gs_doc->{'id'};
+    my $gs_doc_parts = $gs_doc->{'documents'};
+    print STDERR "**** tipple id = $gs_id]\n"; # NOTE(review): debug trace; the closing ']' has no matching '['
+    # Create a new document and record the language/encoding taken from the textcat record
+    my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
+    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
+    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
+    my ($filemeta) = $file =~ /([^\\\/]+)$/;
+    my $plugin_filename_encoding = $self->{'filename_encoding'};
+    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
+    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
+    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");
+    if ($self->{'cover_image'}) {
+        $self->associate_cover_image($doc_obj, $filename_full_path);
+    }
+    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
+    # Include any metadata passed in from previous plugins.
+    # Note that this metadata is associated with the top level section.
+    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
+    # Do plugin specific processing of doc_obj (NOTE(review): $segtext is never assigned in this sub, so process() is handed a reference to undef)
+    print $outhandle "segment $segment\n" if ($self->{'verbosity'});
+    print STDERR "<Processing s='$segment' n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
+    $status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
+    if (!defined $status) {
+        print $outhandle "WARNING: no plugin could process segment $segment of $file\n"
+        if ($verbosity >= 2);
+        print STDERR "<ProcessingError s='$segment' n='$file'>\n" if $gli;
+        next;
+    }
+    # If process() returned 0, it threw away this part, so move on to the next segment
+    if ($status == 0) {
+        next;
+    }
+    $count += $status;
+    # Do any automatic metadata extraction
+    $self->auto_extract_metadata ($doc_obj);
+    # This used to be done earlier on in the routine, however the $id generated
+    # isn't used until here!
+    # Calculate a "base" document ID once, from the first segment that gets this far.
+    if (!defined $id) {
+        $id = $self->get_base_OID($doc_obj);
+    }
+    # Add an OID derived from the base ID and the segment number
+    $self->add_segment_OID($doc_obj, $id, $segment);
+    # Hand the finished document to the processor
+    $processor->process($doc_obj);
+    $self->{'num_processed'} ++;
+    if ($maxdocs != -1 && $self->{'num_processed'} >= $maxdocs) {
+        last;
+    }
+    }
+    # Return the accumulated process() statuses (the number of document objects produced)
+    return $count;
+}
+# Report a processing error for $file on both the output and the failure
+# handle and, when $gli is true, emit a <ProcessingError> tag on STDERR.
+sub print_error
+{
+    my ($self, $outhandle, $failhandle, $gli, $file, $error) = @_;
+    # The same line goes to both handles, so build it once.
+    my $message = "TippleExportJSONPlugin Error: $file: $error\n";
+    print {$outhandle} $message;
+    print {$failhandle} $message;
+    if ($gli) {
+        print STDERR "<ProcessingError n='$file' r='$error'/>\n";
+    }
+}
+# sub print_error
+# {
+#     my $self = shift(@_);
+#     my ($outhandle, $failhandle, $gli, $file, $error) = @_;
+#     print $outhandle "TippleExportJSONPlugin Error: $file: $error\n";
+#     print $failhandle "TippleExportJSONPlugin Error: $file: $error\n";
+#     print STDERR "<ProcessingError n='$file' r='$error'/>\n" if ($gli);
+# }
 ;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 37183 for gs3-installations

Legend:

gs3-installations/whakatohea-dl/trunk/sites/wmtb/collect/tipple-waiata/perllib/plugins/TippleExportJSONPlugin.pm

Download in other formats: