Changeset 37183
- Timestamp:
- 2023-01-25T23:14:25+13:00 (15 months ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-installations/whakatohea-dl/trunk/sites/wmtb/collect/tipple-waiata/perllib/plugins/TippleExportJSONPlugin.pm
r37180 r37183 44 44 45 45 46 46 47 my $arguments = [ 47 ];48 49 # my $arguments = [50 48 # { 'name' => "process_exp", 51 49 # 'desc' => "{BaseImporter.process_exp}", 52 50 # 'type' => "regexp", 53 51 # 'reqd' => "no", 54 # 'deft' => &get_default_process_exp() } 55 56 # ]; 52 # 'deft' => &get_default_process_exp() }, 53 { 'name' => "split_exp", 54 'desc' => "{SplitJSONFile.split_exp}", 55 'type' => "string", 56 # 'deft' => "contentGroups,contentItems", 57 'deft' => "contentItems", 58 'reqd' => "no" }, 59 { 'name' => "metadata_exp", 60 'desc' => "{SplitJSONFile.metadata_exp}", 61 'type' => "string", 62 'deft' => "WAIATA", 63 'deft' => "", 64 'reqd' => "no" }, 65 ]; 66 67 # Other document-level metadata types to consider: 68 # 69 # .contentGroups: 70 # COMPOSER 71 # GENRE 72 # HAPU 73 # OCCASION 74 # TOPIC 75 # WRITER 76 # 77 # .contentItems: 78 # CONTENT_PAGE 79 # TK_LABEL 80 81 # => 82 # 'deft' => "COMPOSER,GENRE,HAPU,OCCASION,TOPIC,WRITER , WAIATA,CONTENT_PAGE,TK_LABEL", 57 83 58 84 … … 75 101 my $self = new SplitJSONFile($pluginlist, $inputargs, $hashArgOptLists); 76 102 77 return bless $self, $class; 103 my $blessed_self = bless $self, $class; 104 105 my $metadata_exp = $self->{'metadata_exp'}; 106 my @metadata_exps = split(/\s*,\s*/,$metadata_exp); 107 108 $self->{'metadata_exp_lookup'} = {}; 109 foreach my $md_exp_and_opt_mapping (@metadata_exps) { 110 my ($md_exp,$opt_mapping) = ($md_exp_and_opt_mapping =~ m/^(.+?)(?:->(.+))$/); 111 $blessed_self->{'metadata_exp_lookup'}->{$md_exp} = { 'exists' => 1, 'gs_metadata_name' => $opt_mapping }; # note: $opt_mapping might be undef 112 } 113 114 return $blessed_self; 78 115 } 79 116 … … 83 120 # return q^(?i)\.json$^; 84 121 #} 85 86 sub file_block_readXXXXXX {87 my $self = shift (@_);88 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;89 90 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);91 92 if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {93 return undef; # can't recognise94 }95 96 # set this so we know this is a metadata file - needed for incremental97 # build98 # if this file changes, then we need to reimport everything99 $block_hash->{'metadata_files'}->{$filename_full_path} = 1;100 101 return 1;102 }103 122 104 123 … … 241 260 my $outhandle = $self->{'outhandle'}; 242 261 my $verbosity = $self->{'verbosity'}; 243 262 263 my $metadata_exp = $self->{'metadata_exp'}; 264 my $metadata_exp_lookup = $self->{'metadata_exp_lookup'}; 265 244 266 my $cursection = $doc_obj->get_top_section(); 245 267 … … 250 272 my $json_unicode_str = $json_pretty->encode($json_rec); # expects unicode string 251 273 252 if ($verbosity> 2) {274 if ($verbosity>=4) { 253 275 254 276 my $json_utf8_printable_str = Encode::encode("utf8",$json_unicode_str); … … 266 288 my $tipple_type = $json_rec->{'type'}; 267 289 268 $tipple_type = ucfirst(lc($tipple_type)); 269 270 $doc_obj->add_utf8_metadata($cursection, "Title",$tipple_name); 271 $doc_obj->add_utf8_metadata($cursection, "Type",$tipple_type); 272 273 # .documents 274 # .locale 275 # .code 276 # .roles 277 # .type 278 # .sections 279 # .caption + .content 280 # 281 282 my $tipple_documents = $json_rec->{'documents'}; 283 foreach my $tipple_document (@$tipple_documents) { 284 my $tipple_locale = $tipple_document->{'locale'}; 285 my $tipple_roles = $tipple_document->{'roles'}; 286 my $tipple_sections = $tipple_document->{'sections'}; 287 288 my $md_name_prefix = $tipple_locale->{'code'}; 289 $md_name_prefix .= "_".$tipple_roles->[0]->{'type'} if defined $tipple_roles->[0]->{'type'}; 290 291 foreach my $tipple_section (@$tipple_sections) { 292 my $md_val_caption = $tipple_section->{'caption'}; 293 my $md_val_content = $tipple_section->{'content'}; 294 295 if (defined $md_val_caption) { 296 my $md_name_caption = "${md_name_prefix}_caption"; 297 $doc_obj->add_utf8_metadata($cursection,$md_name_caption,$md_val_caption); 290 my $tipple_type_formatted = ucfirst(lc($tipple_type)); 291 292 my $is_metadata_name_match = 0; 293 my $gs_metadata_name; 294 295 if ($metadata_exp eq "") { 296 $is_metadata_name_match = 1; 297 $gs_metadata_name= $tipple_type_formatted; 298 } 299 elsif (defined $metadata_exp_lookup->{$tipple_type}) { 300 $is_metadata_name_match = 1; 301 if (defined $metadata_exp_lookup->{$tipple_type}->{'gs_metadata_name'}) { 302 $gs_metadata_name = $metadata_exp_lookup->{$tipple_type}->{'gs_metadata_name'}; 303 } 304 else { 305 $gs_metadata_name= $tipple_type_formatted; 306 } 307 } 308 309 if ($is_metadata_name_match) { 310 311 $doc_obj->add_utf8_metadata($cursection, "Title",$tipple_name); 312 $doc_obj->add_utf8_metadata($cursection, "Type", $tipple_type_formatted); 313 314 # .documents 315 # .locale 316 # .code 317 # .roles 318 # .type 319 # .sections 320 # .caption + .content 321 # 322 323 my $tipple_documents = $json_rec->{'documents'}; 324 foreach my $tipple_document (@$tipple_documents) { 325 326 # 'documents' in tipple corresponds to 'section of document' in greenstone 327 328 my $tipple_locale = $tipple_document->{'locale'}; 329 my $tipple_roles = $tipple_document->{'roles'}; 330 my $tipple_sections = $tipple_document->{'sections'}; 331 332 my $md_name_prefix = $tipple_locale->{'code'}; 333 $md_name_prefix .= "_".$tipple_roles->[0]->{'type'} if defined $tipple_roles->[0]->{'type'}; 334 335 foreach my $tipple_section (@$tipple_sections) { 336 my $md_val_caption = $tipple_section->{'caption'}; 337 my $md_val_content = $tipple_section->{'content'}; 338 339 if (defined $md_val_caption) { 340 my $md_name_caption = "${md_name_prefix}_caption"; 341 $doc_obj->add_utf8_metadata($cursection,$md_name_caption,$md_val_caption); 342 } 343 344 if (defined $md_val_content) { 345 my $md_name_content = "${md_name_prefix}_content"; 346 $doc_obj->add_utf8_metadata($cursection,$md_name_content,$md_val_content); 347 } 298 348 } 299 300 if (defined $md_val_content) { 301 my $md_name_content = "${md_name_prefix}_content"; 302 $doc_obj->add_utf8_metadata($cursection,$md_name_content,$md_val_content); 349 350 } 351 352 # .mediaItems 353 # .file 354 # .sourceUri 355 # .contentType 356 357 358 my $tipple_media_items = $json_rec->{'mediaItems'}; 359 foreach my $tipple_media_item (@$tipple_media_items) { 360 my $tipple_file = $tipple_media_item->{'file'}; 361 if (defined $tipple_file) { 362 363 my $tipple_source_uri = $tipple_file->{'sourceUri'}; 364 my $tipple_content_type = $tipple_file->{'contentType'}; 365 366 $doc_obj->add_utf8_metadata($cursection,"sourceUri", $tipple_source_uri); 367 $doc_obj->add_utf8_metadata($cursection,"contentType",$tipple_content_type); 368 303 369 } 304 370 } 305 306 }307 308 # .mediaItems309 # .file310 # .sourceUri311 # .contentType312 313 314 my $tipple_media_items = $json_rec->{'mediaItems'};315 foreach my $tipple_media_item (@$tipple_media_items) {316 my $tipple_file = $tipple_media_item->{'file'};317 if (defined $tipple_file) {318 319 my $tipple_source_uri = $tipple_file->{'sourceUri'};320 my $tipple_content_type = $tipple_file->{'contentType'};321 322 $doc_obj->add_utf8_metadata($cursection,"sourceUri", $tipple_source_uri);323 $doc_obj->add_utf8_metadata($cursection,"contentType",$tipple_content_type);324 325 }326 371 } 327 372 … … 332 377 } 333 378 334 335 # The following is strongly based on 'read()' in SplitTextFile: 336 # 337 # (1) Changed to break-up a JSON file into segements rather than a text file split regex 338 # 339 # (2) Removed $self->{'metapass_srcdoc'}, which is related to when content in the file 340 # being process (the JSON file in this case) attaching as metadata to a different file 341 # 342 # (3) Removed reliance on $self->{'split_segments'}, again because this plugin has no 343 # ability to set up content in the JSON file as metadata to attach to a different file 344 345 sub readXXXXX { 346 my $self = shift (@_); 347 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 348 my $outhandle = $self->{'outhandle'}; 349 my $verbosity = $self->{'verbosity'}; 350 351 # can we process this file?? 352 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 353 return undef unless $self->can_process_this_file($filename_full_path); 354 355 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up 356 357 my $le_rec = $self->{'textcat_store'}->{$file}; 358 if (!defined $le_rec) { 359 # means no text was found; 360 return 0; # not processed but no point in passing it on 361 } 362 363 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli); 364 print $outhandle "$self->{'plugin_type'} processing $file\n" 365 if $self->{'verbosity'} > 1; 366 367 my $language = $le_rec->{'language'}; 368 my $encoding = $le_rec->{'encoding'}; 369 $self->{'textcat_store'}->{$file} = undef; 370 371 372 ## my $tipple_hashmap = decode_json($$textref); 373 my $tipple_hashmap = {}; 374 375 # Tipple JSON Structure 376 # .contentGroup 377 # .id 378 # .documents 379 # .mediaItems 380 381 my ($count, $segment, $segtext, $status, $id); 382 $segment = 0; 383 $count = 0; 384 385 # Process each contentGroup item (equivalent to segment in SplitTextPlugin) in turn 386 foreach my $gs_doc (@{$tipple_hashmap->{'contentGroup'}}) { 387 $segment++; 388 389 my $gs_id = $gs_doc->{'id'}; 390 my $gs_doc_parts = $gs_doc->{'documents'}; 391 print STDERR "**** tipple id = $gs_id]\n"; 392 393 # create a new document 394 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'}); 395 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language); 396 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding); 397 398 my ($filemeta) = $file =~ /([^\\\/]+)$/; 399 my $plugin_filename_encoding = $self->{'filename_encoding'}; 400 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 401 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding); 402 403 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment"); 404 if ($self->{'cover_image'}) { 405 $self->associate_cover_image($doc_obj, $filename_full_path); 406 } 407 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); 408 409 410 # include any metadata passed in from previous plugins 411 # note that this metadata is associated with the top level section 412 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata); 413 414 # do plugin specific processing of doc_obj 415 print $outhandle "segment $segment\n" if ($self->{'verbosity'}); 416 print STDERR "<Processing s='$segment' n='$file' p='$self->{'plugin_type'}'>\n" if ($gli); 417 $status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli); 418 if (!defined $status) { 419 print $outhandle "WARNING: no plugin could process segment $segment of $file\n" 420 if ($verbosity >= 2); 421 print STDERR "<ProcessingError s='$segment' n='$file'>\n" if $gli; 422 next; 423 } 424 # If the plugin returned 0, it threw away this part 425 if ($status == 0) { 426 next; 427 } 428 $count += $status; 429 430 # do any automatic metadata extraction 431 $self->auto_extract_metadata ($doc_obj); 432 433 # This used to be done earlier on in routine, however $id generated 434 # isn't used until here! 435 # Calculate a "base" document ID. 436 if (!defined $id) { 437 $id = $self->get_base_OID($doc_obj); 438 } 439 440 # add an OID 441 $self->add_segment_OID($doc_obj, $id, $segment); 442 443 # process the document 444 $processor->process($doc_obj); 445 446 $self->{'num_processed'} ++; 447 448 if ($maxdocs != -1 && $self->{'num_processed'} >= $maxdocs) { 449 last; 450 } 451 } 452 453 # Return number of document objects produced 454 return $count; 455 } 456 457 458 459 sub print_error 460 { 461 462 my $self = shift(@_); 463 my ($outhandle, $failhandle, $gli, $file, $error) = @_; 464 465 print $outhandle "TippleExportJSONPlugin Error: $file: $error\n"; 466 print $failhandle "TippleExportJSONPlugin Error: $file: $error\n"; 467 print STDERR "<ProcessingError n='$file' r='$error'/>\n" if ($gli); 468 } 379 380 381 # sub print_error 382 # { 383 384 # my $self = shift(@_); 385 # my ($outhandle, $failhandle, $gli, $file, $error) = @_; 386 387 # print $outhandle "TippleExportJSONPlugin Error: $file: $error\n"; 388 # print $failhandle "TippleExportJSONPlugin Error: $file: $error\n"; 389 # print STDERR "<ProcessingError n='$file' r='$error'/>\n" if ($gli); 390 # } 469 391 470 392 1;
Note:
See TracChangeset
for help on using the changeset viewer.