###########################################################################
#
# AMCMetadataJSONPlugin.pm -- A plugin for JSON files resulting from page
#                             scrape of artists on the Australian Music Centre
#                             website
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright 2016 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package AMCMetadataJSONPlugin;

use BaseImporter;
use MetadataRead;

use strict;
no strict 'refs';
use multiread;

use Encode;
use JSON;

# methods with identical signatures take precedence in the order given in the ISA list.
sub BEGIN {
    @AMCMetadataJSONPlugin::ISA = ('MetadataRead', 'BaseImporter');
}

# Plugin-specific arguments, appended to the shared "ArgList" in new().
my $arguments = [
    { 'name' => "process_exp",
      'desc' => "{BaseImporter.process_exp}",
      'type' => "regexp",
      'reqd' => "no",
      'deft' => get_default_process_exp() }
];

# Plugin description record, appended to the shared "OptList" in new().
my $options = {
    'name'     => "AMCMetadataJSONPlugin",
    'desc'     => "{AMCMetadataJSONPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments
};

# Constructor.
#
# Registers this class in $pluginlist, appends this plugin's argument and
# option descriptions to $hashArgOptLists, builds the object through
# BaseImporter's constructor and re-blesses it into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Arrow syntax rather than "new BaseImporter(...)": indirect object
    # syntax is ambiguous inside a package that defines its own new().
    my $self = BaseImporter->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}

# Default regular expression (as a string) selecting the files this plugin
# handles: anything ending in ".json", case-insensitively.
sub get_default_process_exp {
    return q^(?i)\.json$^;
}

# Records a JSON file as a metadata file in $block_hash.
#
# Returns undef when $file is not a plain file or is not a file this plugin
# can process, and 1 once the file has been recorded.
sub file_block_read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    my ($filename_full_path) = &util::get_full_filenames($base_dir, $file);

    if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
        return undef; # can't recognise
    }

    # set this so we know this is a metadata file - needed for incremental
    # build
    # if this file changes, then we need to reimport everything
    $block_hash->{'metadata_files'}->{$filename_full_path} = 1;

    return 1;
}

# Private helper: reads $filename and decodes it as a JSON object.
#
# Returns a two element list ($hashref, $error).  On success $error is
# undef; on failure $hashref is undef and $error describes what went wrong
# (file could not be opened, content is not valid JSON, or the top-level
# JSON value is not an object).
sub _read_json_metadata {
    my $self = shift (@_);
    my ($filename) = @_;

    # The bareword handle is kept on purpose: multiread is handed the
    # handle by its package-qualified name below.
    if (!open(JSON_FILE, '<', $filename)) {
        return (undef, "Unable to open JSON file: $!");
    }

    my $json_file_content = "";
    my $json_file_reader = multiread->new();
    $json_file_reader->set_handle('AMCMetadataJSONPlugin::JSON_FILE');
    $json_file_reader->read_file(\$json_file_content);
    close(JSON_FILE);

    # Would be nice if AMCMetadataJSONPlugin was extended to support a minus
    # option to choose the character encoding the JSON file is in
    # For now we will assume it is always in UTF8
    # NOTE(review): this encode/decode pair looks like an identity round
    # trip; it is kept unchanged until that is confirmed against multiread.
    my $json_file_content_bytes = encode('UTF-8', $json_file_content);
    $json_file_content = decode("utf8", $json_file_content_bytes);

    # Raw content dump is debugging output, so only show it when the
    # plugin is run at a high verbosity level.
    if ($self->{'verbosity'} > 2) {
        print STDERR "**** json_metadata = $json_file_content\n";
    }

    # decode() dies on malformed input; trap that so a single bad file is
    # reported as an error instead of aborting the whole import.
    my $json_metadata;
    my $decoded_ok = eval {
        $json_metadata = JSON->new->utf8->decode($json_file_content);
        1;
    };
    if (!$decoded_ok) {
        my $decode_error = $@ || "unknown error";
        chomp($decode_error);
        return (undef, "Unable to parse JSON: $decode_error");
    }

    # The caller treats the result as a hash of metadata name => value.
    if (ref($json_metadata) ne 'HASH') {
        return (undef, "Top-level JSON value is not an object");
    }

    return ($json_metadata, undef);
}

# Reads a JSON metadata file and attaches its contents, as "amc.*"
# metadata, to the audio file named by the JSON "audio_url" entry.
#
# The audio file name is the part of audio_url after the last "/" of an
# http(s) URL.  Every JSON key K becomes metadata "amc.K" holding a one
# element list, and "amc.id" is set to the audio file name without its
# extension.  The table is stored in $extrametadata under a regex built
# from the audio file name, the regex is pushed onto $extrametakeys, and
# $extrametafile records which JSON file the metadata came from.
#
# Returns undef when $file is not a plain file this plugin can process,
# -1 when the file cannot be read/parsed or names no audio file, and 1 on
# success.
sub metadata_read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # Read metadata from JSON files
    my $filename = &util::filename_cat($base_dir, $file);
    if (!-f $filename || !$self->can_process_this_file($filename)) {
        return undef;
    }

    # NOTE(review): only blank lines are emitted for GLI here - confirm
    # that no progress markup was intended.
    print STDERR "\n\n" if ($gli);
    print STDERR "AMCMetadataJSONPlugin: processing $file\n" if ($self->{'verbosity'} > 1);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    # add the file to the block list so that it won't be processed in read,
    # as we will do all we can with it here
    $self->block_filename($block_hash, $filename);

    # Read the JSON file to get the metadata
    my ($json_metadata, $read_error) = $self->_read_json_metadata($filename);
    if (defined $read_error) {
        $self->print_error($outhandle, $failhandle, $gli, $filename, "$read_error: $file");
        return -1;
    }

    # We can't associate any metadata without knowing the file to associate it with
    my $audio_url = $json_metadata->{"audio_url"};
    if (!defined $audio_url || ref($audio_url)) {
        $self->print_error($outhandle, $failhandle, $gli, $filename, "No audio_url metadata in JSON file: $file");
        return -1;
    }

    # Strip everything up to and including the last "/" of an http(s) URL.
    my $local_audio_file = $audio_url;
    $local_audio_file =~ s/^http(s?):\/\/.*\///;

    # An empty name would turn into an empty regex key, which would attach
    # this metadata to every file in the collection.
    if ($local_audio_file eq "") {
        $self->print_error($outhandle, $failhandle, $gli, $filename, "audio_url does not name a file in JSON file: $file");
        return -1;
    }

    # The id is the file name minus its extension; when there is no
    # extension fall back to the whole name rather than leaving it undefined.
    my ($amc_id) = ($local_audio_file =~ m/^(.+)(?:\..+?)$/);
    $amc_id = $local_audio_file if (!defined $amc_id);

    # Set up metadata table based on JSON file read in.  A fresh hash is
    # used so that prefixed keys can never collide with keys still waiting
    # to be renamed.
    my $metadata_table = {};
    foreach my $k (keys %$json_metadata) {
        $metadata_table->{'amc.'.$k} = [ $json_metadata->{$k} ];
    }
    $metadata_table->{'amc.id'} = [ $amc_id ];

    # Associate the metadata now
    $local_audio_file = &util::filename_to_regex($local_audio_file);

    # Set up mapping from local_audio_file to the created JSON metadata table
    # so it 'sticks' to the file
    $extrametadata->{$local_audio_file} = $metadata_table;
    push(@$extrametakeys, $local_audio_file);

    # record which file the metadata came from
    if (!defined $extrametafile->{$local_audio_file}) {
        $extrametafile->{$local_audio_file} = {};
    }
    # maps the file to full path
    $extrametafile->{$local_audio_file}->{$file} = $filename;

    return 1;
}

# Reports $error for $file on both the output handle and the fail handle,
# and prints an extra newline on STDERR when running under GLI.
sub print_error {
    my $self = shift(@_);
    my ($outhandle, $failhandle, $gli, $file, $error) = @_;

    print {$outhandle} "AMCMetadataJSONPlugin Error: $file: $error\n";
    print {$failhandle} "AMCMetadataJSONPlugin Error: $file: $error\n";
    print STDERR "\n" if ($gli);
}

1;