source: main/trunk/greenstone2/perllib/plugins/SplitJSONFile.pm@ 37182

Last change on this file since 37182 was 37182, checked in by davidb, 15 months ago

Internal plugin to make things easier when processing JSON files. Currently hardwired for TippleExportJSON format.

File size: 4.2 KB
Line 
1###########################################################################
2#
3# SplitJSONFile.pm
4# -- A plugin for splitting JSON input files into segments that will
5# then be individually processed.
6#
7# Inherits from SplitTextFile, overriding the relevant plugin argument
8# and functions, so the specified nested field within the JSON
9# is used as the split point
10#
11# Copyright 2023 The New Zealand Digital Library Project
12#
13# A component of the Greenstone digital library software
14# from the New Zealand Digital Library Project at the
15# University of Waikato, New Zealand.
16#
17# This program is free software; you can redistribute it and/or modify
18# it under the terms of the GNU General Public License as published by
19# the Free Software Foundation; either version 2 of the License, or
20# (at your option) any later version.
21#
22# This program is distributed in the hope that it will be useful,
23# but WITHOUT ANY WARRANTY; without even the implied warranty of
24# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25# GNU General Public License for more details.
26#
27# You should have received a copy of the GNU General Public License
28# along with this program; if not, write to the Free Software
29# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
30#
31###########################################################################
32
33package SplitJSONFile;
34
35use SplitTextFile;
36use gsprintf 'gsprintf';
37use util;
38
39use strict;
40no strict 'refs'; # allow filehandles to be variables and viceversa
41
# SplitJSONFile is a sub-class of SplitTextFile.
# @ISA is assigned inside BEGIN, i.e. while the file is being compiled.
BEGIN {
    @SplitJSONFile::ISA = qw(SplitTextFile);
}
46
47
# Arguments this plugin declares; new() appends them to the "ArgList"
# array of the hash it is given.
#
# Every argument is an optional string with an empty default.  The third
# entry is named "file_exp" to agree with its description key; it used to
# repeat the name "metadata_exp", which left "file_exp" undeclared and
# "metadata_exp" declared twice.
my $arguments = [
    { 'name' => "split_exp",
      'desc' => "{SplitJSONFile.split_exp}",
      'type' => "string",
      'deft' => "",
      'reqd' => "no" },
    { 'name' => "metadata_exp",
      'desc' => "{SplitJSONFile.metadata_exp}",
      'type' => "string",
      'deft' => "",
      'reqd' => "no" },
    { 'name' => "file_exp",
      'desc' => "{SplitJSONFile.file_exp}",
      'type' => "string",
      'deft' => "",
      'reqd' => "no" },
];
66
# Option record for this plugin; new() appends it to the "OptList" array
# of the hash it is given.  'args' points at the argument list above.
my $options = {
    'name'     => "SplitJSONFile",
    'desc'     => "{SplitJSONFile.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};
72
73
# Constructor.
#
#   $class           - class name the new object is blessed into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - handed on unchanged to the SplitTextFile constructor
#   $hashArgOptLists - hash ref; this plugin's arguments are appended to
#                      its "ArgList" array and its option record to its
#                      "OptList" array
#
# Returns the object built by SplitTextFile, re-blessed into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@{$pluginlist}, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call: the indirect-object form
    # "new SplitTextFile(...)" relies on parser heuristics to be read
    # as a method call.
    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
86
87
# Default expression for the files this plugin processes: any name
# ending in ".json", matched case-insensitively.
sub get_default_process_exp {
    my $json_suffix_exp = '(?i)\.json$';
    return $json_suffix_exp;
}
92
# By default the top-level JSON structure is assumed to be an array.
# An empty split expression is the signal for that.
sub get_default_split_exp {
    return q{};
}
98
# Split a JSON document into segments, one JSON-encoded string per record.
#
#   $textref - reference to the JSON text (decoded with JSON::from_json,
#              which expects a unicode string)
#
# Where the records come from:
#   * top-level JSON object: the elements of its 'contentItems' array
#     ('contentGroups' is not used);
#   * top-level JSON array: the elements of that array;
#   * anything else, or an object without a 'contentItems' array:
#     no records.
#
# Reads $self->{'outhandle'} and $self->{'verbosity'}; when verbosity is
# greater than 1 each segment is printed (UTF-8 encoded) to the outhandle.
#
# Returns a reference to the array of segment strings.  Dies if the text
# cannot be parsed as JSON.
sub split_text_into_segments {
    my $self = shift (@_);
    my ($textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};
    my $show_segments = (defined $verbosity && $verbosity > 1);

    # The JSON module is not loaded anywhere else in this file, so make
    # sure it is available before it is used (no-op if already loaded).
    require JSON;

    my $json = JSON->new();

    my $json_text_content = JSON::from_json($$textref);

    # Pick out the list of records without dereferencing anything that
    # is not actually an array.
    my $json_array = [];
    if (ref($json_text_content) eq 'ARRAY') {
        $json_array = $json_text_content;
    }
    elsif (ref($json_text_content) eq 'HASH'
           && ref($json_text_content->{'contentItems'}) eq 'ARRAY') {
        $json_array = $json_text_content->{'contentItems'};
    }

    if ($show_segments) {
        # Only needed for the diagnostic output below
        require Encode;

        print $outhandle "----------\n";
        print $outhandle "SplitJSONFile -- Segments\n";
        print $outhandle "----------\n";
    }

    my @segments = ();
    foreach my $seg_json_rec (@{$json_array}) {

        my $seg_json_unicode_str = $json->encode($seg_json_rec); # unicode string

        if ($show_segments) {
            my $seg_json_utf8_printable_str = Encode::encode("utf8", $seg_json_unicode_str);

            print $outhandle " --------\n";
            print $outhandle " $seg_json_utf8_printable_str\n";
            print $outhandle " --------\n";
        }

        ## get rid of empty segments
        if ($seg_json_unicode_str ne "") {
            push @segments, $seg_json_unicode_str;
        }
    }

    return \@segments;
}
150
151
152
1531;
Note: See TracBrowser for help on using the repository browser.