source: main/trunk/greenstone2/perllib/plugins/CSVPlugin.pm@ 36480

Last change on this file since 36480 was 36480, checked in by kjdon, 3 months ago

new and improved CSVPlugin - handles input encoding, spaces inside fields, assigning metadata to source docs as well as generating dummy docs

File size: 13.3 KB
Line 
1###########################################################################
2#
3# CSVPlugin.pm -- A plugin for files in comma-separated value format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2006 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package NewCSVPlugin;
28
29use extrametautil;
30
31use ReadTextFile;
32use SplitTextFile; # for a couple routines, but we not inheriting
33use MetadataRead;
34use CSVFieldSeparator;
35
36use strict;
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
39use Text::CSV;
40
41sub BEGIN {
42 @NewCSVPlugin::ISA = ('MetadataRead', 'ReadTextFile', 'CSVFieldSeparator');
43 binmode(STDERR, ":utf8");
44
45}
46
47
48my $arguments =
49 [
50 { 'name' => "process_exp",
51 'desc' => "{BaseImporter.process_exp}",
52 'type' => "regexp",
53 'reqd' => "no",
54 'deft' => &get_default_process_exp() },
55 { 'name' => "filename_field",
56 'desc' => "{CSVPlugin.filename_field}",
57 'type' => "string",
58 'reqd' => "no",
59 'deft' => "Filename" }
60 ];
61
62
63my $options = { 'name' => "NewCSVPlugin",
64 'desc' => "{CSVPlugin.desc}",
65 'abstract' => "no",
66 'inherits' => "yes",
67 'explodes' => "yes",
68 'args' => $arguments };
69
70
71# This plugin processes files with the suffix ".csv"
72sub get_default_process_exp {
73 return q^(?i)(\.csv)$^;
74}
75
76sub new
77{
78 my ($class) = shift (@_);
79 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
80 push(@$pluginlist, $class);
81
82 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
83 push(@{$hashArgOptLists->{"OptList"}}, $options);
84
85 new CSVFieldSeparator($pluginlist, $inputargs, $hashArgOptLists);
86 my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
87
88 $self->{'textcat_store'} = {};
89 $self->{'metapass_srcdoc'} = {}; # which segments have valid metadata_srcdoc
90 return bless $self, $class;
91}
92
93# mark the file as a metadata file
94sub file_block_read {
95 my $self = shift (@_);
96 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
97
98 my $filename_full_path = &FileUtils::filenameConcatenate($base_dir, $file);
99 return undef unless $self->can_process_this_file($filename_full_path);
100
101 if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) {
102 # convert to full name - paths stored in block hash are long filenames
103 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
104 }
105# kjdon - upgrade method converts everyhting to lower case drive letter.
106# so would we need the following stuff???
107# my $lower_drive = $filename_full_path;
108# $lower_drive =~ s/^([A-Z]):/\l$1:/i;
109
110# my $upper_drive = $filename_full_path;
111# $upper_drive =~ s/^([A-Z]):/\u$1:/i;
112
113# $block_hash->{'metadata_files'}->{$lower_drive} = 1;
114# $block_hash->{'metadata_files'}->{$upper_drive} = 1;
115
116# }
117# else {
118### $block_hash->{'metadata_files'}->{$filename_full_path} = 1;
119 # }
120 $block_hash->{'metadata_files'}->{$filename_full_path} = 1;
121 return undef; #1
122}
123
124sub metadata_read
125{
126 my $self = shift (@_);
127 my ($pluginfo, $base_dir, $file, $block_hash,
128 $extrametakeys, $extrametadata, $extrametafile,
129 $processor, $gli, $aux) = @_;
130
131 # can we process this file??
132 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
133 return undef unless $self->can_process_this_file_for_metadata($filename_full_path);
134
135 print STDERR "\n<Processing n='$file' p='NewCSVPlugin'>\n" if ($gli);
136 print STDERR "NewCSVPlugin: processing $file\n" if ($self->{'verbosity'}) > 1;
137
138 my $outhandle = $self->{'outhandle'};
139 my $failhandle = $self->{'failhandle'};
140 my $verbosity = $self->{'verbosity'};
141
142 # don't add to block list, as we may do some processing in read.
143
144 # Do encoding stuff
145 my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
146 if ($self->{'verbosity'} > 2) {
147 print $outhandle "NewCSVPlugin: reading $file as ($content_encoding,$language)\n";
148 }
149 # store these values for read
150 my $le_rec = { 'language' => $language, 'encoding' => $content_encoding };
151 $self->{'textcat_store'}->{$file} = $le_rec;
152
153 my $metadata_store = {};
154 $self->{'metadata_store'}->{$file} = $metadata_store; # used to record metadata for segments with no src doc
155
156 my $CSV_FILE;
157 open($CSV_FILE, "<:encoding($content_encoding)", "$filename_full_path");
158 my $separate_char = $self->{'csv_field_separator'};
159
160 my $md_val_sep = $self->{'metadata_value_separator'};
161 undef $md_val_sep if ($md_val_sep eq "");
162
163 my $csv_file_field_line;
164 if ($separate_char =~ m/^auto$/i) {
165
166 $csv_file_field_line = <$CSV_FILE>;
167 $separate_char = $self->resolve_auto($csv_file_field_line,$self->{'plugin_type'});
168 seek $CSV_FILE, 0, 0; # move pointer back to start of file, as we want to read in the fields using csv.
169 }
170
171 my $csv = Text::CSV->new();
172 $csv->sep_char($separate_char);
173 $csv->binary(1);
174
175 my @csv_file_fields = undef;
176
177 my $first_row = $csv->getline ($CSV_FILE);
178 if (defined $first_row) {
179 @csv_file_fields = @$first_row;
180 }
181 else {
182 $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Error: Badly formatted CSV header line: $csv_file_field_line");
183 return -1;
184 }
185
186 my $found_filename_field = 0;
187 my $filename_field = $self->{'filename_field'};
188 print STDERR "looking for $filename_field field\n";
189 for (my $i = 0; $i < scalar(@csv_file_fields); $i++) {
190 # Remove any spaces from the field names, and surrounding quotes too
191 $csv_file_fields[$i] =~ s/ //g;
192 $csv_file_fields[$i] =~ s/^"//;
193 $csv_file_fields[$i] =~ s/"$//;
194
195 if ($csv_file_fields[$i] eq $filename_field) {
196 $found_filename_field = 1;
197 }
198 }
199
200
201 if (!$found_filename_field) {
202 $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field field in CSV file, metadata cannot be assigned to documents, will use metadata only dummy documents");
203
204 }
205
206 my $count = 0;
207 while (my $csv_line = $csv->getline($CSV_FILE)) {
208 my @md_vals = @$csv_line;
209 $count++;
210
211 # Build a hash of metadata name to metadata value for this line
212 my %csv_line_metadata;
213
214 my $md_vals_len = scalar(@md_vals);
215
216 for (my $i=0; $i<$md_vals_len; $i++) {
217 my $md_val = $md_vals[$i];
218 print STDERR "$count: md val = $md_val\n";
219 # Only bother with non-empty values
220 if ($md_val ne "" && defined($csv_file_fields[$i])) {
221
222 my $md_name = $csv_file_fields[$i];
223 $csv_line_metadata{$md_name} = [];
224 if (defined $md_val_sep) {
225
226 my @within_md_vals = split(/${md_val_sep}/,$md_val);
227
228 # protect square brackets in metadata values by hex entity encoding them
229 # As unescaped square bracket chars in metadata
230 # have special meaning in GS' Java runtime code
231 my @escaped_within_md_vals = ();
232 for my $meta_value (@within_md_vals) {
233
234 $meta_value =~ s/\[/&\#091;/g;
235 $meta_value =~ s/\]/&\#093;/g;
236 push(@escaped_within_md_vals, $meta_value);
237 }
238 push (@{$csv_line_metadata{$md_name}}, @escaped_within_md_vals);
239 }
240 else {
241 # protect square brackets in metadata values by hex entity encoding them
242 my $escaped_metadata_value = $md_val;
243 $escaped_metadata_value =~ s/\[/&\#091;/g;
244 $escaped_metadata_value =~ s/\]/&\#093;/g;
245 push (@{$csv_line_metadata{$md_name}}, $escaped_metadata_value);
246 }
247 }
248 }
249
250 my $csv_line_section_array = $csv_line_metadata{"Section"};
251 my $section_suffix = "";
252 if (defined $csv_line_section_array) {
253 my $section_value = shift(@$csv_line_section_array);
254 if ($section_value =~ /[\d.]+/m){
255 my $section_suffix = "///Section/" . $section_value;
256 foreach my $metaname (keys %csv_line_metadata) {
257 my $new_name = $metaname . $section_suffix;
258 $csv_line_metadata{$new_name} = delete $csv_line_metadata{$metaname};
259 }
260 } else{
261 unshift(@$csv_line_section_array, $section_value);
262 }
263 }
264
265 # do we have filename field?
266 # We can't associate any metadata without knowing the file to associate it with
267 my $has_srcdoc = 0;
268 my $csv_line_filename="";;
269 if ($found_filename_field) {
270 # is there a srcdoc mentioned?
271 my $csv_line_filename_array = $csv_line_metadata{$filename_field};
272 if (!defined $csv_line_filename_array) {
273 $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field metadata in CSV line num $count");
274 } else {
275 $csv_line_filename = shift(@$csv_line_filename_array);
276 # TODO - have an option for whether we do this or not
277 if (&FileUtils::fileExists(&FileUtils::filenameConcatenate($base_dir, $csv_line_filename))) {
278 $has_srcdoc = 1;
279
280 delete $csv_line_metadata{$filename_field};
281 }
282 }
283
284 }
285 if ($has_srcdoc) {
286 print STDERR "storing meta in extra meta for $csv_line_filename\n";
287 $self->store_meta_in_extrametadata($csv_line_filename, \%csv_line_metadata, $file, $filename_full_path, $extrametakeys, $extrametadata, $extrametafile);
288 } else {
289 print STDERR "storing meta for $count, $csv_line_filename\n";
290 $metadata_store->{$count} = \%csv_line_metadata;
291 }
292 } # while csv_line = csv->getline
293 close ($CSV_FILE);
294}
295
296#adapted from read in splittextfile
297sub read {
298 my $self = shift (@_);
299 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
300 my $outhandle = $self->{'outhandle'};
301 my $verbosity = $self->{'verbosity'};
302
303 # can we process this file??
304 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
305 return undef unless $self->can_process_this_file($filename_full_path);
306
307 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
308
309 my $le_rec = $self->{'textcat_store'}->{$file};
310 if (!defined $le_rec) {
311 # means no text was found;
312 return 0; # not processed but no point in passing it on
313 }
314
315 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
316 print $outhandle "$self->{'plugin_type'} processing $file\n"
317 if $self->{'verbosity'} > 1;
318
319 my $language = $le_rec->{'language'};
320 my $encoding = $le_rec->{'encoding'};
321 $self->{'textcat_store'}->{$file} = undef;
322
323 my $metadata_store = $self->{'metadata_store'}->{$file}; # a hash of seg num to metadata hash
324
325
326 # Process each segment in turn
327 my $segment = 0; #which segment/record number we have
328 my $count = 0; # num doc objs produced
329
330 my ($filemeta) = $file =~ /([^\\\/]+)$/; #why?
331 my $plugin_filename_encoding = $self->{'filename_encoding'};
332 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
333
334 my $id;
335 print STDERR "num keys = ".scalar(keys (%$metadata_store))."\n";
336 foreach $segment (sort keys (%$metadata_store)) {
337 print $outhandle "processing segment $segment\n"
338 if $self->{'verbosity'} > 1;
339 $count++;
340 # create a new document
341 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
342 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
343 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
344
345 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
346
347 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");
348 if ($self->{'cover_image'}) {
349 $self->associate_cover_image($doc_obj, $filename_full_path);
350 }
351 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
352
353 # include any metadata passed in from previous plugins
354 # note that this metadata is associated with the top level section
355 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
356
357 # add our stored metadata from metadata_read pass
358 my $segment_metadata = $metadata_store->{$segment};
359 $self->extra_metadata($doc_obj, $doc_obj->get_top_section(), $segment_metadata);
360
361 # do any automatic metadata extraction - does this make sense??
362 #$self->auto_extract_metadata ($doc_obj);
363
364 # Calculate a "base" document ID.
365 if (!defined $id) {
366 $id = &SplitTextFile::get_base_OID($self,$doc_obj);
367 }
368
369 # add an OID
370 &SplitTextFile::add_segment_OID($self, $doc_obj, $id, $segment);
371
372 # process the document
373 $processor->process($doc_obj);
374
375 $self->{'num_processed'} ++;
376 if ($maxdocs != -1 && $self->{'num_processed'} >= $maxdocs) {
377 last;
378 }
379 }
380
381 delete $self->{'metadata_store'}->{$file};
382
383 # Return number of document objects produced
384 return $count;
385}
386
387sub print_error
388{
389
390 my $self = shift(@_);
391 my ($outhandle, $failhandle, $gli, $file, $error) = @_;
392
393 print $outhandle "NewCSVPlugin Error: $file: $error\n";
394 print $failhandle "NewCSVPlugin Error: $file: $error\n";
395 print STDERR "<ProcessingError n='$file' r='$error'/>\n" if ($gli);
396}
397
398
3991;
Note: See TracBrowser for help on using the repository browser.