source: main/trunk/greenstone2/perllib/plugins/CSVPlugin.pm@ 36533

Last change on this file since 36533 was 36533, checked in by kjdon, 20 months ago

added CSVPLugin option use_namespace_for_field_names - prepend the specified namespace to each field name. This namespace will need to be used in -filename_field and -OIDmetadata options, and in format statments

File size: 15.4 KB
Line 
1###########################################################################
2#
3# CSVPlugin.pm -- A plugin for files in comma-separated value format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2006 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package CSVPlugin;
28
29use extrametautil;
30
31use ReadTextFile;
32use SplitTextFile; # for a couple routines, but we not inheriting
33use MetadataRead;
34use CSVFieldSeparator;
35
36use strict;
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
39use Text::CSV;
40
41sub BEGIN {
42 @CSVPlugin::ISA = ('MetadataRead', 'ReadTextFile', 'CSVFieldSeparator');
43 binmode(STDERR, ":utf8");
44
45}
46
47
48my $arguments =
49 [
50 { 'name' => "process_exp",
51 'desc' => "{BaseImporter.process_exp}",
52 'type' => "regexp",
53 'reqd' => "no",
54 'deft' => &get_default_process_exp() },
55 { 'name' => "filename_field",
56 'desc' => "{CSVPlugin.filename_field}",
57 'type' => "string",
58 'reqd' => "no",
59 'deft' => "Filename" },
60 { 'name' => "no_document_if_source_unspecified",
61 'desc' => "{CSVPlugin.no_document_if_source_unspecified}",
62 'type' => "flag",
63 'reqd' => "no"},
64 { 'name' => "no_document_if_source_missing",
65 'desc' => "{CSVPlugin.no_document_if_source_missing}",
66 'type' => "flag",
67 'reqd' => "no"},
68 { 'name' => "use_namespace_for_field_names",
69 'desc' => "{CSVPlugin.use_namespace_for_field_names}",
70 'type' => "string",
71 'reqd' => "no"},
72 { 'name' => "store_field_values_as_document_text",
73 'desc' => "{CSVPlugin.store_field_values_as_document_text}",
74 'type' => "flag",
75 'reqd' => "no"},
76
77
78 ];
79
80
81my $options = { 'name' => "CSVPlugin",
82 'desc' => "{CSVPlugin.desc}",
83 'abstract' => "no",
84 'inherits' => "yes",
85 'explodes' => "yes",
86 'args' => $arguments };
87
88
89# This plugin processes files with the suffix ".csv"
90sub get_default_process_exp {
91 return q^(?i)(\.csv)$^;
92}
93
94sub new
95{
96 my ($class) = shift (@_);
97 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
98 push(@$pluginlist, $class);
99
100 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
101 push(@{$hashArgOptLists->{"OptList"}}, $options);
102
103 new CSVFieldSeparator($pluginlist, $inputargs, $hashArgOptLists);
104 my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
105
106 $self->{'textcat_store'} = {};
107 $self->{'metapass_srcdoc'} = {}; # which segments have valid metadata_srcdoc
108
109 if ($self->{'use_namespace_for_field_names'} =~ /^\s*$/) {
110 $self->{'use_namespace_for_field_names'} = undef;
111 }
112
113 return bless $self, $class;
114}
115
116
117# mark the file as a metadata file
118sub file_block_read {
119 my $self = shift (@_);
120 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
121
122 my $filename_full_path = &FileUtils::filenameConcatenate($base_dir, $file);
123 return undef unless $self->can_process_this_file($filename_full_path);
124
125 if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) {
126 # convert to full name - paths stored in block hash are long filenames
127 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
128 }
129# kjdon - upgrade method converts everyhting to lower case drive letter.
130# so would we need the following stuff???
131# my $lower_drive = $filename_full_path;
132# $lower_drive =~ s/^([A-Z]):/\l$1:/i;
133
134# my $upper_drive = $filename_full_path;
135# $upper_drive =~ s/^([A-Z]):/\u$1:/i;
136
137# $block_hash->{'metadata_files'}->{$lower_drive} = 1;
138# $block_hash->{'metadata_files'}->{$upper_drive} = 1;
139
140# }
141# else {
142### $block_hash->{'metadata_files'}->{$filename_full_path} = 1;
143 # }
144 $block_hash->{'metadata_files'}->{$filename_full_path} = 1;
145 return undef; #1
146}
147
148sub metadata_read
149{
150 my $self = shift (@_);
151 my ($pluginfo, $base_dir, $file, $block_hash,
152 $extrametakeys, $extrametadata, $extrametafile,
153 $processor, $gli, $aux) = @_;
154
155 # can we process this file??
156 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
157 return undef unless $self->can_process_this_file_for_metadata($filename_full_path);
158
159 print STDERR "\n<Processing n='$file' p='CSVPlugin'>\n" if ($gli);
160 print STDERR "CSVPlugin: processing $file\n" if ($self->{'verbosity'}) > 1;
161
162 my $outhandle = $self->{'outhandle'};
163 my $failhandle = $self->{'failhandle'};
164 my $verbosity = $self->{'verbosity'};
165
166 # don't add to block list, as we may do some processing in read.
167
168 # Do encoding stuff
169 my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
170 if ($self->{'verbosity'} > 2) {
171 print $outhandle "CSVPlugin: reading $file as ($content_encoding,$language)\n";
172 }
173 # store these values for read
174 my $le_rec = { 'language' => $language, 'encoding' => $content_encoding };
175 $self->{'textcat_store'}->{$file} = $le_rec;
176
177 my $metadata_store = {};
178 $self->{'metadata_store'}->{$file} = $metadata_store; # used to record metadata for segments with no src doc
179
180 my $CSV_FILE;
181 open($CSV_FILE, "<:encoding($content_encoding)", "$filename_full_path");
182 my $separate_char = $self->{'csv_field_separator'};
183
184 my $md_val_sep = $self->{'metadata_value_separator'};
185 undef $md_val_sep if ($md_val_sep eq "");
186
187 my $csv_file_field_line;
188 if ($separate_char =~ m/^auto$/i) {
189
190 $csv_file_field_line = <$CSV_FILE>;
191 $separate_char = $self->resolve_auto($csv_file_field_line,$self->{'plugin_type'});
192 seek $CSV_FILE, 0, 0; # move pointer back to start of file, as we want to read in the fields using csv.
193 }
194
195 my $csv = Text::CSV->new();
196 $csv->sep_char($separate_char);
197 $csv->binary(1);
198
199 my @csv_file_fields = undef;
200
201 my $first_row = $csv->getline ($CSV_FILE);
202 if (defined $first_row) {
203 @csv_file_fields = @$first_row;
204 }
205 else {
206 $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Error: Badly formatted CSV header line: $csv_file_field_line");
207 return -1;
208 }
209
210 my $found_filename_field = 0;
211 my $filename_field = $self->{'filename_field'};
212 for (my $i = 0; $i < scalar(@csv_file_fields); $i++) {
213 # Remove any spaces from the field names, and surrounding quotes too
214 $csv_file_fields[$i] =~ s/ //g;
215 $csv_file_fields[$i] =~ s/^"//;
216 $csv_file_fields[$i] =~ s/"$//;
217
218 if ($self->{'use_namespace_for_field_names'}) {
219 $csv_file_fields[$i] = $self->{'use_namespace_for_field_names'}. "." . $csv_file_fields[$i];
220 }
221 if ($csv_file_fields[$i] eq $filename_field) {
222 $found_filename_field = 1;
223 }
224 }
225
226
227 if (!$found_filename_field) {
228 $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field field in CSV file, metadata cannot be assigned to documents, will use metadata only dummy documents");
229
230 }
231
232 my $count = 0;
233 while (my $csv_line = $csv->getline($CSV_FILE)) {
234 my @md_vals = @$csv_line;
235 $count++;
236
237 # Build a hash of metadata name to metadata value for this line
238 my %csv_line_metadata;
239
240 my $md_vals_len = scalar(@md_vals);
241
242 for (my $i=0; $i<$md_vals_len; $i++) {
243 my $md_val = $md_vals[$i];
244 # Only bother with non-empty values
245 if ($md_val ne "" && defined($csv_file_fields[$i])) {
246
247 my $md_name = $csv_file_fields[$i];
248 $csv_line_metadata{$md_name} = [];
249 if (defined $md_val_sep) {
250
251 my @within_md_vals = split(/${md_val_sep}/,$md_val);
252
253 # protect square brackets in metadata values by hex entity encoding them
254 # As unescaped square bracket chars in metadata
255 # have special meaning in GS' Java runtime code
256 my @escaped_within_md_vals = ();
257 for my $meta_value (@within_md_vals) {
258
259 $meta_value =~ s/\[/&\#091;/g;
260 $meta_value =~ s/\]/&\#093;/g;
261 push(@escaped_within_md_vals, $meta_value);
262 }
263 push (@{$csv_line_metadata{$md_name}}, @escaped_within_md_vals);
264 }
265 else {
266 # protect square brackets in metadata values by hex entity encoding them
267 my $escaped_metadata_value = $md_val;
268 $escaped_metadata_value =~ s/\[/&\#091;/g;
269 $escaped_metadata_value =~ s/\]/&\#093;/g;
270 push (@{$csv_line_metadata{$md_name}}, $escaped_metadata_value);
271 }
272 }
273 }
274
275 my $csv_line_section_array = $csv_line_metadata{"Section"};
276 my $section_suffix = "";
277 if (defined $csv_line_section_array) {
278 my $section_value = shift(@$csv_line_section_array);
279 if ($section_value =~ /[\d.]+/m){
280 my $section_suffix = "///Section/" . $section_value;
281 foreach my $metaname (keys %csv_line_metadata) {
282 my $new_name = $metaname . $section_suffix;
283 $csv_line_metadata{$new_name} = delete $csv_line_metadata{$metaname};
284 }
285 } else{
286 unshift(@$csv_line_section_array, $section_value);
287 }
288 }
289
290 # do we have filename field?
291 # We can't associate any metadata without knowing the file to associate it with
292 my $has_srcdoc = 0;
293 my $missing_srcdoc = 0;
294 my $csv_line_filename="";;
295 if ($found_filename_field) {
296 # is there a srcdoc mentioned?
297 my $csv_line_filename_array = $csv_line_metadata{$filename_field};
298 if (!defined $csv_line_filename_array) {
299 $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field metadata in CSV line num $count");
300 } else {
301 $csv_line_filename = shift(@$csv_line_filename_array);
302 # TODO - have an option for whether we do this or not
303 if (&FileUtils::fileExists(&FileUtils::filenameConcatenate($base_dir, $csv_line_filename))) {
304 $has_srcdoc = 1;
305
306 delete $csv_line_metadata{$filename_field};
307 } else {
308 $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "$csv_line_filename in $filename_field metadata in CSV line num $count is not found");
309 $missing_srcdoc = 1; # there was one mentioned but its not found
310 }
311 }
312
313 }
314 if ($has_srcdoc) {
315 print $outhandle "Storing metadata, segment $count, for document $csv_line_filename\n" if ($verbosity > 2);
316 $self->store_meta_in_extrametadata($csv_line_filename, \%csv_line_metadata, $file, $filename_full_path, $extrametakeys, $extrametadata, $extrametafile);
317 } else {
318 my $store_for_dummy = 1;
319 if ($missing_srcdoc && $self->{'no_document_if_source_missing'}) {
320 $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path,"Not storing metadata for line $count as source doc is missing");
321 $store_for_dummy = 0;
322 } elsif(!$missing_srcdoc && $self->{'no_document_if_source_unspecified'}) {
323 $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path,"Not storing metadata for line $count as source doc is unspecified");
324 $store_for_dummy = 0;
325 }
326 if ($store_for_dummy) {
327
328 print $outhandle "Storing metadata for dummy document, segment $count\n" if ($verbosity > 2);
329 $metadata_store->{$count} = \%csv_line_metadata;
330 }
331 }
332 } # while csv_line = csv->getline
333 close ($CSV_FILE);
334}
335
336#adapted from read in splittextfile
337sub read {
338 my $self = shift (@_);
339 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
340 my $outhandle = $self->{'outhandle'};
341 my $verbosity = $self->{'verbosity'};
342
343 # can we process this file??
344 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
345 return undef unless $self->can_process_this_file($filename_full_path);
346
347 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
348
349 my $le_rec = $self->{'textcat_store'}->{$file};
350 if (!defined $le_rec) {
351 # means no text was found;
352 return 0; # not processed but no point in passing it on
353 }
354
355 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
356 print $outhandle "$self->{'plugin_type'} processing $file\n"
357 if $self->{'verbosity'} > 1;
358
359 my $language = $le_rec->{'language'};
360 my $encoding = $le_rec->{'encoding'};
361 $self->{'textcat_store'}->{$file} = undef;
362
363 my $metadata_store = $self->{'metadata_store'}->{$file}; # a hash of seg num to metadata hash
364
365
366 # Process each segment in turn
367 my $segment = 0; #which segment/record number we have
368 my $count = 0; # num doc objs produced
369
370 my ($filemeta) = $file =~ /([^\\\/]+)$/; #why?
371 my $plugin_filename_encoding = $self->{'filename_encoding'};
372 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
373
374 my $id;
375
376 foreach $segment (sort { $a <=> $b } keys (%$metadata_store)) {
377 print $outhandle "processing segment $segment as its own document\n"
378 if $self->{'verbosity'} > 1;
379 $count++;
380 # create a new document
381 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
382 my $cursection = $doc_obj->get_top_section();
383 $doc_obj->add_utf8_metadata($cursection, "Language", $language);
384 $doc_obj->add_utf8_metadata($cursection, "Encoding", $encoding);
385
386 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
387
388 $doc_obj->add_utf8_metadata($cursection, "SourceSegment", "$segment");
389 if ($self->{'cover_image'}) {
390 $self->associate_cover_image($doc_obj, $filename_full_path);
391 }
392 $doc_obj->add_utf8_metadata($cursection, "Plugin", "$self->{'plugin_type'}");
393
394 # include any metadata passed in from previous plugins
395 # note that this metadata is associated with the top level section
396 $self->extra_metadata ($doc_obj, $cursection, $metadata);
397
398 # add our stored metadata from metadata_read pass
399 my $segment_metadata = $metadata_store->{$segment};
400 $self->extra_metadata($doc_obj, $cursection, $segment_metadata);
401 if ($self->{'store_field_values_as_document_text'}) {
402 my $new_text = "";
403 foreach my $f (keys %$segment_metadata) {
404 my $values = $segment_metadata->{$f};
405 $new_text .= join (", ", @$values).", ";
406 }
407
408 $doc_obj->add_utf8_text($cursection, $new_text);
409 }
410 # do any automatic metadata extraction - does this make sense??
411 #$self->auto_extract_metadata ($doc_obj);
412
413 # Calculate a "base" document ID.
414 if (!defined $id) {
415 $id = &SplitTextFile::get_base_OID($self,$doc_obj);
416 }
417
418 # add an OID
419 &SplitTextFile::add_segment_OID($self, $doc_obj, $id, $segment);
420
421 # process the document
422 $processor->process($doc_obj);
423
424 $self->{'num_processed'} ++;
425 if ($maxdocs != -1 && $self->{'num_processed'} >= $maxdocs) {
426 last;
427 }
428 }
429
430 delete $self->{'metadata_store'}->{$file};
431
432 # Return number of document objects produced
433 return $count;
434}
435
436sub print_warning {
437 my $self = shift(@_);
438 my ($outhandle, $failhandle, $gli, $file, $error) = @_;
439
440 print $outhandle "CSVPlugin Warning: $file: $error\n";
441 print $failhandle "CSVPlugin Warning: $file: $error\n";
442 print STDERR "<ProcessingError n='$file' r='$error'/>\n" if ($gli);
443
444}
445sub print_error
446{
447
448 my $self = shift(@_);
449 my ($outhandle, $failhandle, $gli, $file, $error) = @_;
450
451 print $outhandle "CSVPlugin Error: $file: $error\n";
452 print $failhandle "CSVPlugin Error: $file: $error\n";
453 print STDERR "<ProcessingError n='$file' r='$error'/>\n" if ($gli);
454}
455
456
4571;
Note: See TracBrowser for help on using the repository browser.