Context Navigation

source: main/trunk/greenstone2/perllib/plugins/CSVPlugin.pm@ 36481

Last change on this file since 36481 was 36481, checked in by kjdon, 20 months ago
now fixed up the files to use the new names
File size: 13.3 KB

Line
1	###########################################################################
2	#
3	# CSVPlugin.pm -- A plugin for files in comma-separated value format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2006 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package CSVPlugin;
28
29	use extrametautil;
30
31	use ReadTextFile;
32	use SplitTextFile; # for a couple routines, but we not inheriting
33	use MetadataRead;
34	use CSVFieldSeparator;
35
36	use strict;
37	no strict 'refs'; # allow filehandles to be variables and viceversa
38
39	use Text::CSV;
40
# Compile-time setup: declare the parent classes of CSVPlugin and make
# STDERR accept wide (UTF-8) characters.
BEGIN {
    @CSVPlugin::ISA = qw(MetadataRead ReadTextFile CSVFieldSeparator);
    binmode(STDERR, ":utf8");
}
46
47
# Arguments specific to this plugin. Each entry gives the argument name,
# a description key, its type, whether it is required and its default.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
    {
        'name' => "filename_field",
        'desc' => "{CSVPlugin.filename_field}",
        'type' => "string",
        'reqd' => "no",
        'deft' => "Filename",
    },
];


# Plugin description record; it carries the argument list declared above.
my $options = {
    'name'     => "CSVPlugin",
    'desc'     => "{CSVPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'explodes' => "yes",
    'args'     => $arguments,
};
69
70
# Default process expression: a case-insensitive match on file names
# that end in ".csv".
sub get_default_process_exp {
    my $csv_suffix_pattern = q{(?i)(\.csv)$};
    return $csv_suffix_pattern;
}
75
# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the parent constructors
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays are
#                      extended with this plugin's arguments and options
# The object itself is built by ReadTextFile and then re-blessed into $class.
sub new
{
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    # Register this plugin and its argument/option descriptions.
    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # The CSVFieldSeparator constructor is invoked but its result is not
    # kept; the instance we keep comes from ReadTextFile.
    CSVFieldSeparator->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    $self->{'textcat_store'}   = {};
    $self->{'metapass_srcdoc'} = {}; # which segments have valid metadata_srcdoc

    bless($self, $class);
    return $self;
}
92
# Mark the file as a metadata file.
# If this plugin can process the file, its full path is recorded under
# $block_hash->{'metadata_files'}. The sub returns undef in every case.
sub file_block_read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    my $csv_full_path = &FileUtils::filenameConcatenate($base_dir, $file);
    if (!$self->can_process_this_file($csv_full_path)) {
        return undef;
    }

    my $on_native_windows = ($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin");
    if ($on_native_windows) {
        # convert to full name - paths stored in block hash are long filenames
        $csv_full_path = &util::upgrade_if_dos_filename($csv_full_path);
    }

    # kjdon - the upgrade method converts everything to a lower case drive
    # letter, so separate lower/upper-case drive-letter entries are no
    # longer recorded here; a single entry is stored.
    $block_hash->{'metadata_files'}->{$csv_full_path} = 1;

    return undef;
}
123
# Metadata pass over a CSV file.
#
# The first row of the file is taken as the list of metadata field names
# (spaces and surrounding double quotes are removed from each name). Every
# following row becomes a hash of field name => array ref of values, with
# square brackets in the values entity-encoded.
#
# For each row:
#   * if the row names a file (via the 'filename_field' column) that exists
#     under $base_dir, its metadata is handed to store_meta_in_extrametadata;
#   * otherwise the metadata is kept in $self->{'metadata_store'}->{$file},
#     keyed by row number, for the read() pass to turn into documents.
#
# The detected language/encoding is saved in $self->{'textcat_store'}->{$file}
# for read().
#
# Returns undef when this plugin does not handle the file, -1 when the file
# cannot be opened or its header row cannot be parsed, and 1 otherwise.
sub metadata_read
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file_for_metadata($filename_full_path);

    print STDERR "\n<Processing n='$file' p='CSVPlugin'>\n" if ($gli);
    print STDERR "CSVPlugin: processing $file\n" if ($self->{'verbosity'}) > 1;

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    # don't add to block list, as we may do some processing in read.

    # Do encoding stuff
    my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
    if ($self->{'verbosity'} > 2) {
        print $outhandle "CSVPlugin: reading $file as ($content_encoding,$language)\n";
    }
    # store these values for read
    $self->{'textcat_store'}->{$file} = { 'language' => $language, 'encoding' => $content_encoding };

    # used to record metadata for segments with no src doc
    my $metadata_store = {};
    $self->{'metadata_store'}->{$file} = $metadata_store;

    # Report an unreadable file directly instead of letting it surface
    # later as a bogus "badly formatted header" error.
    my $CSV_FILE;
    if (!open($CSV_FILE, "<:encoding($content_encoding)", $filename_full_path)) {
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Error: Unable to open CSV file: $!");
        return -1;
    }

    my $separate_char = $self->{'csv_field_separator'};

    # An empty value separator means "do not split values".
    my $md_val_sep = $self->{'metadata_value_separator'};
    undef $md_val_sep if (defined $md_val_sep && $md_val_sep eq "");

    my $csv_file_field_line;
    if ($separate_char =~ m/^auto$/i) {
        # Peek at the header line to work out the separator, then rewind
        # so that the header is parsed again through Text::CSV.
        $csv_file_field_line = <$CSV_FILE>;
        $separate_char = $self->resolve_auto($csv_file_field_line,$self->{'plugin_type'});
        seek $CSV_FILE, 0, 0;
    }

    my $csv = Text::CSV->new();
    $csv->sep_char($separate_char);
    $csv->binary(1);

    my $first_row = $csv->getline ($CSV_FILE);
    if (!defined $first_row) {
        # The raw header line is only available when the separator was
        # auto-detected; include the parser's own diagnostic as well.
        my $header_text = defined $csv_file_field_line ? $csv_file_field_line : "";
        my $parser_diag = "" . $csv->error_diag();
        my $message = "Error: Badly formatted CSV header line: $header_text";
        $message .= " ($parser_diag)" if ($parser_diag ne "");
        close ($CSV_FILE);
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, $message);
        return -1;
    }
    my @csv_file_fields = @$first_row;

    my $found_filename_field = 0;
    my $filename_field = $self->{'filename_field'};
    print STDERR "looking for $filename_field field\n";
    for (my $i = 0; $i < scalar(@csv_file_fields); $i++) {
        # Remove any spaces from the field names, and surrounding quotes too
        $csv_file_fields[$i] =~ s/ //g;
        $csv_file_fields[$i] =~ s/^"//;
        $csv_file_fields[$i] =~ s/"$//;

        if ($csv_file_fields[$i] eq $filename_field) {
            $found_filename_field = 1;
        }
    }

    if (!$found_filename_field) {
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field field in CSV file, metadata cannot be assigned to documents, will use metadata only dummy documents");
    }

    my $count = 0;
    while (my $csv_line = $csv->getline($CSV_FILE)) {
        my @md_vals = @$csv_line;
        $count++;

        # Build a hash of metadata name to metadata value for this line
        my %csv_line_metadata;

        for (my $i = 0; $i < scalar(@md_vals); $i++) {
            my $md_val = $md_vals[$i];
            print STDERR "$count: md val = $md_val\n";

            # Only bother with non-empty values that have a column name
            next unless (defined $md_val && $md_val ne "" && defined($csv_file_fields[$i]));

            my $md_name = $csv_file_fields[$i];

            # Split into several values when a value separator is set
            # (the separator is used as a regular expression).
            my @values = (defined $md_val_sep) ? split(/${md_val_sep}/, $md_val) : ($md_val);

            # protect square brackets in metadata values by hex entity encoding them
            # As unescaped square bracket chars in metadata
            # have special meaning in GS' Java runtime code
            for my $meta_value (@values) {
                $meta_value =~ s/\[/&\#091;/g;
                $meta_value =~ s/\]/&\#093;/g;
            }

            # A repeated column name replaces the values of the earlier one.
            $csv_line_metadata{$md_name} = [ @values ];
        }

        # A "Section" column containing digits/dots makes every metadata
        # name on this line get a "///Section/<value>" suffix.
        # NOTE(review): the pattern is unanchored, so any value containing
        # a digit or dot qualifies - confirm this is intended.
        # NOTE(review): the renaming also covers the filename field, so the
        # lookup of $filename_field further down finds nothing for such
        # lines - confirm this is intended.
        my $csv_line_section_array = $csv_line_metadata{"Section"};
        if (defined $csv_line_section_array) {
            my $section_value = shift(@$csv_line_section_array);
            if ($section_value =~ /[\d.]+/m) {
                my $section_suffix = "///Section/" . $section_value;
                foreach my $metaname (keys %csv_line_metadata) {
                    my $new_name = $metaname . $section_suffix;
                    $csv_line_metadata{$new_name} = delete $csv_line_metadata{$metaname};
                }
            } else {
                # not a section number: put the value back as ordinary metadata
                unshift(@$csv_line_section_array, $section_value);
            }
        }

        # do we have filename field?
        # We can't associate any metadata without knowing the file to associate it with
        my $has_srcdoc = 0;
        my $csv_line_filename = "";
        if ($found_filename_field) {
            # is there a srcdoc mentioned?
            my $csv_line_filename_array = $csv_line_metadata{$filename_field};
            if (!defined $csv_line_filename_array) {
                $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field metadata in CSV line num $count");
            } else {
                $csv_line_filename = shift(@$csv_line_filename_array);
                # TODO - have an option for whether we do this or not
                if (&FileUtils::fileExists(&FileUtils::filenameConcatenate($base_dir, $csv_line_filename))) {
                    $has_srcdoc = 1;
                    delete $csv_line_metadata{$filename_field};
                }
            }
        }

        if ($has_srcdoc) {
            print STDERR "storing meta in extra meta for $csv_line_filename\n";
            $self->store_meta_in_extrametadata($csv_line_filename, \%csv_line_metadata, $file, $filename_full_path, $extrametakeys, $extrametadata, $extrametafile);
        } else {
            # no source document: keep the metadata for the read() pass
            print STDERR "storing meta for $count, $csv_line_filename\n";
            $metadata_store->{$count} = \%csv_line_metadata;
        }
    } # while csv_line = csv->getline

    close ($CSV_FILE);

    # Previously the result of close() was returned implicitly.
    return 1;
}
295
# Adapted from read in SplitTextFile.
#
# Creates one document object for every CSV record that metadata_read()
# kept in $self->{'metadata_store'} (records without a source document),
# attaches the stored metadata to it and passes it to $processor.
#
# Returns undef when this plugin does not handle the file, 0 when nothing
# was stored for the file by metadata_read(), otherwise the number of
# document objects produced.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file($filename_full_path);

    my $file_as_given = $file;
    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    # metadata_read() stores its results under the file name exactly as it
    # received it, so fall back to the untidied name when the tidied one
    # has no entry.
    my $store_key = defined $self->{'textcat_store'}->{$file} ? $file : $file_as_given;

    my $le_rec = $self->{'textcat_store'}->{$store_key};
    if (!defined $le_rec) {
        # means no text was found;
        return 0; # not processed but no point in passing it on
    }

    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
    print $outhandle "$self->{'plugin_type'} processing $file\n"
        if $self->{'verbosity'} > 1;

    my $language = $le_rec->{'language'};
    my $encoding = $le_rec->{'encoding'};
    $self->{'textcat_store'}->{$store_key} = undef;

    # a hash of seg num to metadata hash
    my $metadata_store = $self->{'metadata_store'}->{$store_key} || {};

    my $count = 0; # num doc objs produced

    my $plugin_filename_encoding = $self->{'filename_encoding'};
    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);

    my $id;
    print STDERR "num keys = ".scalar(keys (%$metadata_store))."\n";

    # Process each segment in turn. Segment keys are the record numbers
    # assigned by metadata_read(), so sort them numerically; a plain sort
    # would order them 1, 10, 11, 2, ...
    foreach my $segment (sort { $a <=> $b } keys (%$metadata_store)) {
        print $outhandle "processing segment $segment\n"
            if $self->{'verbosity'} > 1;
        $count++;

        # create a new document
        my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
        my $top_section = $doc_obj->get_top_section();
        $doc_obj->add_utf8_metadata($top_section, "Language", $language);
        $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);

        $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename_full_path);
        }
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

        # add our stored metadata from metadata_read pass
        my $segment_metadata = $metadata_store->{$segment};
        $self->extra_metadata($doc_obj, $doc_obj->get_top_section(), $segment_metadata);

        # do any automatic metadata extraction - does this make sense??
        #$self->auto_extract_metadata ($doc_obj);

        # Calculate a "base" document ID once, from the first document.
        if (!defined $id) {
            $id = &SplitTextFile::get_base_OID($self,$doc_obj);
        }

        # add an OID
        &SplitTextFile::add_segment_OID($self, $doc_obj, $id, $segment);

        # process the document
        $processor->process($doc_obj);

        $self->{'num_processed'} ++;
        if ($maxdocs != -1 && $self->{'num_processed'} >= $maxdocs) {
            last;
        }
    }

    delete $self->{'metadata_store'}->{$store_key};

    # Return number of document objects produced
    return $count;
}
386
# Report an error about $file: the same message line goes to both
# $outhandle and $failhandle, and when $gli is true a <ProcessingError>
# element is also written to STDERR.
sub print_error
{
    my ($self, $outhandle, $failhandle, $gli, $file, $error) = @_;

    my $message = "CSVPlugin Error: $file: $error\n";
    foreach my $handle ($outhandle, $failhandle) {
        print {$handle} $message;
    }
    print STDERR "<ProcessingError n='$file' r='$error'/>\n" if ($gli);
}
397
398
399	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: