Context Navigation

source: main/trunk/greenstone2/perllib/plugins/CSVPlugin.pm@ 28782

Last change on this file since 28782 was 28782, checked in by ak19, 10 years ago
Routine for reading in text files failed to 'decode' from UTF-8 to trigger Unicode-aware strings. Methods changed to use the superclass to ensure this is now done consistently
Property svn:keywords set to `Author Date Id Revision`
File size: 5.0 KB

Line
1	###########################################################################
2	#
3	# CSVPlugin.pm -- A plugin for files in comma-separated value format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2006 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package CSVPlugin;
28
29
30	use SplitTextFile;
31	use MetadataRead;
32	use strict;
33	no strict 'refs'; # allow filehandles to be variables and viceversa
34
35
# Establish the inheritance chain at compile time: CSVPlugin derives from
# MetadataRead first and SplitTextFile second (method lookup follows
# that order).
BEGIN {
    @CSVPlugin::ISA = qw(MetadataRead SplitTextFile);
}
40
41
# Argument descriptions contributed by this plugin. Each entry is a hash
# giving the argument's name, a description key, its type, whether it is
# required, and its default value. The defaults are computed by the
# get_default_* subroutines of this package at the time this statement runs.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name'      => 'split_exp',
        'desc'      => '{SplitTextFile.split_exp}',
        'type'      => 'regexp',
        'reqd'      => 'no',
        'deft'      => get_default_split_exp(),
        'hiddengli' => 'yes',
    },
];
55
56
# Description of the plugin itself: its name, a description key, a set of
# yes/no flags, and the argument list declared for this plugin.
my $options = {
    'name'     => 'CSVPlugin',
    'desc'     => '{CSVPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
63
64
# Default value of the "process_exp" argument: a case-insensitive regular
# expression (returned as a string) matching names that end in ".csv".
sub get_default_process_exp {
    my $csv_suffix_pattern = '(?i)(\.csv)$';
    return $csv_suffix_pattern;
}
69
70
# Default value of the "split_exp" argument: a regular expression (returned
# as a string) matching one line ending, either LF or CRLF, so that the
# input text is split line by line.
sub get_default_split_exp {
    my $line_break_pattern = '\r?\n';
    return $line_break_pattern;
}
75
76
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to the SplitTextFile
#                      constructor
#   $hashArgOptLists - hash ref; this plugin's argument descriptions are
#                      appended to its "ArgList" entry and its option
#                      description to its "OptList" entry
#
# Returns the object built by the SplitTextFile constructor, re-blessed
# into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Use an explicit class-method call rather than the indirect object
    # form ("new SplitTextFile(...)"): this package defines its own "new",
    # which is exactly the situation where the indirect form can be parsed
    # ambiguously.
    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
90
91
# Read a CSV file and peel off its header row.
#
# Arguments (after $self): $filename, $encoding, $language, $textref.
# All four are forwarded unchanged to the superclass read_file, which is
# relied upon to place the file's text in $$textref.
#
# On return:
#   - $$textref has had its blank lines and its first (header) line removed;
#   - $self->{'csv_file_fields'} is a reference to the list of field names
#     taken from the header line, with all spaces stripped from each name.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    # Read in the file the usual ReadTextFile way.
    # This ensures that $$textref is a Unicode-aware string.
    $self->SUPER::read_file(@_);

    #
    # Now top up the processing of the text with what this plugin needs
    #

    my $header_line = "";
    if (defined $$textref) {
        # Drop blank lines at the very start of the text so that they are
        # not mistaken for the header row
        $$textref =~ s/^\s*\n//;

        # Remove any other blank lines so the data is split and processed
        # properly
        $$textref =~ s/\n\s*\n/\n/g;

        # The first line contains the metadata element names
        if ($$textref =~ s/^(.*?)\r?\n//) {
            $header_line = $1;
        }
        else {
            # No line ending at all: the whole text is the header row and
            # there are no data rows. ($1 must not be consulted here, as it
            # would still hold the capture of some earlier, unrelated match.)
            $header_line = $$textref;
            $header_line =~ s/\r$//;
            $$textref = "";
        }
    }

    my @field_names = ();
    my $remaining = $header_line . ","; # To make the regular expressions simpler
    while ($remaining ne "") {
        my $field_name;

        # Handle quoted values
        if ($remaining =~ s/^\"(.*?)\"\,//) {
            $field_name = $1;
        }
        # Normal comma-separated case
        elsif ($remaining =~ s/^(.*?)\,//) {
            $field_name = $1;
        }
        # The line must be formatted incorrectly
        else {
            print STDERR "Error: Badly formatted CSV field line: $remaining.\n";
            last;
        }

        $field_name =~ s/ //g; # Remove any spaces from the field names
        push(@field_names, $field_name);
    }

    $self->{'csv_file_fields'} = \@field_names;
}
134
135
# Turn one line of CSV data into the text and metadata of a document.
#
# Arguments (after $self):
#   $textref  - reference to the CSV line to process
#   $doc_obj  - document object that receives the text and the metadata
#   $pluginfo, $base_dir, $file, $metadata, $gli - accepted but not used here
#
# The raw line becomes the text of the document's top section. Each
# non-empty value on the line is then added as metadata, named after the
# header field in the same position (as stored in
# $self->{'csv_file_fields'}); values with no corresponding field name are
# skipped.
#
# Always returns 1.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $section = $doc_obj->get_top_section();
    my $csv_line = $$textref;
    # Tolerate a missing field list (treated as "no fields")
    my @csv_file_fields = @{ $self->{'csv_file_fields'} || [] };

    # Add the raw line as the document text
    $doc_obj->add_utf8_text($section, $csv_line);

    # Walk the line value by value, pairing each with its field name
    my $i = 0;
    $csv_line .= ","; # To make the regular expressions simpler
    while ($csv_line ne "") {
        # Copy the capture into a lexical straight away: passing $1 itself
        # as an argument would alias it in the callee's @_, where it changes
        # as soon as the callee performs a successful match of its own.
        my $value;

        # Metadata values containing commas are quoted
        if ($csv_line =~ s/^\"(.*?)\"\,//) {
            $value = $1;
        }
        # Normal comma-separated case
        elsif ($csv_line =~ s/^(.*?)\,//) {
            $value = $1;
        }
        # The line must be formatted incorrectly
        else {
            print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
            last;
        }

        # Only bother with non-empty values that have a field name
        if ($value ne "" && defined($csv_file_fields[$i])) {
            $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $value);
        }

        $i++;
    }

    # Record was processed successfully
    return 1;
}
179
180
181	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: