source: gsdl/trunk/perllib/plugins/CSVPlugin.pm@ 15918

Last change on this file since 15918 was 15872, checked in by kjdon, 16 years ago

plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Also, I haven't really tested anything yet...

  • Property svn:keywords set to Author Date Id Revision
File size: 5.2 KB
Line 
1###########################################################################
2#
3# CSVPlugin.pm -- A plugin for files in comma-separated value format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2006 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package CSVPlugin;
28
29
30use SplitTextFile;
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34
# CSVPlugin is a sub-class of SplitTextFile.
# The parent class is recorded in @ISA inside a BEGIN block so that the
# inheritance is already in place while the rest of this file is compiled.
BEGIN {
    @CSVPlugin::ISA = ('SplitTextFile');
}
39
40
# Argument descriptors contributed by this plugin.  new() appends them to
# the "ArgList" entry of the option hash it hands to the parent constructor.
# The defaults come from the two get_default_*_exp() subs defined below.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
    {
        'name'      => "split_exp",
        'desc'      => "{SplitTextFile.split_exp}",
        'type'      => "regexp",
        'reqd'      => "no",
        'deft'      => get_default_split_exp(),
        'hiddengli' => "yes",
    },
];
54
55
# Plugin descriptor for CSVPlugin.  new() appends it to the "OptList" entry
# of the option hash it hands to the parent constructor; 'args' points at the
# argument descriptors declared in $arguments.
my $options = {
    'name'     => "CSVPlugin",
    'desc'     => "{CSVPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'explodes' => "yes",
    'args'     => $arguments,
};
62
63
# This plugin processes files with the suffix ".csv".
#
# Returns the default value of the "process_exp" argument: a regular
# expression (as a string) that matches names ending in ".csv",
# case-insensitively.
sub get_default_process_exp {
    return '(?i)(\.csv)$';
}
68
69
# This plugin splits the input text by line.
#
# Returns the default value of the "split_exp" argument: a regular
# expression (as a string) that matches one line terminator, either
# "\n" or "\r\n".
sub get_default_split_exp {
    return '\r?\n';
}
74
75
# Constructor.
#
# Parameters:
#   $class           - class name the new object is blessed into
#   $pluginlist      - array reference; $class is appended to it
#   $inputargs       - passed through unchanged to the parent constructor
#   $hashArgOptLists - hash reference; this plugin's argument descriptors are
#                      appended to its "ArgList" entry and its plugin
#                      descriptor to its "OptList" entry
#
# Returns the object built by SplitTextFile's constructor, re-blessed into
# $class.
sub new {
    my ($class) = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit Class->new() call: the indirect-object form "new SplitTextFile"
    # is ambiguous inside a package that defines its own new().
    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
89
90
# Read a CSV file and separate its header line from its data lines.
#
# Parameters:
#   $filename - path of the CSV file to read
#   $encoding - encoding name handed to the multiread reader
#   $language - not used by this method
#   $textref  - reference to the scalar the reader fills with the file
#               content; on return the blank lines and the first (header)
#               line have been removed from it
#
# Side effects: stores a reference to the list of column names (taken from
# the header line, with all spaces removed) in $self->{'csv_file_fields'}.
# If the file cannot be opened an error is printed to STDERR, the list of
# column names is set to empty and $$textref is left untouched.
sub read_file {
    my $self = shift(@_);
    my ($filename, $encoding, $language, $textref) = @_;

    # Read the CSV file content.  The handle is named with its package so
    # that it always matches the name given to set_handle() below, and the
    # three-argument open keeps odd characters in $filename from being
    # interpreted as an open mode.
    if (!open(CSVPlugin::FILE, '<', $filename)) {
        print STDERR "Error: CSVPlugin could not open $filename: $!\n";
        $self->{'csv_file_fields'} = [];
        return;
    }
    my $reader = multiread->new();
    $reader->set_handle('CSVPlugin::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);
    close(CSVPlugin::FILE);

    # Remove any blank lines so the data is split and processed properly
    $$textref =~ s/\n(\s*)\n/\n/g;

    # The first line contains the metadata element names.  $1 is only
    # meaningful when the substitution succeeded, so it is read inside the
    # success branch only.
    my $csv_file_field_line;
    if ($$textref =~ s/^(.*?)\r?\n//) {
        $csv_file_field_line = $1;
    }
    else {
        # No line terminator at all: the whole content is the header line
        # and there are no data lines left.
        $csv_file_field_line = $$textref;
        $$textref = "";
    }

    my @csv_file_fields = ();
    $csv_file_field_line .= ",";    # To make the regular expressions simpler
    while ($csv_file_field_line ne "") {
        my $csv_file_field;

        # Handle quoted values
        if ($csv_file_field_line =~ s/^\"(.*?)\"\,//) {
            $csv_file_field = $1;
        }
        # Normal comma-separated case
        elsif ($csv_file_field_line =~ s/^(.*?)\,//) {
            $csv_file_field = $1;
        }
        # The line must be formatted incorrectly
        else {
            print STDERR "Error: Badly formatted CSV field line: $csv_file_field_line.\n";
            last;
        }

        $csv_file_field =~ s/ //g;    # Remove any spaces from the field names
        push(@csv_file_fields, $csv_file_field);
    }
    $self->{'csv_file_fields'} = \@csv_file_fields;
}
133
134
# Turn one CSV data line into document text and metadata.
#
# Parameters:
#   $textref   - reference to the CSV line to process (not modified)
#   $pluginfo  - not used by this method
#   $base_dir  - not used by this method
#   $file      - file name, used only in the progress messages
#   $metadata  - not used by this method
#   $doc_obj   - document object that receives the text and the metadata
#   $gli       - when true, a "<Processing ...>" marker is printed to STDERR
#
# The raw line is added as the text of the document's top section.  Each
# value on the line is then added as metadata under the column name found at
# the same position in $self->{'csv_file_fields'}; empty values and values
# beyond the last known column are skipped.
#
# Returns 1.
sub process {
    my $self = shift(@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    my $section = $doc_obj->get_top_section();
    my $csv_line = $$textref;
    # Fall back to an empty list if no header has been recorded.
    my @csv_file_fields = @{ $self->{'csv_file_fields'} || [] };

    # Report that we're processing the file
    print STDERR "\n<Processing n='$file' p='CSVPlugin'>\n" if ($gli);
    print $outhandle "CSVPlugin: processing $file\n" if ($self->{'verbosity'} > 1);

    # Add the raw line as the document text
    $doc_obj->add_utf8_text($section, $csv_line);

    # Walk the line value by value, pairing each with its column name
    my $i = 0;
    $csv_line .= ",";    # To make the regular expressions simpler
    while ($csv_line ne "") {
        # The capture is copied into a lexical straight away: passing $1
        # itself to a method would alias the magic variable, and any regex
        # run inside that method would change the value it sees.
        my $csv_value;

        # Metadata values containing commas are quoted
        if ($csv_line =~ s/^\"(.*?)\"\,//) {
            $csv_value = $1;
        }
        # Normal comma-separated case
        elsif ($csv_line =~ s/^(.*?)\,//) {
            $csv_value = $1;
        }
        # The line must be formatted incorrectly
        else {
            print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
            last;
        }

        # Only bother with non-empty values that have a column name
        if ($csv_value ne "" && defined($csv_file_fields[$i])) {
            $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $csv_value);
        }

        $i++;
    }

    # Record was processed successfully
    return 1;
}
182
183
1841;
Note: See TracBrowser for help on using the repository browser.