source: main/trunk/greenstone2/perllib/plugins/CSVPlugin.pm@ 21742

Last change on this file since 21742 was 16104, checked in by kjdon, 16 years ago

Tried to make the 'xxxplugin processing file' print statements more consistent. They are now done in read (or read_into_doc_obj) and not in process.

  • Property svn:keywords set to Author Date Id Revision
File size: 5.0 KB
Line 
1###########################################################################
2#
3# CSVPlugin.pm -- A plugin for files in comma-separated value format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2006 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package CSVPlugin;
28
29
30use SplitTextFile;
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34
# CSVPlugin derives from SplitTextFile. The parent class is registered in a
# BEGIN block, so @ISA is populated while the file is still being compiled.
BEGIN {
    @CSVPlugin::ISA = qw(SplitTextFile);
}
39
40
# Descriptions of the arguments this plugin contributes. new() appends this
# list to the shared "ArgList". Both defaults come from the getter subs defined
# further down in this file.
# NOTE(review): the brace-wrapped 'desc' values look like keys into a
# translated string table, and 'hiddengli' presumably hides split_exp from the
# GLI interface -- confirm against the argument-parsing code.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
    {
        'name'      => "split_exp",
        'desc'      => "{SplitTextFile.split_exp}",
        'type'      => "regexp",
        'reqd'      => "no",
        'deft'      => get_default_split_exp(),
        'hiddengli' => "yes",
    },
];
54
55
# Plugin-level option record. new() appends it to the shared "OptList"; the
# 'args' entry points at the argument descriptions declared above.
my $options = {
    'name'     => "CSVPlugin",
    'desc'     => "{CSVPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'explodes' => "yes",
    'args'     => $arguments,
};
62
63
# Default value of the process_exp argument: a case-insensitive pattern
# matching file names that end in ".csv".
sub get_default_process_exp {
    my $csv_suffix_pattern = q^(?i)(\.csv)$^;
    return $csv_suffix_pattern;
}
68
69
# Default value of the split_exp argument: a pattern matching one line
# terminator (LF, optionally preceded by CR), so each line is one record.
sub get_default_split_exp {
    my $line_break_pattern = q^\r?\n^;
    return $line_break_pattern;
}
74
75
# Constructor.
#
# Parameters:
#   $pluginlist      - array reference; this class name is appended to it
#   $inputargs       - passed through untouched to the SplitTextFile constructor
#   $hashArgOptLists - hash reference; this plugin's argument descriptions are
#                      appended to its "ArgList" entry and its option record
#                      to its "OptList" entry
#
# Returns the object built by SplitTextFile, re-blessed into $class.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    # Register this class and what it contributes before delegating upwards
    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);
    return bless($self, $class);
}
89
90
# Read a CSV file into $$textref and record the column names found on its
# first line.
#
# Parameters:
#   $filename - path of the CSV file to read
#   $encoding - encoding handed to the multiread reader
#   $language - accepted for interface compatibility; not used here
#   $textref  - reference to the scalar that receives the file content
#
# On return $$textref holds the file content with blank lines and the first
# (header) line removed, and $self->{'csv_file_fields'} holds a reference to
# the list of field names parsed from that header line (spaces removed).
# If the file cannot be opened an error is printed to STDERR, the field list
# is left empty and $$textref is left without any content.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    # Start from a clean slate so that a failed read never leaves the field
    # names of a previously read file behind
    $self->{'csv_file_fields'} = [];

    # Three-argument open: the filename is never parsed for mode characters.
    # The bareword handle is kept because the reader is given the handle by name.
    if (!open(FILE, '<', $filename)) {
        print STDERR "Error: CSVPlugin could not open $filename for reading: $!\n";
        $$textref = "" unless defined($$textref);
        return;
    }

    # Read the CSV file content
    my $reader = multiread->new();
    $reader->set_handle('CSVPlugin::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);
    close(FILE);
    $$textref = "" unless defined($$textref);

    # Remove any blank lines so the data is split and processed properly
    $$textref =~ s/\n(\s*)\n/\n/g;

    # The first line contains the metadata element names. $1 is only valid
    # when the substitution succeeded, so it is read inside the test.
    my $csv_file_field_line;
    if ($$textref =~ s/^(.*?)\r?\n//) {
        $csv_file_field_line = $1;
    }
    else {
        # No line terminator at all: whatever was read is the header line
        # and there are no records left to process
        $csv_file_field_line = $$textref;
        $csv_file_field_line =~ s/\r$//;
        $$textref = "";
    }

    my @csv_file_fields = ();
    $csv_file_field_line .= ","; # To make the regular expressions simpler
    while ($csv_file_field_line ne "") {
        # Handle quoted values
        if ($csv_file_field_line =~ s/^\"(.*?)\"\,//) {
            my $csv_file_field = $1;
            $csv_file_field =~ s/ //g; # Remove any spaces from the field names
            push(@csv_file_fields, $csv_file_field);
        }
        # Normal comma-separated case
        elsif ($csv_file_field_line =~ s/^(.*?)\,//) {
            my $csv_file_field = $1;
            $csv_file_field =~ s/ //g; # Remove any spaces from the field names
            push(@csv_file_fields, $csv_file_field);
        }
        # The line must be formatted incorrectly
        else {
            print STDERR "Error: Badly formatted CSV field line: $csv_file_field_line.\n";
            last;
        }
    }
    $self->{'csv_file_fields'} = \@csv_file_fields;
}
133
134
# Turn one CSV record into document text and metadata.
#
# Parameters:
#   $textref  - reference to the raw CSV line for this record
#   $doc_obj  - document object that receives the text and the metadata
#   $pluginfo, $base_dir, $file, $metadata, $gli
#             - accepted for interface compatibility; not used here
#
# The raw line is added as the text of the document's top section. Each
# non-empty value is then added as metadata under the field name at the same
# position in $self->{'csv_file_fields'}; values with no corresponding field
# name are ignored.
#
# Always returns 1.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $section = $doc_obj->get_top_section();
    my $csv_line = $$textref;
    # Tolerate a missing field list (no header was read) instead of
    # dereferencing an undefined value
    my @csv_file_fields = @{$self->{'csv_file_fields'} || []};

    # Add the raw line as the document text
    $doc_obj->add_utf8_text($section, $csv_line);

    # Walk the line value by value, pairing each value with its field name
    my $i = 0;
    $csv_line .= ","; # To make the regular expressions simpler
    while ($csv_line ne "") {
        my $csv_value;

        # Well-formed quoted value: a literal quote inside it is written as
        # two quotes, so the value only ends at a quote that is not doubled.
        # This keeps a value such as "He said ""hi"", twice" in one piece.
        if ($csv_line =~ s/^\"((?:[^\"]|\"\")*)\"\,//) {
            $csv_value = $1;
            $csv_value =~ s/\"\"/\"/g; # Undo the quote doubling
        }
        # Quoted value containing stray unescaped quotes: fall back to the
        # lenient match (everything up to the first quote-comma pair)
        elsif ($csv_line =~ s/^\"(.*?)\"\,//) {
            $csv_value = $1;
        }
        # Normal comma-separated case
        elsif ($csv_line =~ s/^(.*?)\,//) {
            $csv_value = $1;
        }
        # The line must be formatted incorrectly
        else {
            print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
            last;
        }

        # Only bother with non-empty values that have a field name
        if ($csv_value ne "" && defined($csv_file_fields[$i])) {
            $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $csv_value);
        }

        $i++;
    }

    # Record was processed successfully
    return 1;
}
178
179
1801;
Note: See TracBrowser for help on using the repository browser.