source: main/trunk/greenstone2/perllib/plugins/CSVPlugin.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 7 years ago

renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes

  • Property svn:keywords set to Author Date Id Revision
File size: 5.0 KB
Line 
1###########################################################################
2#
3# CSVPlugin.pm -- A plugin for files in comma-separated value format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2006 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package CSVPlugin;
28
29
30use SplitTextFile;
31use MetadataRead;
32use strict;
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35
# Inheritance is wired up at compile time: CSVPlugin derives from
# MetadataRead first and SplitTextFile second (method lookup follows
# that order).
BEGIN {
    @CSVPlugin::ISA = qw(MetadataRead SplitTextFile);
}
40
41
# Argument descriptors contributed by this plugin.  Each entry is a hash
# giving the argument name, a description key, its type, whether it is
# required, and its default value.  The defaults are taken from the
# get_default_*_exp subs of this package when this statement runs.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BaseImporter.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name'      => 'split_exp',
        'desc'      => '{SplitTextFile.split_exp}',
        'type'      => 'regexp',
        'reqd'      => 'no',
        'deft'      => get_default_split_exp(),
        'hiddengli' => 'yes',
    },
];
55
56
# Option descriptor for this plugin: its name, description key, a set of
# yes/no flags, and the argument list declared in $arguments.
my $options = {
    'name'     => 'CSVPlugin',
    'desc'     => '{CSVPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
63
64
# Default value for the "process_exp" argument: a case-insensitive
# pattern matching names that end in ".csv".
# Takes no arguments; returns the pattern as a plain string.
sub get_default_process_exp {
    my $csv_suffix_pattern = '(?i)(\.csv)$';
    return $csv_suffix_pattern;
}
69
70
# Default value for the "split_exp" argument: a pattern matching one
# line ending (LF, optionally preceded by CR), so the input text is
# split by line.
# Takes no arguments; returns the pattern as a plain string.
sub get_default_split_exp {
    my $line_ending_pattern = '\r?\n';
    return $line_ending_pattern;
}
75
76
# Constructor.
#
# Parameters:
#   $class           - class name being constructed
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through untouched to SplitTextFile->new
#   $hashArgOptLists - hash ref; this plugin's argument descriptors are
#                      appended to its "ArgList" entry and its option
#                      descriptor to its "OptList" entry
#
# Returns the object built by SplitTextFile->new, re-blessed into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit Class->method call: the indirect-object form
    # "new SplitTextFile(...)" is parse-ambiguous in a package that
    # defines its own new(), so it is avoided here.
    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
90
91
# Read a CSV file and peel off its header row.
#
# Parameters: ($filename, $encoding, $language, $textref).  All four are
# handed unchanged to the inherited read_file; this sub itself only uses
# $textref, a reference to the scalar holding the file text.
#
# After the inherited read_file has run, this sub:
#   * collapses blank (whitespace-only) lines in $$textref,
#   * removes the first line from $$textref and parses it as the list of
#     metadata element names (spaces are stripped from each name),
#   * stores that list as an array ref in $self->{'csv_file_fields'}.
#
# Returns the array ref that was stored.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    # Read in the file the usual inherited way first.  (Original author's
    # note: this ensures that $$textref is a unicode aware string.)
    $self->SUPER::read_file(@_);

    # Remove any blank lines so the data is split and processed properly
    $$textref =~ s/\n(\s*)\n/\n/g;

    # The first line contains the metadata element names.  The header is
    # terminated by a line ending OR by the end of the text, so a file
    # holding only a header line (no trailing newline) is still consumed
    # instead of being left behind as a data row.  The capture is copied
    # only when the substitution succeeded, never read from a stale $1.
    my $csv_file_field_line = "";
    if ($$textref =~ s/^(.*?)\r?(?:\n|\z)//) {
        $csv_file_field_line = $1;
    }

    my @csv_file_fields = ();
    $csv_file_field_line .= ","; # To make the regular expressions simpler
    while ($csv_file_field_line ne "") {
        my $csv_file_field;

        # Well-formed quoted value; an embedded quote is written as ""
        if ($csv_file_field_line =~ s/^"([^"]*(?:""[^"]*)*)",//) {
            $csv_file_field = $1;
            $csv_file_field =~ s/""/"/g;
        }
        # Lenient quoted form, kept for input with stray quotes
        elsif ($csv_file_field_line =~ s/^\"(.*?)\"\,//) {
            $csv_file_field = $1;
        }
        # Normal comma-separated case
        elsif ($csv_file_field_line =~ s/^(.*?)\,//) {
            $csv_file_field = $1;
        }
        # The line must be formatted incorrectly
        else {
            print STDERR "Error: Badly formatted CSV field line: $csv_file_field_line.\n";
            last;
        }

        $csv_file_field =~ s/ //g; # Remove any spaces from the field names
        push(@csv_file_fields, $csv_file_field);
    }

    $self->{'csv_file_fields'} = \@csv_file_fields;
}
134
135
# Turn one CSV record into document text and metadata.
#
# Parameters:
#   $textref  - reference to the scalar holding one CSV line
#   $doc_obj  - document object; this sub calls get_top_section,
#               add_utf8_text and add_utf8_metadata on it
#   $pluginfo, $base_dir, $file, $metadata, $gli - accepted but not used
#
# The raw line is added as the text of the top section.  The line is then
# split into values; the i-th non-empty value is added as metadata under
# the i-th name in $self->{'csv_file_fields'}.  Values with no
# corresponding field name are dropped.
#
# Always returns 1; a badly formatted line is reported on STDERR and the
# rest of that line is skipped.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $section = $doc_obj->get_top_section();
    my $csv_line = $$textref;
    # Tolerate a missing field list rather than dereferencing undef
    my @csv_file_fields = @{ $self->{'csv_file_fields'} || [] };

    # Add the raw line as the document text
    $doc_obj->add_utf8_text($section, $csv_line);

    # Walk the line value by value, pairing each with its field name
    my $i = 0;
    $csv_line .= ","; # To make the regular expressions simpler
    while ($csv_line ne "") {
        # Each capture is copied into a lexical straight away: passing the
        # magic $1 to a method would alias it through @_, where any regex
        # run by the callee could change it.
        my $csv_value;

        # Well-formed quoted value; an embedded quote is written as ""
        if ($csv_line =~ s/^"([^"]*(?:""[^"]*)*)",//) {
            $csv_value = $1;
            $csv_value =~ s/""/"/g;
        }
        # Lenient quoted form, kept for input with stray quotes
        elsif ($csv_line =~ s/^\"(.*?)\"\,//) {
            $csv_value = $1;
        }
        # Normal comma-separated case
        elsif ($csv_line =~ s/^(.*?)\,//) {
            $csv_value = $1;
        }
        # The line must be formatted incorrectly
        else {
            print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
            last;
        }

        # Only bother with non-empty values that have a field name
        if ($csv_value ne "" && defined($csv_file_fields[$i])) {
            $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $csv_value);
        }

        $i++;
    }

    # Record was processed successfully
    return 1;
}
179
180
1811;
Note: See TracBrowser for help on using the repository browser.