source: main/trunk/greenstone2/perllib/plugins/CSVDeprecatedPlugin.pm@ 36479

Last change on this file since 36479 was 36479, checked in by kjdon, 20 months ago

renaming the old CSVPlugin and MetadataCSVPlugin to Deprecated versions, prior to adding the new CSVPlugin which handles both cases, and is new and improved.

  • Property svn:keywords set to Author Date Id Revision
File size: 5.6 KB
RevLine 
[11918]1###########################################################################
2#
[15872]3# CSVPlugin.pm -- A plugin for files in comma-separated value format
[11918]4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2006 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
[15872]27package CSVPlugin;
[11918]28
[15872]29use SplitTextFile;
[24794]30use MetadataRead;
[34249]31use CSVFieldSeparator;
32
[11918]33use strict;
[12610]34no strict 'refs'; # allow filehandles to be variables and viceversa
[11918]35
[34249]36use Text::CSV;
[11918]37
# Set up inheritance at compile time.  CSVPlugin derives from MetadataRead,
# SplitTextFile and CSVFieldSeparator, listed in that order in @ISA.
BEGIN {
    @CSVPlugin::ISA = qw(MetadataRead SplitTextFile CSVFieldSeparator);
}
42
43
# Argument descriptors contributed by this plugin.  Each entry is a hash
# describing one option; the 'deft' values are obtained by calling the
# getter subs below at the moment this statement is executed.
my $arguments = [
    {
        name => 'process_exp',
        desc => '{BaseImporter.process_exp}',
        type => 'regexp',
        reqd => 'no',
        deft => get_default_process_exp(),
    },
    {
        name      => 'split_exp',
        desc      => '{SplitTextFile.split_exp}',
        type      => 'regexp',
        reqd      => 'no',
        deft      => get_default_split_exp(),
        hiddengli => 'yes',
    },
];
[11918]58
59
# Plugin descriptor: names the plugin and attaches the argument list
# declared above.  It is pushed onto the shared "OptList" by new().
my $options = {
    name     => 'CSVPlugin',
    desc     => '{CSVPlugin.desc}',
    abstract => 'no',
    inherits => 'yes',
    explodes => 'yes',
    args     => $arguments,
};
66
67
# Default value for the process_exp argument: a case-insensitive pattern
# that matches filenames ending in ".csv".
sub get_default_process_exp {
    my $csv_suffix_exp = '(?i)(\.csv)$';
    return $csv_suffix_exp;
}
72
73
# Default value for the split_exp argument: a pattern matching one line
# terminator (LF, optionally preceded by CR), so the text is split by line.
sub get_default_split_exp {
    my $line_break_exp = '\r?\n';
    return $line_break_exp;
}
78
79
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the parent constructors
#   $hashArgOptLists - hash ref; this plugin's argument descriptors are
#                      appended to its "ArgList" entry and its option
#                      descriptor to its "OptList" entry
#
# Returns the object built by SplitTextFile->new, re-blessed into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Use explicit Class->new(...) calls rather than the indirect-object form
    # ("new Class(...)"), whose parsing depends on what is declared at compile
    # time and is easy to get wrong inside a package that defines its own new().

    # The CSVFieldSeparator object itself is discarded; the constructor is
    # called only for its effect on the shared argument structures.
    # NOTE(review): presumably this registers the separator options in
    # $hashArgOptLists - confirm against CSVFieldSeparator.
    CSVFieldSeparator->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
94
95
# Read a CSV file and strip its header line.
#
# Arguments:
#   $filename - path of the file; also the key under which the header field
#               names are stored in $self->{'csv_file_fields'}
#   $encoding, $language - passed through to the parent read_file
#   $textref  - reference to the scalar holding the file text; modified in
#               place (blank lines removed, header line removed)
#
# Side effects on $self:
#   {'csv_file_fields'}->{$filename} - array ref of header field names
#   {'separate_char'}                - the field separator used for this file
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    # Read in file the usual ReadTextFile way
    # This ensures that $textref is a unicode aware string
    $self->SUPER::read_file(@_);

    #
    # Now top-up the processing of the text with what this plugin
    # needs
    #

    # Remove any blank lines so the data is split and processed properly
    $$textref =~ s/\n(\s*)\n/\n/g;

    # The first line contains the metadata element names.  Only trust $1 when
    # the substitution actually matched; otherwise it would hold a stale
    # capture (or undef) and the header would be left in the text as data.
    my $csv_file_field_line;
    if ($$textref =~ s/^(.*?)\r?\n//) {
	$csv_file_field_line = $1;
    }
    else {
	# No line terminator at all: the whole text is the header line and
	# there are no data records.
	$csv_file_field_line = defined($$textref) ? $$textref : "";
	$csv_file_field_line =~ s/\r$//;
	$$textref = "";
    }
    my @csv_file_fields = ();

    my $separate_char = $self->{'csv_field_separator'};
    if (defined($separate_char) && $separate_char =~ m/^auto$/i) {
	$separate_char = $self->resolve_auto($csv_file_field_line,$self->{'plugin_type'});
    }
    # Always record the separator actually used (resolved from 'auto' or
    # given explicitly): process() reads $self->{'separate_char'} and would
    # otherwise see an undefined value for an explicitly configured separator.
    $self->{'separate_char'} = $separate_char;

    my $csv = Text::CSV->new();
    $csv->sep_char($separate_char);

    if ($csv->parse($csv_file_field_line)) {
	@csv_file_fields = $csv->fields;
    }
    else {
	print STDERR "Error: Badly formatted CSV field line: $csv_file_field_line.\n";
    }

    # Field names are stored per file so that several CSV files can be
    # handled by the same plugin object.
    $self->{'csv_file_fields'}->{$filename} = \@csv_file_fields;
    ###print STDERR "**** CSV file fields joined ($filename) = ", join(" ||| ", @{$self->{'csv_file_fields'}->{$filename}}), "\n";

}
140
141
# Turn one CSV record (one line of the file) into document text and metadata.
#
# Arguments:
#   $textref  - reference to the raw CSV line
#   $base_dir, $file - joined to form the key used to look up the header
#               field names stored by read_file()
#   $doc_obj  - document object that receives the text and metadata
#   $pluginfo, $metadata, $gli - accepted but not used here
#
# The raw line is added as the text of the top section.  Each non-empty
# field whose column has a header name is added as metadata under that name;
# when metadata_value_separator is set, the field is split on it and each
# piece is added as a separate value.
#
# Always returns 1, including when the line cannot be parsed (an error is
# printed to STDERR in that case).
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $section = $doc_obj->get_top_section();
    my $csv_line = $$textref;

    # Look up the header fields recorded for this file.  Fall back to an
    # empty list when nothing was recorded instead of dereferencing undef.
    my $filename_full_path = FileUtils::filenameConcatenate($base_dir,$file);
    my $fields_ref = $self->{'csv_file_fields'}->{$filename_full_path};
    my @csv_file_fields = (ref($fields_ref) eq 'ARRAY') ? @{$fields_ref} : ();

    ###print STDERR "**** CSV file fields joined = ", join(" ||| ", @csv_file_fields), "\n";

    # Add the raw line as the document text
    $doc_obj->add_utf8_text($section, $csv_line);

    # Prefer the separator recorded by read_file(); if none was recorded,
    # fall back to an explicitly configured (non-'auto') separator.
    my $separate_char = $self->{'separate_char'};
    if (!defined($separate_char)) {
	my $configured_char = $self->{'csv_field_separator'};
	if (defined($configured_char) && $configured_char !~ m/^auto$/i) {
	    $separate_char = $configured_char;
	}
    }

    # An unset or empty value separator means "do not split field values"
    my $md_val_sep = $self->{'metadata_value_separator'};
    undef $md_val_sep if (defined($md_val_sep) && $md_val_sep eq "");

    my $csv = Text::CSV->new();
    # Leave the Text::CSV default separator in place when none is known
    $csv->sep_char($separate_char) if (defined($separate_char));

    # Build a hash of metadata name to metadata value for this line
    if ($csv->parse($csv_line)) {
	my @md_vals = $csv->fields;

	for my $i (0 .. $#md_vals) {
	    my $md_val  = $md_vals[$i];
	    my $md_name = $csv_file_fields[$i];

	    # Only bother with non-empty values in columns that have a name
	    next unless (defined($md_val) && $md_val ne "" && defined($md_name));

	    # NOTE(review): the separator is interpolated as a regular
	    # expression, so regex metacharacters in it are significant (a
	    # bare "|" would split between every character).  Kept as is for
	    # compatibility - confirm whether a literal match was intended.
	    my @within_md_vals = defined($md_val_sep)
		? split(/${md_val_sep}/,$md_val)
		: ($md_val);

	    foreach my $within_md_val (@within_md_vals) {
		$doc_obj->add_utf8_metadata($section, $md_name, $within_md_val);
	    }
	}
    }
    else {
	print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
    }

    # Report the record as handled (also after a parse error, which is only
    # reported on STDERR)
    return 1;
}
197
198
1991;
Note: See TracBrowser for help on using the repository browser.