root/main/trunk/greenstone2/perllib/plugins/CSVPlugin.pm @ 22597

Revision 16104, 5.0 KB (checked in by kjdon, 12 years ago)

tried to make the 'xxxplugin processing file' print statements more consistent. They are now done in read (or read_into_doc_obj) and not process

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# CSVPlugin.pm -- A plugin for files in comma-separated value format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2006 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package CSVPlugin;
28
29
30use SplitTextFile;
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34
# Inheritance is set up at compile time: CSVPlugin derives from SplitTextFile.
BEGIN {
    @CSVPlugin::ISA = qw(SplitTextFile);
}
39
40
# Argument descriptions for this plugin.  Each entry is a hash of attributes
# for one option; the defaults come from the get_default_* subs in this file.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
    {
        'name'      => "split_exp",
        'desc'      => "{SplitTextFile.split_exp}",
        'type'      => "regexp",
        'reqd'      => "no",
        'deft'      => get_default_split_exp(),
        'hiddengli' => "yes",
    },
];


# Plugin-level option table; it carries the argument list above under 'args'.
my $options = {
    'name'     => "CSVPlugin",
    'desc'     => "{CSVPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'explodes' => "yes",
    'args'     => $arguments,
};
62
63
# Default file-name pattern for this plugin: names ending in ".csv",
# matched case-insensitively.  Returns the pattern as a string.
sub get_default_process_exp {
    my $csv_suffix_pattern = '(?i)(\.csv)$';
    return $csv_suffix_pattern;
}
68
69   
# Default record separator for this plugin: one line break, with an optional
# carriage return before the newline.  Returns the pattern as a string.
sub get_default_split_exp {
    my $line_break_pattern = '\r?\n';
    return $line_break_pattern;
}
74
75
# Constructor.
#
# Arguments:
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the SplitTextFile constructor
#   $hashArgOptLists - hash ref; this plugin's argument descriptions are
#                      appended to its "ArgList" entry and its option table
#                      to its "OptList" entry
#
# Returns the object built by SplitTextFile, re-blessed into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Arrow method call rather than the indirect-object form
    # "new SplitTextFile(...)": this package defines its own sub new, and the
    # arrow form cannot be parsed as a call to that sub.
    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
89
90
# Read a CSV file into $$textref and record its column names.
#
# Arguments:
#   $filename - path of the CSV file to read
#   $encoding - handed to the multiread reader via set_encoding
#   $language - accepted but not used by this sub
#   $textref  - reference to the scalar that receives the file content
#
# After the content is read, blank lines are removed, the first line is cut
# off $$textref and parsed into field names (spaces stripped from each name),
# and the names are stored as an array ref in $self->{'csv_file_fields'}.
#
# If the file cannot be opened an error is printed to STDERR, $$textref is set
# to the empty string and the field list is set to an empty array.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    # Three-argument open so characters in $filename are never taken as an
    # open mode.  The bareword handle is kept because the reader is given the
    # handle by its package-qualified name below.
    if (!open(FILE, '<', $filename)) {
        print STDERR "Error: CSVPlugin could not open $filename: $!\n";
        $$textref = "";
        $self->{'csv_file_fields'} = [];
        return;
    }

    # Read the CSV file content
    my $reader = multiread->new();
    $reader->set_handle('CSVPlugin::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);
    close(FILE);

    $$textref = "" unless defined($$textref);

    # Remove any blank lines so the data is split and processed properly
    $$textref =~ s/\n(\s*)\n/\n/g;

    # The first line contains the metadata element names.  $1 is only valid
    # when the substitution matched, so a file with no line break at all
    # (header only) is handled separately instead of reading a stale capture.
    my $csv_file_field_line;
    if ($$textref =~ s/^(.*?)\r?\n//) {
        $csv_file_field_line = $1;
    }
    else {
        $csv_file_field_line = $$textref;
        $csv_file_field_line =~ s/\r$//;
        $$textref = "";
    }
    $csv_file_field_line .= ",";  # To make the regular expressions simpler

    my @csv_file_fields = ();
    while ($csv_file_field_line ne "") {
        my $csv_file_field;

        # Handle quoted values
        if ($csv_file_field_line =~ s/^\"(.*?)\"\,//) {
            $csv_file_field = $1;
        }
        # Normal comma-separated case
        elsif ($csv_file_field_line =~ s/^(.*?)\,//) {
            $csv_file_field = $1;
        }
        # The line must be formatted incorrectly
        else {
            print STDERR "Error: Badly formatted CSV field line: $csv_file_field_line.\n";
            last;
        }

        $csv_file_field =~ s/ //g;  # Remove any spaces from the field names
        push(@csv_file_fields, $csv_file_field);
    }
    $self->{'csv_file_fields'} = \@csv_file_fields;
}
133
134
# Process one CSV record (one line of the file).
#
# Arguments:
#   $textref  - reference to the text of the record; the referenced scalar
#               is not modified
#   $pluginfo, $base_dir, $file, $metadata, $gli
#             - accepted but not used by this sub
#   $doc_obj  - document object; must provide get_top_section,
#               add_utf8_text and add_utf8_metadata
#
# The raw line is added as the text of the document's top section.  The line
# is then split on commas (a value wrapped in double quotes may contain
# commas) and each non-empty value is added as metadata under the field name
# at the same position in $self->{'csv_file_fields'}.  Values beyond the
# number of known field names are ignored.
#
# Always returns 1.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $section = $doc_obj->get_top_section();
    my $csv_line = $$textref;
    my @csv_file_fields = @{$self->{'csv_file_fields'} || []};

    # Add the raw line as the document text
    $doc_obj->add_utf8_text($section, $csv_line);

    # Walk the line one value at a time, pairing value $i with field name $i
    my $i = 0;
    $csv_line .= ",";  # To make the regular expressions simpler
    while ($csv_line ne "") {
        # The capture is copied into a lexical straight away: sub arguments
        # are aliases, so passing $1 itself would let any pattern match inside
        # the callee change the value it receives.
        my $csv_value;

        # Metadata values containing commas are quoted
        if ($csv_line =~ s/^\"(.*?)\"\,//) {
            $csv_value = $1;
        }
        # Normal comma-separated case
        elsif ($csv_line =~ s/^(.*?)\,//) {
            $csv_value = $1;
        }
        # The line must be formatted incorrectly
        else {
            print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
            last;
        }

        # Only bother with non-empty values that have a field name
        if ($csv_value ne "" && defined($csv_file_fields[$i])) {
            $doc_obj->add_utf8_metadata($section, $csv_file_fields[$i], $csv_value);
        }

        $i++;
    }

    # Record was processed successfully
    return 1;
}
178
179
1801;
Note: See TracBrowser for help on using the browser.