source: main/trunk/greenstone2/perllib/plugins/TabSeparatedPlugin.pm@ 28836

Last change on this file since 28836 was 28782, checked in by ak19, 10 years ago

Routine for reading in text files failed to 'decode' from UTF-8 to trigger Unicode aware strings. Methods changed to user Superclass to ensure this is now done consitently

File size: 4.5 KB
Line 
1###########################################################################
2#
3# TabSeparatedPlugin.pm -- A plugin for tab-separated metadata files
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2006 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# Based on CVSPlugin
28
29package TabSeparatedPlugin;
30
31
32use SplitTextFile;
33use MetadataRead;
34use strict;
35no strict 'refs'; # allow filehandles to be variables and viceversa
36
37
38# TabSeparatedPlugin is a sub-class of SplitTextFile.
39sub BEGIN {
40 @TabSeparatedPlugin::ISA = ('MetadataRead', 'SplitTextFile');
41}
42
43
44my $arguments =
45 [ { 'name' => "process_exp",
46 'desc' => "{BasePlugin.process_exp}",
47 'type' => "regexp",
48 'reqd' => "no",
49 'deft' => &get_default_process_exp() },
50 { 'name' => "split_exp",
51 'desc' => "{SplitTextFile.split_exp}",
52 'type' => "regexp",
53 'reqd' => "no",
54 'deft' => &get_default_split_exp(),
55 'hiddengli' => "yes" }
56 ];
57
58
59my $options = { 'name' => "TabSeparatedPlugin",
60 'desc' => "{TabSeparatedPlugin.desc}",
61 'abstract' => "no",
62 'inherits' => "yes",
63 'explodes' => "yes",
64 'args' => $arguments };
65
66
67# This plugin processes files with the suffix ".tab"
68sub get_default_process_exp {
69 return q^(?i)(\.tab)$^;
70}
71
72
73# This plugin splits the input text by line
74sub get_default_split_exp {
75 return q^\r?\n^;
76}
77
78
79sub new
80{
81 my ($class) = shift (@_);
82 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
83 push(@$pluginlist, $class);
84
85 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
86 push(@{$hashArgOptLists->{"OptList"}}, $options);
87
88 my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
89
90 return bless $self, $class;
91}
92
93
94sub read_file
95{
96 my $self = shift (@_);
97 my ($filename, $encoding, $language, $textref) = @_;
98
99 # Read in file the usual ReadTextFile way
100 # This ensure that $textref is a unicode aware string
101 $self->SUPER::read_file(@_);
102
103 #
104 # Now top-up the processing of the text with what this plugin
105 # needs
106 #
107
108 # Remove any blank lines so the data is split and processed properly
109 $$textref =~ s/\n(\s*)\n/\n/g;
110
111 # The first line contains the metadata element names
112 $$textref =~ s/^(.*?)\r?\n//;
113 my @tab_file_fields = ();
114 my $tab_file_field_line = $1 . "\t"; # To make the regular expressions simpler
115 while ($tab_file_field_line ne "") {
116 if ($tab_file_field_line =~ s/^(.*?)\t//) {
117 my $tab_file_field = $1;
118 $tab_file_field =~ s/ //g; # Remove any spaces from the field names
119 push(@tab_file_fields, $tab_file_field);
120 }
121 # The line must be formatted incorrectly
122 else {
123 print STDERR "Error: Badly formatted Tab field line: $tab_file_field_line.\n";
124 last;
125 }
126 }
127 $self->{'tab_file_fields'} = \@tab_file_fields;
128}
129
130
131sub process
132{
133 my $self = shift (@_);
134 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
135 my $outhandle = $self->{'outhandle'};
136
137 my $section = $doc_obj->get_top_section();
138 my $tab_line = $$textref;
139 my @tab_file_fields = @{$self->{'tab_file_fields'}};
140
141 # Add the raw line as the document text
142 $doc_obj->add_utf8_text($section, $tab_line);
143
144 # Build a hash of metadata name to metadata value for this line
145 my $i = 0;
146 $tab_line .= "\t"; # To make the regular expressions simpler
147 while ($tab_line ne "") {
148 if ($tab_line =~ s/^(.*?)\t//) {
149 # Only bother with non-empty values
150 if ($1 ne "" && defined($tab_file_fields[$i])) {
151 $doc_obj->add_utf8_metadata($section, $tab_file_fields[$i], $1);
152 }
153 }
154 # The line must be formatted incorrectly
155 else {
156 print STDERR "Error: Badly formatted Tab line: $tab_line.\n";
157 last;
158 }
159
160 $i++;
161 }
162
163 # Record was processed successfully
164 return 1;
165}
166
167
1681;
Note: See TracBrowser for help on using the repository browser.