source: main/trunk/greenstone2/perllib/plugins/TabSeparatedPlugin.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 7 years ago

renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes

File size: 4.5 KB
Line 
1###########################################################################
2#
3# TabSeparatedPlugin.pm -- A plugin for tab-separated metadata files
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2006 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# Based on CVSPlugin
28
29package TabSeparatedPlugin;
30
31
32use SplitTextFile;
33use MetadataRead;
34use strict;
35no strict 'refs'; # allow filehandles to be variables and viceversa
36
37
38# TabSeparatedPlugin is a sub-class of SplitTextFile.
39sub BEGIN {
40 @TabSeparatedPlugin::ISA = ('MetadataRead', 'SplitTextFile');
41}
42
43
44my $arguments =
45 [ { 'name' => "process_exp",
46 'desc' => "{BaseImporter.process_exp}",
47 'type' => "regexp",
48 'reqd' => "no",
49 'deft' => &get_default_process_exp() },
50 { 'name' => "split_exp",
51 'desc' => "{SplitTextFile.split_exp}",
52 'type' => "regexp",
53 'reqd' => "no",
54 'deft' => &get_default_split_exp(),
55 'hiddengli' => "yes" }
56 ];
57
58
59my $options = { 'name' => "TabSeparatedPlugin",
60 'desc' => "{TabSeparatedPlugin.desc}",
61 'abstract' => "no",
62 'inherits' => "yes",
63 'explodes' => "yes",
64 'args' => $arguments };
65
66
67# This plugin processes files with the suffix ".tab"
68sub get_default_process_exp {
69 return q^(?i)(\.tab)$^;
70}
71
72
73# This plugin splits the input text by line
74sub get_default_split_exp {
75 return q^\r?\n^;
76}
77
78
79sub new
80{
81 my ($class) = shift (@_);
82 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
83 push(@$pluginlist, $class);
84
85 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
86 push(@{$hashArgOptLists->{"OptList"}}, $options);
87
88 my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
89
90 return bless $self, $class;
91}
92
93
94sub read_file
95{
96 my $self = shift (@_);
97 my ($filename, $encoding, $language, $textref) = @_;
98
99 # Read in file the usual ReadTextFile way
100 # This ensure that $textref is a unicode aware string
101 $self->SUPER::read_file(@_);
102
103 #
104 # Now top-up the processing of the text with what this plugin
105 # needs
106 #
107
108 # Remove any blank lines so the data is split and processed properly
109 $$textref =~ s/\n(\s*)\n/\n/g;
110
111 # The first line contains the metadata element names
112 $$textref =~ s/^(.*?)\r?\n//;
113 my @tab_file_fields = ();
114 my $tab_file_field_line = $1 . "\t"; # To make the regular expressions simpler
115 while ($tab_file_field_line ne "") {
116 if ($tab_file_field_line =~ s/^(.*?)\t//) {
117 my $tab_file_field = $1;
118 $tab_file_field =~ s/ //g; # Remove any spaces from the field names
119 push(@tab_file_fields, $tab_file_field);
120 }
121 # The line must be formatted incorrectly
122 else {
123 print STDERR "Error: Badly formatted Tab field line: $tab_file_field_line.\n";
124 last;
125 }
126 }
127 $self->{'tab_file_fields'} = \@tab_file_fields;
128}
129
130
131sub process
132{
133 my $self = shift (@_);
134 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
135 my $outhandle = $self->{'outhandle'};
136
137 my $section = $doc_obj->get_top_section();
138 my $tab_line = $$textref;
139 my @tab_file_fields = @{$self->{'tab_file_fields'}};
140
141 # Add the raw line as the document text
142 $doc_obj->add_utf8_text($section, $tab_line);
143
144 # Build a hash of metadata name to metadata value for this line
145 my $i = 0;
146 $tab_line .= "\t"; # To make the regular expressions simpler
147 while ($tab_line ne "") {
148 if ($tab_line =~ s/^(.*?)\t//) {
149 # Only bother with non-empty values
150 if ($1 ne "" && defined($tab_file_fields[$i])) {
151 $doc_obj->add_utf8_metadata($section, $tab_file_fields[$i], $1);
152 }
153 }
154 # The line must be formatted incorrectly
155 else {
156 print STDERR "Error: Badly formatted Tab line: $tab_line.\n";
157 last;
158 }
159
160 $i++;
161 }
162
163 # Record was processed successfully
164 return 1;
165}
166
167
1681;
Note: See TracBrowser for help on using the repository browser.