Context Navigation

source: main/trunk/greenstone2/perllib/plugins/TabSeparatedPlugin.pm@ 28836

Last change on this file since 28836 was 28782, checked in by ak19, 10 years ago
Routine for reading in text files failed to 'decode' from UTF-8 to trigger Unicode aware strings. Methods changed to user Superclass to ensure this is now done consitently
File size: 4.5 KB

Line
1	###########################################################################
2	#
3	# TabSeparatedPlugin.pm -- A plugin for tab-separated metadata files
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2006 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	# Based on CVSPlugin
28
29	package TabSeparatedPlugin;
30
31
32	use SplitTextFile;
33	use MetadataRead;
34	use strict;
35	no strict 'refs'; # allow filehandles to be variables and viceversa
36
37
38	# TabSeparatedPlugin is a sub-class of SplitTextFile.
39	sub BEGIN {
40	@TabSeparatedPlugin::ISA = ('MetadataRead', 'SplitTextFile');
41	}
42
43
44	my $arguments =
45	[ { 'name' => "process_exp",
46	'desc' => "{BasePlugin.process_exp}",
47	'type' => "regexp",
48	'reqd' => "no",
49	'deft' => &get_default_process_exp() },
50	{ 'name' => "split_exp",
51	'desc' => "{SplitTextFile.split_exp}",
52	'type' => "regexp",
53	'reqd' => "no",
54	'deft' => &get_default_split_exp(),
55	'hiddengli' => "yes" }
56	];
57
58
59	my $options = { 'name' => "TabSeparatedPlugin",
60	'desc' => "{TabSeparatedPlugin.desc}",
61	'abstract' => "no",
62	'inherits' => "yes",
63	'explodes' => "yes",
64	'args' => $arguments };
65
66
67	# This plugin processes files with the suffix ".tab"
68	sub get_default_process_exp {
69	return q^(?i)(\.tab)$^;
70	}
71
72
73	# This plugin splits the input text by line
74	sub get_default_split_exp {
75	return q^\r?\n^;
76	}
77
78
79	sub new
80	{
81	my ($class) = shift (@_);
82	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
83	push(@$pluginlist, $class);
84
85	push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
86	push(@{$hashArgOptLists->{"OptList"}}, $options);
87
88	my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
89
90	return bless $self, $class;
91	}
92
93
94	sub read_file
95	{
96	my $self = shift (@_);
97	my ($filename, $encoding, $language, $textref) = @_;
98
99	# Read in file the usual ReadTextFile way
100	# This ensure that $textref is a unicode aware string
101	$self->SUPER::read_file(@_);
102
103	#
104	# Now top-up the processing of the text with what this plugin
105	# needs
106	#
107
108	# Remove any blank lines so the data is split and processed properly
109	$$textref =~ s/\n(\s*)\n/\n/g;
110
111	# The first line contains the metadata element names
112	$$textref =~ s/^(.*?)\r?\n//;
113	my @tab_file_fields = ();
114	my $tab_file_field_line = $1 . "\t"; # To make the regular expressions simpler
115	while ($tab_file_field_line ne "") {
116	if ($tab_file_field_line =~ s/^(.*?)\t//) {
117	my $tab_file_field = $1;
118	$tab_file_field =~ s/ //g; # Remove any spaces from the field names
119	push(@tab_file_fields, $tab_file_field);
120	}
121	# The line must be formatted incorrectly
122	else {
123	print STDERR "Error: Badly formatted Tab field line: $tab_file_field_line.\n";
124	last;
125	}
126	}
127	$self->{'tab_file_fields'} = \@tab_file_fields;
128	}
129
130
131	sub process
132	{
133	my $self = shift (@_);
134	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
135	my $outhandle = $self->{'outhandle'};
136
137	my $section = $doc_obj->get_top_section();
138	my $tab_line = $$textref;
139	my @tab_file_fields = @{$self->{'tab_file_fields'}};
140
141	# Add the raw line as the document text
142	$doc_obj->add_utf8_text($section, $tab_line);
143
144	# Build a hash of metadata name to metadata value for this line
145	my $i = 0;
146	$tab_line .= "\t"; # To make the regular expressions simpler
147	while ($tab_line ne "") {
148	if ($tab_line =~ s/^(.*?)\t//) {
149	# Only bother with non-empty values
150	if ($1 ne "" && defined($tab_file_fields[$i])) {
151	$doc_obj->add_utf8_metadata($section, $tab_file_fields[$i], $1);
152	}
153	}
154	# The line must be formatted incorrectly
155	else {
156	print STDERR "Error: Badly formatted Tab line: $tab_line.\n";
157	last;
158	}
159
160	$i++;
161	}
162
163	# Record was processed successfully
164	return 1;
165	}
166
167
168	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: