source: main/trunk/greenstone2/perllib/plugins/IndexPlugin.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 7 years ago

renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.0 KB
Line 
1###########################################################################
2#
3# IndexPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This recursive plugin processes an index.txt file.
27# The index.txt file should contain the list of files to be
28# included in the collection followed by any extra metadata to
29# be associated with each file.
30
31# The index.txt file should be formatted as follows:
32# The first line may be a key (beginning with key:)
33# to name the metadata fields
34# (e.g. key: Subject Organization Date)
35# The following lines will contain a filename followed
36# by the value that metadata entry is to be set to.
37# (e.g. 'irma/iw097e 3.2 unesco 1993' will associate the
38# metadata Subject=3.2, Organization=unesco, and Date=1993
39# with the file irma/iw097e if the above key line was used)
40
41# Note that if any of the metadata fields use the Hierarchy
42# classifier plugin then the value they're set to should
43# correspond to the first field (the descriptor) in the
44# appropriate classification file.
45
46# Metadata values may be named separately using a tag
47# (e.g. <Subject>3.2) and this will override any name
48# given to them by the key line.
49# If there's no key line any unnamed metadata value will be
50# named 'Subject'.
51
52package IndexPlugin;
53
54use plugin;
55use BaseImporter;
56use doc;
57use util;
58use cfgread;
59
60use strict;
61no strict 'refs'; # allow filehandles to be variables and viceversa
62
# Set up inheritance at compile time: IndexPlugin derives from BaseImporter.
sub BEGIN {
    @IndexPlugin::ISA = qw(BaseImporter);
}
66
67#my $arguments = [
68# ];
69
# Option block describing this plugin; new() appends it to the "OptList"
# entry of the hash it passes on to the BaseImporter constructor.
my $options = {
    'name'     => "IndexPlugin",
    'desc'     => "{IndexPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
};
74
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the BaseImporter constructor
#   $hashArgOptLists - hash ref; this plugin's option block is appended to
#                      its "OptList" entry
#
# Returns the object built by BaseImporter's constructor, re-blessed into
# $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);

    # This plugin defines no arguments of its own, only an option block.
    #push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit method-call syntax rather than the indirect-object form
    # ("new BaseImporter (...)"), which Perl parses heuristically and which
    # is disabled under newer feature bundles.
    my $self = BaseImporter->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
87
# Tells the caller whether this plugin may recurse using $pluginfo.
# IndexPlugin always can (read() hands each listed file back to
# &plugin::read), so the answer is unconditionally 1.
sub is_recursive {
    my ($self) = @_;
    return 1;
}
94
# Process a directory that contains an index.txt file.
#
# Looks for "index.txt" under $base_dir/$file.  If there is none, this
# plugin does not handle the path and undef is returned.  Otherwise each
# entry of index.txt (other than the 'key:' line) is passed to &plugin::read,
# relative to the directory holding index.txt, together with the metadata
# listed for it.
#
# Metadata naming for the values that follow a filename:
#   - a value written as <Name>value is stored under Name;
#   - otherwise the value at position $i takes the $i-th name from the
#     'key:' line, if the key line has that many names;
#   - any remaining unnamed value is stored under 'Subject'.
# Every metadata entry is an array ref of values.
#
# Arguments:
#   $pluginfo, $block_hash, $processor, $gli - passed through to &plugin::read
#   $base_dir    - may be ""
#   $file        - may include directories
#   $metadata    - ignored: a fresh hash is built for every listed file
#   $maxdocs     - stop once $total_count plus the files processed here
#                  reaches this number; -1 means no limit
#   $total_count - number of files already processed by the caller
#
# Returns the summed results of the &plugin::read calls, or undef if there
# is no index.txt to process.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexfile = &util::filename_cat($base_dir, $file, "index.txt");
    if (!-f $indexfile) {
        # not a directory containing an index file
        return undef;
    }

    # found an index.txt file
    print STDERR "<Processing n='$file' p='IndexPlugin'>\n" if ($gli);
    print $outhandle "IndexPlugin: processing $indexfile\n";

    # read in the index.txt
    my $list = &cfgread::read_cfg_file ($indexfile, undef, '^[^#]\w');

    # see if there's a 'key:' line naming the metadata fields
    my @fields = ();
    if (defined $list->{'key:'}) {
        @fields = @{$list->{'key:'}};
    }

    my $index_base_dir = &util::filename_cat($base_dir, $file);

    # process each document; keys are sorted so that a $maxdocs cut-off
    # picks the same files on every run (hash order is not stable)
    my $count = 0;
    foreach my $docfile (sort keys (%$list)) {
        last if ($maxdocs != -1 && ($total_count + $count) >= $maxdocs);

        # the key line is not a document
        next if ($docfile =~ /key:/i);

        $metadata = {};  # at present we can do this as metadata
                         # will always be empty when it arrives
                         # at this plugin - this might cause
                         # problems if things change though

        # note that $list->{$docfile} is an array reference
        my $values = $list->{$docfile};
        for (my $i = 0; $i < scalar (@$values); $i++) {
            my $value = $values->[$i];
            my $name;
            if ($value =~ /^<([^>]+)>(.+)$/) {
                # explicitly tagged value overrides the key line
                ($name, $value) = ($1, $2);
            } elsif ($i < scalar (@fields)) {
                # strictly less-than: $fields[$i] only exists for
                # $i < number of key fields
                $name = $fields[$i];
            } else {
                # no key name available for this position
                $name = 'Subject';
            }
            push (@{$metadata->{$name}}, $value);
        }

        $count += &plugin::read ($pluginfo, $index_base_dir, $docfile, $block_hash, $metadata, $processor, $maxdocs, ($total_count + $count), $gli);
    }

    return $count; # was processed
}
156
157
1581;
Note: See TracBrowser for help on using the repository browser.