source: trunk/gsdl/perllib/plugins/IndexPlug.pm@ 3767

Last change on this file since 3767 was 3540, checked in by kjdon, 22 years ago

added John T's changes into CVS - added info to enable retrieval of usage info in xml

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.7 KB
Line 
1###########################################################################
2#
3# IndexPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This recursive plugin processes an index.txt file.
27# The index.txt file should contain the list of files to be
28# included in the collection followed by any extra metadata to
29# be associated with each file.
30
31# The index.txt file should be formatted as follows:
32# The first line may be a key (beginning with key:)
33# to name the metadata fields
34# (e.g. key: Subject Organization Date)
35# The following lines will contain a filename followed
36# by the value that metadata entry is to be set to.
37# (e.g. 'irma/iw097e 3.2 unesco 1993' will associate the
38# metadata Subject=3.2, Organization=unesco, and Date=1993
39# with the file irma/iw097e if the above key line was used)
40
41# Note that if any of the metadata fields use the Hierarchy
42# classifier plugin then the value they're set to should
43# correspond to the first field (the descriptor) in the
44# appropriate classification file.
45
46# Metadata values may be named separately using a tag
47# (e.g. <Subject>3.2) and this will override any name
48# given to them by the key line.
49# If there's no key line any unnamed metadata value will be
50# named 'Subject'.
51
52# 12/05/02 Added usage datastructure - John Thompson
53
54package IndexPlug;
55
56use plugin;
57use BasPlug;
58use doc;
59use util;
60use cfgread;
61
# Set up inheritance at compile time: IndexPlug is a BasPlug.
BEGIN {
    @ISA = qw(BasPlug);
}
65
# Usage information for this plugin ('name', 'desc', 'inherits'). It is
# appended to the plugin's option_list by new(). The description is
# HTML-escaped, so the example tag is written as &lt;Subject&gt; (it was
# previously written with the two entities swapped, which rendered as
# ">Subject<3.2" instead of "<Subject>3.2").
my $options = { 'name'     => "IndexPlug",
                'desc'     => "This recursive plugin processes an index.txt file. The index.txt file should contain the list of files to be included in the collection followed by any extra metadata to be associated with each file.\n\nThe index.txt file should be formatted as follows: The first line may be a key (beginning with key:) to name the metadata fields (e.g. key: Subject Organization Date). The following lines will contain a filename followed by the value that metadata entry is to be set to. (e.g. 'irma/iw097e 3.2 unesco 1993' will associate the metadata Subject=3.2, Organization=unesco, and Date=1993 with the file irma/iw097e if the above key line was used)\n\nNote that if any of the metadata fields use the Hierarchy classifier plugin then the value they're set to should correspond to the first field (the descriptor) in the appropriate classification file.\n\nMetadata values may be named separately using a tag (e.g. &lt;Subject&gt;3.2) and this will override any name given to them by the key line. If there's no key line any unnamed metadata value will be named 'Subject'..",
                'inherits' => "yes" };
69
# Constructor: builds the underlying BasPlug object, registers this
# plugin's usage information on it, and reblesses it into $class.
# Note that the full argument list (including the class name) is passed
# on to BasPlug's constructor after the plugin name.
sub new {
    my $class = $_[0];
    my $self = BasPlug->new ("IndexPlug", @_);

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $inherited_options = $self->{'option_list'};
    push (@$inherited_options, $options);

    return bless ($self, $class);
}
80
# Tells the caller whether this class might recurse using $pluginfo.
# IndexPlug always can, so this unconditionally returns 1.
sub is_recursive {
    my ($self) = @_;

    return 1;
}
87
# Process the index.txt file in the directory $base_dir/$file, handing each
# document it lists to &plugin::read together with the metadata given for it.
#
# Arguments (after $self):
#   $pluginfo  - passed straight through to &plugin::read
#   $base_dir  - base directory; might be ""
#   $file      - path below $base_dir; might include directories
#   $metadata  - incoming value is discarded; a fresh hash is built for
#                every listed document
#   $processor - passed straight through to &plugin::read
#   $maxdocs   - stop once this many files have been processed (-1 = no limit)
#
# Returns the number of files processed, or undef if this plugin can't
# process the location (there is no index.txt there).
#
# Metadata naming for the values following a filename, by position $i:
#   1. a value of the form <Name>value is stored under Name;
#   2. otherwise, if the key line named a field at position $i, that name
#      is used;
#   3. otherwise the value is stored under 'Subject'.
# Every metadata entry is an array reference of values.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexfile = &util::filename_cat($base_dir, $file, "index.txt");
    if (!-f $indexfile) {
        # not a directory containing an index file
        return undef;
    }

    # found an index.txt file
    print $outhandle "IndexPlug: processing $indexfile\n";

    # read in the index.txt; each entry of $list is an array reference
    # holding the values that followed the entry's name on its line
    my $list = &cfgread::read_cfg_file ($indexfile, undef, '^[^#]\w');

    # see if there's a 'key:' line naming the metadata fields
    my @fields = ();
    if (defined $list->{'key:'}) {
        @fields = @{$list->{'key:'}};
    }

    my $index_base_dir = &util::filename_cat($base_dir, $file);

    # process each document
    my $count = 0;
    foreach my $docfile (keys (%$list)) {
        last if ($maxdocs != -1 && $count >= $maxdocs);

        # the key line is not a document
        next if ($docfile =~ /key:/i);

        # at present we can start from empty metadata as it will always be
        # empty when it arrives at this plugin - this might cause problems
        # if things change though
        $metadata = {};

        my $values = $list->{$docfile};
        for (my $i = 0; $i < scalar (@$values); $i ++) {
            my $value = $values->[$i];
            my $name;

            if ($value =~ /^<([^>]+)>(.+)$/) {
                # an explicit tag overrides any name from the key line
                ($name, $value) = ($1, $2);
            } elsif ($i < scalar (@fields)) {
                # strictly less-than: $fields[scalar @fields] doesn't exist,
                # and using it would file the value under an undefined name
                $name = $fields[$i];
            } else {
                # unnamed value with no corresponding key field
                $name = 'Subject';
            }

            # push autovivifies the array the first time a name is seen
            push (@{$metadata->{$name}}, $value);
        }

        # a plugin that can't process the file returns undef; count only
        # real results
        my $processed = &plugin::read ($pluginfo, $index_base_dir, $docfile,
                                       $metadata, $processor, $maxdocs);
        $count += $processed if (defined $processed);
    }

    return $count; # was processed
}
148
149
1501;
Note: See TracBrowser for help on using the repository browser.