1 | ###########################################################################
|
---|
2 | #
|
---|
3 | # IndexPlug.pm --
|
---|
4 | # A component of the Greenstone digital library software
|
---|
5 | # from the New Zealand Digital Library Project at the
|
---|
6 | # University of Waikato, New Zealand.
|
---|
7 | #
|
---|
8 | # Copyright (C) 1999 New Zealand Digital Library Project
|
---|
9 | #
|
---|
10 | # This program is free software; you can redistribute it and/or modify
|
---|
11 | # it under the terms of the GNU General Public License as published by
|
---|
12 | # the Free Software Foundation; either version 2 of the License, or
|
---|
13 | # (at your option) any later version.
|
---|
14 | #
|
---|
15 | # This program is distributed in the hope that it will be useful,
|
---|
16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
18 | # GNU General Public License for more details.
|
---|
19 | #
|
---|
20 | # You should have received a copy of the GNU General Public License
|
---|
21 | # along with this program; if not, write to the Free Software
|
---|
22 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
23 | #
|
---|
24 | ###########################################################################
|
---|
25 |
|
---|
26 | # This recursive plugin processes an index.txt file.
|
---|
27 | # The index.txt file should contain the list of files to be
|
---|
28 | # included in the collection followed by any extra metadata to
|
---|
29 | # be associated with each file.
|
---|
30 |
|
---|
31 | # The index.txt file should be formatted as follows:
|
---|
32 | # The first line may be a key (beginning with key:)
|
---|
33 | # to name the metadata fields
|
---|
34 | # (e.g. key: Subject Organization Date)
|
---|
35 | # The following lines will contain a filename followed
|
---|
36 | # by the value that metadata entry is to be set to.
|
---|
37 | # (e.g. 'irma/iw097e 3.2 unesco 1993' will associate the
|
---|
38 | # metadata Subject=3.2, Organization=unesco, and Date=1993
|
---|
39 | # with the file irma/iw097e if the above key line was used)
|
---|
40 |
|
---|
41 | # Note that if any of the metadata fields use the Hierarchy
|
---|
42 | # classifier plugin then the value they're set to should
|
---|
43 | # correspond to the first field (the descriptor) in the
|
---|
44 | # appropriate classification file.
|
---|
45 |
|
---|
46 | # Metadata values may be named separately using a tag
|
---|
47 | # (e.g. <Subject>3.2) and this will override any name
|
---|
48 | # given to them by the key line.
|
---|
49 | # If there's no key line any unnamed metadata value will be
|
---|
50 | # named 'Subject'.
|
---|
51 |
|
---|
52 | # 12/05/02 Added usage datastructure - John Thompson
|
---|
53 |
|
---|
54 | package IndexPlug;
|
---|
55 |
|
---|
56 | use plugin;
|
---|
57 | use BasPlug;
|
---|
58 | use doc;
|
---|
59 | use util;
|
---|
60 | use cfgread;
|
---|
61 |
|
---|
# Set up inheritance at compile time: IndexPlug is a subclass of BasPlug.
BEGIN {
    our @ISA = qw(BasPlug);
}
65 |
|
---|
# Usage/description record for this plugin.  The constructor appends it to
# the 'option_list' of the object returned by BasPlug's constructor.
#   name     - the plugin name
#   desc     - human-readable description of the index.txt format
#   inherits - set to "yes"; presumably tells the usage printer to also show
#              the options of the parent plugin (confirm against BasPlug)
#
# The tag example in 'desc' is written as <Subject>3.2, which is the form
# the parser in read() actually recognises.
# NOTE(review): if this text is ever written into XML/HTML without being
# escaped, the angle brackets may need escaping there - confirm against the
# code that prints plugin usage.
my $options = { 'name'     => "IndexPlug",
                'desc'     => "This recursive plugin processes an index.txt file. The index.txt file should contain the list of files to be included in the collection followed by any extra metadata to be associated with each file.\n\nThe index.txt file should be formatted as follows: The first line may be a key (beginning with key:) to name the metadata fields (e.g. key: Subject Organization Date). The following lines will contain a filename followed by the value that metadata entry is to be set to. (e.g. 'irma/iw097e 3.2 unesco 1993' will associate the metadata Subject=3.2, Organization=unesco, and Date=1993 with the file irma/iw097e if the above key line was used)\n\nNote that if any of the metadata fields use the Hierarchy classifier plugin then the value they're set to should correspond to the first field (the descriptor) in the appropriate classification file.\n\nMetadata values may be named separately using a tag (e.g. <Subject>3.2) and this will override any name given to them by the key line. If there's no key line any unnamed metadata value will be named 'Subject'.",
                'inherits' => "yes" };
69 |
|
---|
# Constructor.
#
# Builds the object through BasPlug's constructor, registers this plugin's
# usage record ($options) in the object's 'option_list', and blesses the
# result into $class.
#
# Arguments: the class name followed by any plugin arguments.
# Returns:   the new object, blessed into $class.
sub new {
    my ($class) = @_;

    # @_ is forwarded untouched, so it still starts with $class; this is
    # exactly the argument list the original code passed to BasPlug.
    # An explicit method call is used instead of the indirect object form
    # ("new BasPlug (...)"), which Perl has to parse heuristically.
    my $self = BasPlug->new ("IndexPlug", @_);

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    # Pushing through $self means the record is kept even if 'option_list'
    # has not been created yet (presumably BasPlug creates it - confirm).
    push (@{$self->{'option_list'}}, $options);

    return bless $self, $class;
}
80 |
|
---|
# Tells the caller whether this plugin may recurse using $pluginfo.
# IndexPlug hands every file listed in index.txt back to the plugin
# chain (see read), so the answer is always 1.
sub is_recursive {
    my ($self) = @_;

    return 1;
}
87 |
|
---|
# Process the index.txt file of a directory.
#
# Arguments (after $self):
#   $pluginfo  - passed through unchanged to plugin::read
#   $base_dir  - base directory; might be ""
#   $file      - path below $base_dir; might include directories
#   $metadata  - ignored: a fresh metadata hash is built for every file
#                listed in index.txt (the original code notes metadata is
#                always empty when it arrives at this plugin)
#   $processor - passed through unchanged to plugin::read
#   $maxdocs   - stop once this many files have been counted; -1 = no limit
#
# Returns undef if $base_dir/$file does not contain an index.txt file,
# otherwise the sum of the values returned by plugin::read for the listed
# files (the number of files processed).
#
# Metadata naming, per value on an index.txt line (position $i):
#   1. "<Name>value"      -> stored under Name
#   2. $i within key line -> stored under the $i-th key field
#   3. otherwise          -> stored under 'Subject'
# Every metadata entry is an array reference of values.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexfile = &util::filename_cat($base_dir, $file, "index.txt");
    if (!-f $indexfile) {
        # not a directory containing an index file
        return undef;
    }

    # found an index.txt file
    print $outhandle "IndexPlug: processing $indexfile\n";

    # read in the index.txt; judging by its use below, $list maps the first
    # word of each line to an array reference holding the remaining words
    my $list = &cfgread::read_cfg_file ($indexfile, undef, '^[^#]\w');

    # field names from the optional 'key:' line
    my @fields = ();
    if (defined $list->{'key:'}) {
        @fields = @{$list->{'key:'}};
    }

    my $index_base_dir = &util::filename_cat($base_dir, $file);

    # process each document (note: hash order, not file order)
    my $count = 0;
    foreach my $docfile (keys (%$list)) {
        last if ($maxdocs != -1 && $count >= $maxdocs);

        # the key line names the fields, it is not a document
        next if ($docfile =~ /key:/i);

        # at present we can start from scratch as metadata will always be
        # empty when it arrives at this plugin - this might cause problems
        # if things change though
        $metadata = {};

        my @values = @{$list->{$docfile} || []};
        for my $i (0 .. $#values) {
            my $value = $values[$i];
            my $name;

            if ($value =~ /^<([^>]+)>(.+)$/) {
                # an explicit tag overrides the key line
                ($name, $value) = ($1, $2);
            } elsif ($i < scalar (@fields)) {
                # strictly less-than: $fields[$i] must exist, otherwise the
                # value would be filed under an undefined name
                $name = $fields[$i];
            } else {
                # no key field for this position
                $name = 'Subject';
            }

            # append only this value; never alias the parsed line itself
            push (@{$metadata->{$name}}, $value);
        }

        $count += &plugin::read ($pluginfo, $index_base_dir, $docfile, $metadata, $processor, $maxdocs);
    }

    return $count; # was processed
}
148 |
|
---|
149 |
|
---|
150 | 1;
|
---|