source: trunk/gsdl/perllib/plugins/GMLPlug.pm@ 1244

Last change on this file since 1244 was 1244, checked in by sjboddie, 24 years ago

Caught up most general plugins (that's the ones in gsdlhome/perllib/plugins)
with changes to BasPlug so that they can all now use the new general plugin
options. Those I didn't do were FoxPlug (as it's not actually used anywhere
and I don't know what it does) and WebPlug (as it's kind of a work in
progress and doesn't really work anyway). All plugins will still work
(including all the collection specific ones that are laying around), some
of them just won't have access to the general options.
I also wrote a short perl script (pluginfo.pl) that prints out all the
options available to a given plugin.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.8 KB
Line 
1###########################################################################
2#
3# GMLPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# plugin which processes a GML format document
27# assumes that gml tags are all in lower-case.
28
29package GMLPlug;
30
31use BasPlug;
32use util;
33use doc;
34
35sub BEGIN {
36 @ISA = ('BasPlug');
37}
38
39use strict;
40
41sub new {
42 my ($class) = @_;
43 my $self = new BasPlug ("GMLPlug", @_);
44
45 return bless $self, $class;
46}
47
48sub get_default_process_exp {
49 my $self = shift (@_);
50
51 return q^(?i)\.gml(\.gz)?$^;
52}
53
54# return number of files processed, undef if can't process
55# Note that $base_dir might be "" and that $file might
56# include directories
57sub read {
58 my $self = shift (@_);
59 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
60
61 my $filename = &util::filename_cat($base_dir, $file);
62 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
63 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
64 return undef;
65 }
66 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
67
68 print STDERR "GMLPlug: processing $file\n";
69
70 my $parent_dir = $file;
71 $parent_dir =~ s/[^\\\/]*$//;
72 $parent_dir = &util::filename_cat ($base_dir, $parent_dir);
73
74 # all this gzip stuff should one day be replaced by a gzip/bzip/zip/tar
75 # handling plugin
76 my $gz = 0;
77 if ($file =~ /\.gz$/i) {
78 $gz = 1;
79 }
80
81 # read in the document - input is assumed throughout this plugin to already be utf8
82 if ($gz) {
83 if (!open (INFILE, "zcat $filename |")) {
84 print STDERR "GMLPlug::read - zcat couldn't read $filename\n";
85 return 0;
86 }
87 } else {
88 if (!open (INFILE, $filename)) {
89 print STDERR "GMLPlug::read - couldn't read $filename\n";
90 return 0;
91 }
92 }
93
94 undef $/;
95 my $gml = <INFILE>;
96 $/ = "\n";
97 close (INFILE);
98
99 my @gml_sections = split("</gsdlsection>",$gml);
100 $gml = shift(@gml_sections);
101
102 my $no_docs = 0;
103
104 while (1) {
105 # create a new document
106 my $doc_obj = new doc ();
107 my $section = $doc_obj->get_top_section();
108
109 # process the document
110 my $firstsection = 1;
111 while (1) {
112 my ($tags, $text) = ("", "");
113
114 my @indenting_sections = split("<gsdlsection", $gml);
115 shift(@indenting_sections); # first entry is trivially empty
116
117 foreach $gml (@indenting_sections) {
118
119 if ($gml =~ /^\s*([^>]*)>(.*)$/so) {
120 $tags = $1 if defined $1;
121 $text = &GMLPlug::_unescape_text($2);
122
123 } else {
124 print STDERR "GMLPlug::read - error in file $filename\n";
125 print STDERR "text: \"$gml\"\n";
126 last;
127 }
128
129 # create the section (unless this is the first section)
130 if ($firstsection) {
131 $firstsection = 0;
132# $tags =~ /gsdlsourcefilename\s*=\s*(?:\"([^\"]*)\")/o;
133# $src_filename = $2 || $3;
134
135 } else {
136
137 $tags =~ s/gsdlnum\s*=\s*\"?(\d+)\"?//o;
138 if (defined $1) {
139 $section .= ".$1";
140 $doc_obj->create_named_section($section);
141 } else {
142 $section = $doc_obj->insert_section($doc_obj->get_end_child($section));
143 }
144 }
145
146 # add the tags
147 while ((defined $tags) && ($tags =~ s/^\s*(\S+)=\"([^\"]*)\"//o)) {
148 $doc_obj->add_utf8_metadata($section, $1, &GMLPlug::_unescape_text($2))
149 if (defined $1 and defined $2);
150
151 }
152
153 # add the text
154 $doc_obj->add_utf8_text($section, $text)
155 if ((defined $text) && ($text ne ""));
156 }
157
158 $gml = shift(@gml_sections); # get next bit of data
159 last unless defined $gml;
160 last if $section eq ""; # back to top level again (more than one document in gml file)
161 $section = $doc_obj->get_parent_section ($section);
162 } # while (1) section level
163
164 # add the associated files
165 my $assoc_files = $doc_obj->get_metadata($doc_obj->get_top_section(), "gsdlassocfile");
166 my ($assoc_file_info, $afile);
167 foreach $assoc_file_info (@$assoc_files) {
168 my ($assoc_file, $mime_type, $dir) = split (":", $assoc_file_info);
169 if (defined $dir && $dir =~ /\w/) {
170 $afile = &util::filename_cat($dir, $assoc_file);
171 } else {
172 $afile = $assoc_file;
173 }
174 $doc_obj->associate_file(&util::filename_cat($parent_dir, $assoc_file),
175 $afile, $mime_type);
176 }
177 $doc_obj->delete_metadata($doc_obj->get_top_section(), "gsdlassocfile");
178
179 # add metadata passed in from elsewhere
180 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
181
182 # do any automatic metadata extraction
183 $self->auto_extract_metadata ($doc_obj);
184
185 # assume the document has an OID already
186
187 # process the document
188 $processor->process($doc_obj, $file);
189
190 $no_docs++;
191 last if ($maxdocs > -1 && $no_docs >= $maxdocs);
192 last unless defined $gml && $gml =~ /\w/;
193 } # while(1) document level
194
195 return $no_docs; # no of docs processed
196}
197
198sub _unescape_text {
199 my ($text) = @_;
200
201 # special characters in the gml encoding
202 $text =~ s/&lt;/</g;
203 $text =~ s/&gt;/>/g;
204 $text =~ s/&quot;/\"/g;
205 $text =~ s/&amp;/&/g; # this has to be last...
206
207 return $text;
208}
209
2101;
Note: See TracBrowser for help on using the repository browser.