source: gsdl/trunk/perllib/plugins/GMLPlugin.pm@ 15918

Last change on this file since 15918 was 15872, checked in by kjdon, 16 years ago

plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Alos, I haven't really tested anything yet...

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.1 KB
Line 
1###########################################################################
2#
3# GMLPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# plugin which processes a GML format document
27# assumes that gml tags are all in lower-case.
28
29# 12/05/02 Added usage datastructure - John Thompson
30
31package GMLPlugin;
32
33use BasePlugin;
34use util;
35use doc;
36
37use strict;
38no strict 'refs'; # allow filehandles to be variables and viceversa
39
40sub BEGIN {
41 @GMLPlugin::ISA = ('BasePlugin');
42}
43
44my $arguments =
45 [ { 'name' => "process_exp",
46 'desc' => "{BasePlugin.process_exp}",
47 'type' => "regexp",
48 'deft' => &get_default_process_exp() }
49 ];
50
51my $options = { 'name' => "GMLPlugin",
52 'desc' => "{GMLPlugin.desc}",
53 'abstract' => "no",
54 'inherits' => "yes",
55 'args' => $arguments };
56
57sub new {
58 my ($class) = shift (@_);
59 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
60 push(@$pluginlist, $class);
61
62 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
63 push(@{$hashArgOptLists->{"OptList"}},$options);
64
65 my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
66
67 return bless $self, $class;
68}
69
70sub get_default_process_exp {
71 my $self = shift (@_);
72
73 return q^(?i)\.gml?$^;
74}
75
76# return number of files processed, undef if can't process
77# Note that $base_dir might be "" and that $file might
78# include directories
79sub read {
80 my $self = shift (@_);
81 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
82 my $outhandle = $self->{'outhandle'};
83
84 #check process and block exps, smart block, etc
85 my ($block_status,$filename) = $self->read_block(@_);
86 return $block_status if ((!defined $block_status) || ($block_status==0));
87
88 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
89
90 print STDERR "<Processing n='$file' p='GMLPlugin'>\n" if ($gli);
91 print $outhandle "GMLPlugin: processing $file\n";
92
93 my $parent_dir = $file;
94 $parent_dir =~ s/[^\\\/]*$//;
95 $parent_dir = &util::filename_cat ($base_dir, $parent_dir);
96
97 if (!open (INFILE, $filename)) {
98 if ($gli) {
99 print STDERR "<ProcessingError n='$file' r='Could not read $filename'>\n";
100 }
101 print $outhandle "GMLPlugin::read - couldn't read $filename\n";
102 return -1;
103 }
104
105 undef $/;
106 my $gml = <INFILE>;
107 $/ = "\n";
108 close (INFILE);
109
110 my @gml_sections = split("</gsdlsection>",$gml);
111 $gml = shift(@gml_sections);
112
113 my $no_docs = 0;
114
115 while (1) {
116 # create a new document
117 my $doc_obj = new doc ();
118 my $section = $doc_obj->get_top_section();
119
120 # process the document
121 my $firstsection = 1;
122 while (1) {
123 my ($tags, $text) = ("", "");
124
125 my @indenting_sections = split("<gsdlsection", $gml);
126 shift(@indenting_sections); # skips over xml header if present
127
128 foreach $gml (@indenting_sections) {
129
130 if ($gml =~ /^\s*([^>]*)>(.*)$/so) {
131 $tags = $1 if defined $1;
132 $text = &GMLPlugin::_unescape_text($2);
133
134 } else {
135 print $outhandle "GMLPlugin::read - error in file $filename\n";
136 print $outhandle "text: \"$gml\"\n";
137 last;
138 }
139
140 # create the section (unless this is the first section)
141 if ($firstsection) {
142 $firstsection = 0;
143# $tags =~ /gsdlsourcefilename\s*=\s*(?:\"([^\"]*)\")/o;
144# $src_filename = $2 || $3;
145
146 } else {
147
148 $tags =~ s/gsdlnum\s*=\s*\"?(\d+)\"?//o;
149 if (defined $1) {
150 $section .= ".$1";
151 $doc_obj->create_named_section($section);
152 } else {
153 $section = $doc_obj->insert_section($doc_obj->get_end_child($section));
154 }
155 }
156
157 # add the metadata
158 # could be stored as either attributes or ....
159 while ((defined $tags) && ($tags =~ s/^\s*(\S+)=\"([^\"]*)\"//o)) {
160 $doc_obj->add_utf8_metadata($section, $1, &GMLPlugin::_unescape_text($2))
161 if (defined $1 and defined $2);
162
163 }
164
165 # ... or tags (xml compliant)
166 if ($text =~ m/^\s*<metadata>/)
167 {
168 my ($metadata, $tagname, $tagvalue);
169 ($metadata,$text)
170 = ($text =~ m/\s*<metadata>\s*(<.*)\s*<\/metadata>(.*)$/s);
171
172 # note: \1 refers to 1st match within regexp, so we can
173 # handle the unescaped text here...
174 while ((defined $metadata)
175 && ($metadata =~ s/<(.*?)>(.*?)<\/\1>//s))
176 {
177 if (defined $1 && defined $2)
178 {
179 $tagname = $1;
180 $tagvalue = $2;
181
182 # if tagname begins with '/' it will be escaped
183 $tagname =~ s/^&\#47;/\//;
184
185 $doc_obj->add_utf8_metadata($section, $tagname, &GMLPlugin::_unescape_text($tagvalue));
186 }
187 }
188 }
189
190 # add the text
191
192 $doc_obj->add_utf8_text($section, $text)
193 if ((defined $text) && ($text ne ""));
194 }
195
196 $gml = shift(@gml_sections); # get next bit of data
197 last unless defined $gml;
198 last if $section eq ""; # back to top level again (more than one document in gml file)
199 $section = $doc_obj->get_parent_section ($section);
200 } # while (1) section level
201
202 # add the FileFormat as the metadata
203 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "GML");
204
205 # add the associated files
206 my $assoc_files = $doc_obj->get_metadata($doc_obj->get_top_section(), "gsdlassocfile");
207 my ($assoc_file_info);
208
209 foreach $assoc_file_info (@$assoc_files)
210 {
211 my ($assoc_file, $mime_type, $dir) = split (":", $assoc_file_info);
212 my $real_dir = &util::filename_cat($parent_dir, $assoc_file),
213 my $assoc_dir = (defined $dir && $dir ne "")
214 ? &util::filename_cat($dir, $assoc_file) : $assoc_file;
215 $doc_obj->associate_file($real_dir, $assoc_dir, $mime_type);
216
217 }
218 $doc_obj->delete_metadata($doc_obj->get_top_section(), "gsdlassocfile");
219
220 # add metadata passed in from elsewhere
221 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
222
223 # do any automatic metadata extraction
224 $self->auto_extract_metadata ($doc_obj);
225
226 # assume the document has an OID already
227
228 # process the document
229 $processor->process($doc_obj, $file);
230
231 $no_docs++;
232 last if ($maxdocs > -1 && ($total_count+$no_docs) >= $maxdocs);
233 last unless defined $gml && $gml =~ /\w/;
234 } # while(1) document level
235
236 return $no_docs; # no of docs processed
237}
238
239sub _unescape_text {
240 my ($text) = @_;
241
242 # special characters in the gml encoding
243 $text =~ s/&lt;/</g;
244 $text =~ s/&gt;/>/g;
245 $text =~ s/&quot;/\"/g;
246 $text =~ s/&amp;/&/g; # this has to be last...
247
248 return $text;
249}
250
2511;
Note: See TracBrowser for help on using the repository browser.