source: trunk/gsdl/perllib/plugins/GMLPlug.pm@ 10254

Last change on this file since 10254 was 10254, checked in by kjdon, 19 years ago

added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.3 KB
Line 
1###########################################################################
2#
3# GMLPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# plugin which processes a GML format document
27# assumes that gml tags are all in lower-case.
28
29# 12/05/02 Added usage datastructure - John Thompson
30
31package GMLPlug;
32
33use BasPlug;
34use util;
35use doc;
36
37use strict;
38no strict 'refs'; # allow filehandles to be variables and viceversa
39
40sub BEGIN {
41 @GMLPlug::ISA = ('BasPlug');
42}
43
44my $arguments =
45 [ { 'name' => "process_exp",
46 'desc' => "{BasPlug.process_exp}",
47 'type' => "regexp",
48 'deft' => &get_default_process_exp() }
49 ];
50
51my $options = { 'name' => "GMLPlug",
52 'desc' => "{GMLPlug.desc}",
53 'abstract' => "no",
54 'inherits' => "yes",
55 'args' => $arguments };
56
57sub new {
58 my ($class) = shift (@_);
59 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
60 push(@$pluginlist, $class);
61
62 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
63 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
64
65 my $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
66
67 return bless $self, $class;
68}
69
70sub get_default_process_exp {
71 my $self = shift (@_);
72
73 return q^(?i)\.gml?$^;
74}
75
76# return number of files processed, undef if can't process
77# Note that $base_dir might be "" and that $file might
78# include directories
79sub read {
80 my $self = shift (@_);
81 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
82 my $outhandle = $self->{'outhandle'};
83
84 my $filename = $file;
85 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
86
87 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
88 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
89 return undef;
90 }
91 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
92
93 print STDERR "<Processing n='$file' p='GMLPlug'>\n" if ($gli);
94 print $outhandle "GMLPlug: processing $file\n";
95
96 my $parent_dir = $file;
97 $parent_dir =~ s/[^\\\/]*$//;
98 $parent_dir = &util::filename_cat ($base_dir, $parent_dir);
99
100 if (!open (INFILE, $filename)) {
101 if ($gli) {
102 print STDERR "<ProcessingError n='$file' r='Could not read $filename'>\n";
103 }
104 print $outhandle "GMLPlug::read - couldn't read $filename\n";
105 return -1;
106 }
107
108 undef $/;
109 my $gml = <INFILE>;
110 $/ = "\n";
111 close (INFILE);
112
113 my @gml_sections = split("</gsdlsection>",$gml);
114 $gml = shift(@gml_sections);
115
116 my $no_docs = 0;
117
118 while (1) {
119 # create a new document
120 my $doc_obj = new doc ();
121 my $section = $doc_obj->get_top_section();
122
123 # process the document
124 my $firstsection = 1;
125 while (1) {
126 my ($tags, $text) = ("", "");
127
128 my @indenting_sections = split("<gsdlsection", $gml);
129 shift(@indenting_sections); # skips over xml header if present
130
131 foreach $gml (@indenting_sections) {
132
133 if ($gml =~ /^\s*([^>]*)>(.*)$/so) {
134 $tags = $1 if defined $1;
135 $text = &GMLPlug::_unescape_text($2);
136
137 } else {
138 print $outhandle "GMLPlug::read - error in file $filename\n";
139 print $outhandle "text: \"$gml\"\n";
140 last;
141 }
142
143 # create the section (unless this is the first section)
144 if ($firstsection) {
145 $firstsection = 0;
146# $tags =~ /gsdlsourcefilename\s*=\s*(?:\"([^\"]*)\")/o;
147# $src_filename = $2 || $3;
148
149 } else {
150
151 $tags =~ s/gsdlnum\s*=\s*\"?(\d+)\"?//o;
152 if (defined $1) {
153 $section .= ".$1";
154 $doc_obj->create_named_section($section);
155 } else {
156 $section = $doc_obj->insert_section($doc_obj->get_end_child($section));
157 }
158 }
159
160 # add the metadata
161 # could be stored as either attributes or ....
162 while ((defined $tags) && ($tags =~ s/^\s*(\S+)=\"([^\"]*)\"//o)) {
163 $doc_obj->add_utf8_metadata($section, $1, &GMLPlug::_unescape_text($2))
164 if (defined $1 and defined $2);
165
166 }
167
168 # ... or tags (xml compliant)
169 if ($text =~ m/^\s*<metadata>/)
170 {
171 my ($metadata, $tagname, $tagvalue);
172 ($metadata,$text)
173 = ($text =~ m/\s*<metadata>\s*(<.*)\s*<\/metadata>(.*)$/s);
174
175 # note: \1 refers to 1st match within regexp, so we can
176 # handle the unescaped text here...
177 while ((defined $metadata)
178 && ($metadata =~ s/<(.*?)>(.*?)<\/\1>//s))
179 {
180 if (defined $1 && defined $2)
181 {
182 $tagname = $1;
183 $tagvalue = $2;
184
185 # if tagname begins with '/' it will be escaped
186 $tagname =~ s/^&\#47;/\//;
187
188 $doc_obj->add_utf8_metadata($section, $tagname, &GMLPlug::_unescape_text($tagvalue));
189 }
190 }
191 }
192
193 # add the text
194
195 $doc_obj->add_utf8_text($section, $text)
196 if ((defined $text) && ($text ne ""));
197 }
198
199 $gml = shift(@gml_sections); # get next bit of data
200 last unless defined $gml;
201 last if $section eq ""; # back to top level again (more than one document in gml file)
202 $section = $doc_obj->get_parent_section ($section);
203 } # while (1) section level
204
205 # add the FileFormat as the metadata
206 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "GML");
207
208 # add the associated files
209 my $assoc_files = $doc_obj->get_metadata($doc_obj->get_top_section(), "gsdlassocfile");
210 my ($assoc_file_info);
211
212 foreach $assoc_file_info (@$assoc_files)
213 {
214 my ($assoc_file, $mime_type, $dir) = split (":", $assoc_file_info);
215 my $real_dir = &util::filename_cat($parent_dir, $assoc_file),
216 my $assoc_dir = (defined $dir && $dir ne "")
217 ? &util::filename_cat($dir, $assoc_file) : $assoc_file;
218 $doc_obj->associate_file($real_dir, $assoc_dir, $mime_type);
219
220 }
221 $doc_obj->delete_metadata($doc_obj->get_top_section(), "gsdlassocfile");
222
223 # add metadata passed in from elsewhere
224 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
225
226 # do any automatic metadata extraction
227 $self->auto_extract_metadata ($doc_obj);
228
229 # assume the document has an OID already
230
231 # process the document
232 $processor->process($doc_obj, $file);
233
234 $no_docs++;
235 last if ($maxdocs > -1 && ($total_count+$no_docs) >= $maxdocs);
236 last unless defined $gml && $gml =~ /\w/;
237 } # while(1) document level
238
239 return $no_docs; # no of docs processed
240}
241
242sub _unescape_text {
243 my ($text) = @_;
244
245 # special characters in the gml encoding
246 $text =~ s/&lt;/</g;
247 $text =~ s/&gt;/>/g;
248 $text =~ s/&quot;/\"/g;
249 $text =~ s/&amp;/&/g; # this has to be last...
250
251 return $text;
252}
253
2541;
Note: See TracBrowser for help on using the repository browser.