source: trunk/gsdl/perllib/plugins/GMLPlug.pm@ 9823

Last change on this file since 9823 was 9703, checked in by mdewsnip, 19 years ago

Improvement to previous change so "file not processed" messages are seen in Expert mode.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.0 KB
Line 
1###########################################################################
2#
3# GMLPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# plugin which processes a GML format document
27# assumes that gml tags are all in lower-case.
28
29# 12/05/02 Added usage datastructure - John Thompson
30
31package GMLPlug;
32
33use BasPlug;
34use util;
35use doc;
36
37sub BEGIN {
38 @ISA = ('BasPlug');
39}
40
41my $arguments =
42 [ { 'name' => "process_exp",
43 'desc' => "{BasPlug.process_exp}",
44 'type' => "regexp",
45 'deft' => &get_default_process_exp() }
46 ];
47
48my $options = { 'name' => "GMLPlug",
49 'desc' => "{GMLPlug.desc}",
50 'abstract' => "no",
51 'inherits' => "yes",
52 'args' => $arguments };
53
54sub new {
55 my ($class) = @_;
56 my $self = new BasPlug ("GMLPlug", @_);
57
58 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
59 my $option_list = $self->{'option_list'};
60 push( @{$option_list}, $options );
61
62 return bless $self, $class;}
63
64sub get_default_process_exp {
65 my $self = shift (@_);
66
67 return q^(?i)\.gml?$^;
68}
69
70# return number of files processed, undef if can't process
71# Note that $base_dir might be "" and that $file might
72# include directories
73sub read {
74 my $self = shift (@_);
75 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli) = @_;
76 my $outhandle = $self->{'outhandle'};
77
78 my $filename = $file;
79 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
80
81 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
82 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
83 return undef;
84 }
85 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
86
87 print STDERR "<Processing n='$file' p='GMLPlug'>\n" if ($gli);
88 print $outhandle "GMLPlug: processing $file\n";
89
90 my $parent_dir = $file;
91 $parent_dir =~ s/[^\\\/]*$//;
92 $parent_dir = &util::filename_cat ($base_dir, $parent_dir);
93
94 if (!open (INFILE, $filename)) {
95 if ($gli) {
96 print STDERR "<ProcessingError n='$file' r='Could not read $filename'>\n";
97 }
98 print $outhandle "GMLPlug::read - couldn't read $filename\n";
99 return -1;
100 }
101
102 undef $/;
103 my $gml = <INFILE>;
104 $/ = "\n";
105 close (INFILE);
106
107 my @gml_sections = split("</gsdlsection>",$gml);
108 $gml = shift(@gml_sections);
109
110 my $no_docs = 0;
111
112 while (1) {
113 # create a new document
114 my $doc_obj = new doc ();
115 my $section = $doc_obj->get_top_section();
116
117 # process the document
118 my $firstsection = 1;
119 while (1) {
120 my ($tags, $text) = ("", "");
121
122 my @indenting_sections = split("<gsdlsection", $gml);
123 shift(@indenting_sections); # skips over xml header if present
124
125 foreach $gml (@indenting_sections) {
126
127 if ($gml =~ /^\s*([^>]*)>(.*)$/so) {
128 $tags = $1 if defined $1;
129 $text = &GMLPlug::_unescape_text($2);
130
131 } else {
132 print $outhandle "GMLPlug::read - error in file $filename\n";
133 print $outhandle "text: \"$gml\"\n";
134 last;
135 }
136
137 # create the section (unless this is the first section)
138 if ($firstsection) {
139 $firstsection = 0;
140# $tags =~ /gsdlsourcefilename\s*=\s*(?:\"([^\"]*)\")/o;
141# $src_filename = $2 || $3;
142
143 } else {
144
145 $tags =~ s/gsdlnum\s*=\s*\"?(\d+)\"?//o;
146 if (defined $1) {
147 $section .= ".$1";
148 $doc_obj->create_named_section($section);
149 } else {
150 $section = $doc_obj->insert_section($doc_obj->get_end_child($section));
151 }
152 }
153
154 # add the metadata
155 # could be stored as either attributes or ....
156 while ((defined $tags) && ($tags =~ s/^\s*(\S+)=\"([^\"]*)\"//o)) {
157 $doc_obj->add_utf8_metadata($section, $1, &GMLPlug::_unescape_text($2))
158 if (defined $1 and defined $2);
159
160 }
161
162 # ... or tags (xml compliant)
163 if ($text =~ m/^\s*<metadata>/)
164 {
165 my ($metadata, $tagname, $tagvalue);
166 ($metadata,$text)
167 = ($text =~ m/\s*<metadata>\s*(<.*)\s*<\/metadata>(.*)$/s);
168
169 # note: \1 refers to 1st match within regexp, so we can
170 # handle the unescaped text here...
171 while ((defined $metadata)
172 && ($metadata =~ s/<(.*?)>(.*?)<\/\1>//s))
173 {
174 if (defined $1 && defined $2)
175 {
176 $tagname = $1;
177 $tagvalue = $2;
178
179 # if tagname begins with '/' it will be escaped
180 $tagname =~ s/^&\#47;/\//;
181
182 $doc_obj->add_utf8_metadata($section, $tagname, &GMLPlug::_unescape_text($tagvalue));
183 }
184 }
185 }
186
187 # add the text
188
189 $doc_obj->add_utf8_text($section, $text)
190 if ((defined $text) && ($text ne ""));
191 }
192
193 $gml = shift(@gml_sections); # get next bit of data
194 last unless defined $gml;
195 last if $section eq ""; # back to top level again (more than one document in gml file)
196 $section = $doc_obj->get_parent_section ($section);
197 } # while (1) section level
198
199 # add the FileFormat as the metadata
200 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "GML");
201
202 # add the associated files
203 my $assoc_files = $doc_obj->get_metadata($doc_obj->get_top_section(), "gsdlassocfile");
204 my ($assoc_file_info);
205
206 foreach $assoc_file_info (@$assoc_files)
207 {
208 my ($assoc_file, $mime_type, $dir) = split (":", $assoc_file_info);
209 my $real_dir = &util::filename_cat($parent_dir, $assoc_file),
210 my $assoc_dir = (defined $dir && $dir ne "")
211 ? &util::filename_cat($dir, $assoc_file) : $assoc_file;
212 $doc_obj->associate_file($real_dir, $assoc_dir, $mime_type);
213
214 }
215 $doc_obj->delete_metadata($doc_obj->get_top_section(), "gsdlassocfile");
216
217 # add metadata passed in from elsewhere
218 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
219
220 # do any automatic metadata extraction
221 $self->auto_extract_metadata ($doc_obj);
222
223 # assume the document has an OID already
224
225 # process the document
226 $processor->process($doc_obj, $file);
227
228 $no_docs++;
229 last if ($maxdocs > -1 && $no_docs >= $maxdocs);
230 last unless defined $gml && $gml =~ /\w/;
231 } # while(1) document level
232
233 return $no_docs; # no of docs processed
234}
235
236sub _unescape_text {
237 my ($text) = @_;
238
239 # special characters in the gml encoding
240 $text =~ s/&lt;/</g;
241 $text =~ s/&gt;/>/g;
242 $text =~ s/&quot;/\"/g;
243 $text =~ s/&amp;/&/g; # this has to be last...
244
245 return $text;
246}
247
2481;
Note: See TracBrowser for help on using the repository browser.