source: trunk/gsdl/perllib/plugins/GMLPlug.pm@ 7243

Last change on this file since 7243 was 6408, checked in by jmt12, 20 years ago

Added two new attributes for script arguments. HiddenGLI controls whether the argument will be visible at all in GLI, while ModeGLI defines the lowest detail mode under which the argument will be visible (only really for import and buildcol). Also ensured that the scripts were reporting their correct default process expressions, and further refined argument types by adding the catagory regexp for any regular expression (which can then be hidden under lower detail modes in GLI)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.8 KB
Line 
1###########################################################################
2#
3# GMLPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# plugin which processes a GML format document
27# assumes that gml tags are all in lower-case.
28
29# 12/05/02 Added usage datastructure - John Thompson
30
31package GMLPlug;
32
33use BasPlug;
34use util;
35use doc;
36
37sub BEGIN {
38 @ISA = ('BasPlug');
39}
40
41my $arguments =
42 [ { 'name' => "process_exp",
43 'desc' => "{BasPlug.process_exp}",
44 'type' => "regexp",
45 'deft' => &get_default_process_exp() }
46 ];
47
48my $options = { 'name' => "GMLPlug",
49 'desc' => "{GMLPlug.desc}",
50 'abstract' => "no",
51 'inherits' => "yes",
52 'args' => $arguments };
53
54sub new {
55 my ($class) = @_;
56 my $self = new BasPlug ("GMLPlug", @_);
57
58 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
59 my $option_list = $self->{'option_list'};
60 push( @{$option_list}, $options );
61
62 return bless $self, $class;}
63
64sub get_default_process_exp {
65 my $self = shift (@_);
66
67 return q^(?i)\.gml?$^;
68}
69
70# return number of files processed, undef if can't process
71# Note that $base_dir might be "" and that $file might
72# include directories
73sub read {
74 my $self = shift (@_);
75 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli) = @_;
76 my $outhandle = $self->{'outhandle'};
77
78 my $filename = $file;
79 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
80
81 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
82 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
83 return undef;
84 }
85 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
86
87 print STDERR "<Processing n='$file' p='GMLPlug'>\n" if ($gli);
88 print $outhandle "GMLPlug: processing $file\n";
89
90 my $parent_dir = $file;
91 $parent_dir =~ s/[^\\\/]*$//;
92 $parent_dir = &util::filename_cat ($base_dir, $parent_dir);
93
94 if (!open (INFILE, $filename)) {
95 print $outhandle "GMLPlug::read - couldn't read $filename\n";
96 return 0;
97 }
98
99 undef $/;
100 my $gml = <INFILE>;
101 $/ = "\n";
102 close (INFILE);
103
104 my @gml_sections = split("</gsdlsection>",$gml);
105 $gml = shift(@gml_sections);
106
107 my $no_docs = 0;
108
109 while (1) {
110 # create a new document
111 my $doc_obj = new doc ();
112 my $section = $doc_obj->get_top_section();
113
114 # process the document
115 my $firstsection = 1;
116 while (1) {
117 my ($tags, $text) = ("", "");
118
119 my @indenting_sections = split("<gsdlsection", $gml);
120 shift(@indenting_sections); # skips over xml header if present
121
122 foreach $gml (@indenting_sections) {
123
124 if ($gml =~ /^\s*([^>]*)>(.*)$/so) {
125 $tags = $1 if defined $1;
126 $text = &GMLPlug::_unescape_text($2);
127
128 } else {
129 print $outhandle "GMLPlug::read - error in file $filename\n";
130 print $outhandle "text: \"$gml\"\n";
131 last;
132 }
133
134 # create the section (unless this is the first section)
135 if ($firstsection) {
136 $firstsection = 0;
137# $tags =~ /gsdlsourcefilename\s*=\s*(?:\"([^\"]*)\")/o;
138# $src_filename = $2 || $3;
139
140 } else {
141
142 $tags =~ s/gsdlnum\s*=\s*\"?(\d+)\"?//o;
143 if (defined $1) {
144 $section .= ".$1";
145 $doc_obj->create_named_section($section);
146 } else {
147 $section = $doc_obj->insert_section($doc_obj->get_end_child($section));
148 }
149 }
150
151 # add the metadata
152 # could be stored as either attributes or ....
153 while ((defined $tags) && ($tags =~ s/^\s*(\S+)=\"([^\"]*)\"//o)) {
154 $doc_obj->add_utf8_metadata($section, $1, &GMLPlug::_unescape_text($2))
155 if (defined $1 and defined $2);
156
157 }
158
159 # ... or tags (xml compliant)
160 if ($text =~ m/^\s*<metadata>/)
161 {
162 my ($metadata, $tagname, $tagvalue);
163 ($metadata,$text)
164 = ($text =~ m/\s*<metadata>\s*(<.*)\s*<\/metadata>(.*)$/s);
165
166 # note: \1 refers to 1st match within regexp, so we can
167 # handle the unescaped text here...
168 while ((defined $metadata)
169 && ($metadata =~ s/<(.*?)>(.*?)<\/\1>//s))
170 {
171 if (defined $1 && defined $2)
172 {
173 $tagname = $1;
174 $tagvalue = $2;
175
176 # if tagname begins with '/' it will be escaped
177 $tagname =~ s/^&\#47;/\//;
178
179 $doc_obj->add_utf8_metadata($section, $tagname, &GMLPlug::_unescape_text($tagvalue));
180 }
181 }
182 }
183
184 # add the text
185
186 $doc_obj->add_utf8_text($section, $text)
187 if ((defined $text) && ($text ne ""));
188 }
189
190 $gml = shift(@gml_sections); # get next bit of data
191 last unless defined $gml;
192 last if $section eq ""; # back to top level again (more than one document in gml file)
193 $section = $doc_obj->get_parent_section ($section);
194 } # while (1) section level
195
196 # add the associated files
197 my $assoc_files = $doc_obj->get_metadata($doc_obj->get_top_section(), "gsdlassocfile");
198 my ($assoc_file_info);
199
200 foreach $assoc_file_info (@$assoc_files)
201 {
202 my ($assoc_file, $mime_type, $dir) = split (":", $assoc_file_info);
203 my $real_dir = &util::filename_cat($parent_dir, $assoc_file),
204 my $assoc_dir = (defined $dir && $dir ne "")
205 ? &util::filename_cat($dir, $assoc_file) : $assoc_file;
206 $doc_obj->associate_file($real_dir, $assoc_dir, $mime_type);
207
208 }
209 $doc_obj->delete_metadata($doc_obj->get_top_section(), "gsdlassocfile");
210
211 # add metadata passed in from elsewhere
212 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
213
214 # do any automatic metadata extraction
215 $self->auto_extract_metadata ($doc_obj);
216
217 # assume the document has an OID already
218
219 # process the document
220 $processor->process($doc_obj, $file);
221
222 $no_docs++;
223 last if ($maxdocs > -1 && $no_docs >= $maxdocs);
224 last unless defined $gml && $gml =~ /\w/;
225 } # while(1) document level
226
227 return $no_docs; # no of docs processed
228}
229
230sub _unescape_text {
231 my ($text) = @_;
232
233 # special characters in the gml encoding
234 $text =~ s/&lt;/</g;
235 $text =~ s/&gt;/>/g;
236 $text =~ s/&quot;/\"/g;
237 $text =~ s/&amp;/&/g; # this has to be last...
238
239 return $text;
240}
241
2421;
Note: See TracBrowser for help on using the repository browser.