source: gsdl/trunk/perllib/plugins/GMLPlugin.pm@ 15918

Last change on this file since 15918 was 15872, checked in by kjdon, 16 years ago

plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Alos, I haven't really tested anything yet...

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.1 KB
RevLine 
[537]1###########################################################################
2#
[15872]3# GMLPlugin.pm --
[537]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
[4]25
[537]26# plugin which processes a GML format document
[840]27# assumes that gml tags are all in lower-case.
[537]28
[3540]29# 12/05/02 Added usage datastructure - John Thompson
30
[15872]31package GMLPlugin;
[4]32
[15872]33use BasePlugin;
[4]34use util;
35use doc;
36
[10254]37use strict;
38no strict 'refs'; # allow filehandles to be variables and viceversa
39
[4]40sub BEGIN {
[15872]41 @GMLPlugin::ISA = ('BasePlugin');
[4]42}
43
[6408]44my $arguments =
45 [ { 'name' => "process_exp",
[15872]46 'desc' => "{BasePlugin.process_exp}",
[6408]47 'type' => "regexp",
48 'deft' => &get_default_process_exp() }
49 ];
50
[15872]51my $options = { 'name' => "GMLPlugin",
52 'desc' => "{GMLPlugin.desc}",
[6408]53 'abstract' => "no",
54 'inherits' => "yes",
55 'args' => $arguments };
[3540]56
[4]57sub new {
[10218]58 my ($class) = shift (@_);
59 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
60 push(@$pluginlist, $class);
[4]61
[15872]62 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
63 push(@{$hashArgOptLists->{"OptList"}},$options);
[3540]64
[15872]65 my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
[4]66
[10218]67 return bless $self, $class;
68}
69
[1244]70sub get_default_process_exp {
[4]71 my $self = shift (@_);
72
[1269]73 return q^(?i)\.gml?$^;
[4]74}
75
[317]76# return number of files processed, undef if can't process
[4]77# Note that $base_dir might be "" and that $file might
78# include directories
79sub read {
80 my $self = shift (@_);
[9853]81 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[1424]82 my $outhandle = $self->{'outhandle'};
[4]83
[11090]84 #check process and block exps, smart block, etc
85 my ($block_status,$filename) = $self->read_block(@_);
86 return $block_status if ((!defined $block_status) || ($block_status==0));
[2795]87
[1244]88 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
[4]89
[15872]90 print STDERR "<Processing n='$file' p='GMLPlugin'>\n" if ($gli);
91 print $outhandle "GMLPlugin: processing $file\n";
[4]92
[1244]93 my $parent_dir = $file;
94 $parent_dir =~ s/[^\\\/]*$//;
95 $parent_dir = &util::filename_cat ($base_dir, $parent_dir);
96
[1269]97 if (!open (INFILE, $filename)) {
[9584]98 if ($gli) {
99 print STDERR "<ProcessingError n='$file' r='Could not read $filename'>\n";
100 }
[15872]101 print $outhandle "GMLPlugin::read - couldn't read $filename\n";
[7362]102 return -1;
[433]103 }
104
[840]105 undef $/;
[862]106 my $gml = <INFILE>;
[840]107 $/ = "\n";
[4]108 close (INFILE);
[862]109
[840]110 my @gml_sections = split("</gsdlsection>",$gml);
111 $gml = shift(@gml_sections);
[4]112
[840]113 my $no_docs = 0;
114
[862]115 while (1) {
[840]116 # create a new document
117 my $doc_obj = new doc ();
118 my $section = $doc_obj->get_top_section();
119
120 # process the document
121 my $firstsection = 1;
[862]122 while (1) {
123 my ($tags, $text) = ("", "");
[840]124
[862]125 my @indenting_sections = split("<gsdlsection", $gml);
[2267]126 shift(@indenting_sections); # skips over xml header if present
[840]127
[862]128 foreach $gml (@indenting_sections) {
[840]129
[862]130 if ($gml =~ /^\s*([^>]*)>(.*)$/so) {
131 $tags = $1 if defined $1;
[15872]132 $text = &GMLPlugin::_unescape_text($2);
[862]133
134 } else {
[15872]135 print $outhandle "GMLPlugin::read - error in file $filename\n";
[1424]136 print $outhandle "text: \"$gml\"\n";
[840]137 last;
138 }
[4]139
[840]140 # create the section (unless this is the first section)
[862]141 if ($firstsection) {
[863]142 $firstsection = 0;
[862]143# $tags =~ /gsdlsourcefilename\s*=\s*(?:\"([^\"]*)\")/o;
144# $src_filename = $2 || $3;
[840]145
[862]146 } else {
147
148 $tags =~ s/gsdlnum\s*=\s*\"?(\d+)\"?//o;
149 if (defined $1) {
[840]150 $section .= ".$1";
151 $doc_obj->create_named_section($section);
[862]152 } else {
[840]153 $section = $doc_obj->insert_section($doc_obj->get_end_child($section));
154 }
155 }
156
[2267]157 # add the metadata
158 # could be stored as either attributes or ....
[863]159 while ((defined $tags) && ($tags =~ s/^\s*(\S+)=\"([^\"]*)\"//o)) {
[15872]160 $doc_obj->add_utf8_metadata($section, $1, &GMLPlugin::_unescape_text($2))
[840]161 if (defined $1 and defined $2);
[863]162
[840]163 }
164
[2267]165 # ... or tags (xml compliant)
166 if ($text =~ m/^\s*<metadata>/)
167 {
[2326]168 my ($metadata, $tagname, $tagvalue);
[2267]169 ($metadata,$text)
170 = ($text =~ m/\s*<metadata>\s*(<.*)\s*<\/metadata>(.*)$/s);
[2363]171
172 # note: \1 refers to 1st match within regexp, so we can
173 # handle the unescaped text here...
[2267]174 while ((defined $metadata)
[2363]175 && ($metadata =~ s/<(.*?)>(.*?)<\/\1>//s))
[2267]176 {
[2326]177 if (defined $1 && defined $2)
178 {
179 $tagname = $1;
180 $tagvalue = $2;
181
182 # if tagname begins with '/' it will be escaped
183 $tagname =~ s/^&\#47;/\//;
184
[15872]185 $doc_obj->add_utf8_metadata($section, $tagname, &GMLPlugin::_unescape_text($tagvalue));
[2326]186 }
[2267]187 }
188 }
189
[840]190 # add the text
[2267]191
[840]192 $doc_obj->add_utf8_text($section, $text)
193 if ((defined $text) && ($text ne ""));
[4]194 }
195
[840]196 $gml = shift(@gml_sections); # get next bit of data
[862]197 last unless defined $gml;
[863]198 last if $section eq ""; # back to top level again (more than one document in gml file)
[4]199 $section = $doc_obj->get_parent_section ($section);
[1244]200 } # while (1) section level
[8121]201
202 # add the FileFormat as the metadata
203 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "GML");
[4]204
[862]205 # add the associated files
[1244]206 my $assoc_files = $doc_obj->get_metadata($doc_obj->get_top_section(), "gsdlassocfile");
[1401]207 my ($assoc_file_info);
[8121]208
[1401]209 foreach $assoc_file_info (@$assoc_files)
210 {
[862]211 my ($assoc_file, $mime_type, $dir) = split (":", $assoc_file_info);
[1401]212 my $real_dir = &util::filename_cat($parent_dir, $assoc_file),
213 my $assoc_dir = (defined $dir && $dir ne "")
214 ? &util::filename_cat($dir, $assoc_file) : $assoc_file;
215 $doc_obj->associate_file($real_dir, $assoc_dir, $mime_type);
216
[840]217 }
218 $doc_obj->delete_metadata($doc_obj->get_top_section(), "gsdlassocfile");
[1401]219
[863]220 # add metadata passed in from elsewhere
221 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
[862]222
[1244]223 # do any automatic metadata extraction
224 $self->auto_extract_metadata ($doc_obj);
225
226 # assume the document has an OID already
[1401]227
[840]228 # process the document
229 $processor->process($doc_obj, $file);
[862]230
[840]231 $no_docs++;
[9853]232 last if ($maxdocs > -1 && ($total_count+$no_docs) >= $maxdocs);
[918]233 last unless defined $gml && $gml =~ /\w/;
[1244]234 } # while(1) document level
[1401]235
[840]236 return $no_docs; # no of docs processed
[4]237}
238
[1244]239sub _unescape_text {
240 my ($text) = @_;
[4]241
[1244]242 # special characters in the gml encoding
243 $text =~ s/&lt;/</g;
244 $text =~ s/&gt;/>/g;
245 $text =~ s/&quot;/\"/g;
246 $text =~ s/&amp;/&/g; # this has to be last...
247
248 return $text;
249}
250
[4]2511;
Note: See TracBrowser for help on using the repository browser.