[537] | 1 | ###########################################################################
|
---|
| 2 | #
|
---|
[15872] | 3 | # GMLPlugin.pm --
|
---|
[537] | 4 | # A component of the Greenstone digital library software
|
---|
| 5 | # from the New Zealand Digital Library Project at the
|
---|
| 6 | # University of Waikato, New Zealand.
|
---|
| 7 | #
|
---|
| 8 | # Copyright (C) 1999 New Zealand Digital Library Project
|
---|
| 9 | #
|
---|
| 10 | # This program is free software; you can redistribute it and/or modify
|
---|
| 11 | # it under the terms of the GNU General Public License as published by
|
---|
| 12 | # the Free Software Foundation; either version 2 of the License, or
|
---|
| 13 | # (at your option) any later version.
|
---|
| 14 | #
|
---|
| 15 | # This program is distributed in the hope that it will be useful,
|
---|
| 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 18 | # GNU General Public License for more details.
|
---|
| 19 | #
|
---|
| 20 | # You should have received a copy of the GNU General Public License
|
---|
| 21 | # along with this program; if not, write to the Free Software
|
---|
| 22 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 23 | #
|
---|
| 24 | ###########################################################################
|
---|
[4] | 25 |
|
---|
[537] | 26 | # plugin which processes a GML format document
|
---|
[840] | 27 | # assumes that gml tags are all in lower-case.
|
---|
[537] | 28 |
|
---|
[3540] | 29 | # 12/05/02 Added usage datastructure - John Thompson
|
---|
| 30 |
|
---|
[15872] | 31 | package GMLPlugin;
|
---|
[4] | 32 |
|
---|
[15872] | 33 | use BasePlugin;
|
---|
[4] | 34 | use util;
|
---|
| 35 | use doc;
|
---|
| 36 |
|
---|
[10254] | 37 | use strict;
|
---|
| 38 | no strict 'refs'; # allow filehandles to be variables and viceversa
|
---|
| 39 |
|
---|
[4] | 40 | sub BEGIN {
|
---|
[15872] | 41 | @GMLPlugin::ISA = ('BasePlugin');
|
---|
[4] | 42 | }
|
---|
| 43 |
|
---|
[6408] | 44 | my $arguments =
|
---|
| 45 | [ { 'name' => "process_exp",
|
---|
[15872] | 46 | 'desc' => "{BasePlugin.process_exp}",
|
---|
[6408] | 47 | 'type' => "regexp",
|
---|
| 48 | 'deft' => &get_default_process_exp() }
|
---|
| 49 | ];
|
---|
| 50 |
|
---|
[15872] | 51 | my $options = { 'name' => "GMLPlugin",
|
---|
| 52 | 'desc' => "{GMLPlugin.desc}",
|
---|
[6408] | 53 | 'abstract' => "no",
|
---|
| 54 | 'inherits' => "yes",
|
---|
| 55 | 'args' => $arguments };
|
---|
[3540] | 56 |
|
---|
[4] | 57 | sub new {
|
---|
[10218] | 58 | my ($class) = shift (@_);
|
---|
| 59 | my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
|
---|
| 60 | push(@$pluginlist, $class);
|
---|
[4] | 61 |
|
---|
[15872] | 62 | push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
|
---|
| 63 | push(@{$hashArgOptLists->{"OptList"}},$options);
|
---|
[3540] | 64 |
|
---|
[15872] | 65 | my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
|
---|
[4] | 66 |
|
---|
[10218] | 67 | return bless $self, $class;
|
---|
| 68 | }
|
---|
| 69 |
|
---|
[1244] | 70 | sub get_default_process_exp {
|
---|
[4] | 71 | my $self = shift (@_);
|
---|
| 72 |
|
---|
[1269] | 73 | return q^(?i)\.gml?$^;
|
---|
[4] | 74 | }
|
---|
| 75 |
|
---|
[317] | 76 | # return number of files processed, undef if can't process
|
---|
[4] | 77 | # Note that $base_dir might be "" and that $file might
|
---|
| 78 | # include directories
|
---|
| 79 | sub read {
|
---|
| 80 | my $self = shift (@_);
|
---|
[9853] | 81 | my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
|
---|
[1424] | 82 | my $outhandle = $self->{'outhandle'};
|
---|
[4] | 83 |
|
---|
[11090] | 84 | #check process and block exps, smart block, etc
|
---|
| 85 | my ($block_status,$filename) = $self->read_block(@_);
|
---|
| 86 | return $block_status if ((!defined $block_status) || ($block_status==0));
|
---|
[2795] | 87 |
|
---|
[1244] | 88 | $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
|
---|
[4] | 89 |
|
---|
[15872] | 90 | print STDERR "<Processing n='$file' p='GMLPlugin'>\n" if ($gli);
|
---|
| 91 | print $outhandle "GMLPlugin: processing $file\n";
|
---|
[4] | 92 |
|
---|
[1244] | 93 | my $parent_dir = $file;
|
---|
| 94 | $parent_dir =~ s/[^\\\/]*$//;
|
---|
| 95 | $parent_dir = &util::filename_cat ($base_dir, $parent_dir);
|
---|
| 96 |
|
---|
[1269] | 97 | if (!open (INFILE, $filename)) {
|
---|
[9584] | 98 | if ($gli) {
|
---|
| 99 | print STDERR "<ProcessingError n='$file' r='Could not read $filename'>\n";
|
---|
| 100 | }
|
---|
[15872] | 101 | print $outhandle "GMLPlugin::read - couldn't read $filename\n";
|
---|
[7362] | 102 | return -1;
|
---|
[433] | 103 | }
|
---|
| 104 |
|
---|
[840] | 105 | undef $/;
|
---|
[862] | 106 | my $gml = <INFILE>;
|
---|
[840] | 107 | $/ = "\n";
|
---|
[4] | 108 | close (INFILE);
|
---|
[862] | 109 |
|
---|
[840] | 110 | my @gml_sections = split("</gsdlsection>",$gml);
|
---|
| 111 | $gml = shift(@gml_sections);
|
---|
[4] | 112 |
|
---|
[840] | 113 | my $no_docs = 0;
|
---|
| 114 |
|
---|
[862] | 115 | while (1) {
|
---|
[840] | 116 | # create a new document
|
---|
| 117 | my $doc_obj = new doc ();
|
---|
| 118 | my $section = $doc_obj->get_top_section();
|
---|
| 119 |
|
---|
| 120 | # process the document
|
---|
| 121 | my $firstsection = 1;
|
---|
[862] | 122 | while (1) {
|
---|
| 123 | my ($tags, $text) = ("", "");
|
---|
[840] | 124 |
|
---|
[862] | 125 | my @indenting_sections = split("<gsdlsection", $gml);
|
---|
[2267] | 126 | shift(@indenting_sections); # skips over xml header if present
|
---|
[840] | 127 |
|
---|
[862] | 128 | foreach $gml (@indenting_sections) {
|
---|
[840] | 129 |
|
---|
[862] | 130 | if ($gml =~ /^\s*([^>]*)>(.*)$/so) {
|
---|
| 131 | $tags = $1 if defined $1;
|
---|
[15872] | 132 | $text = &GMLPlugin::_unescape_text($2);
|
---|
[862] | 133 |
|
---|
| 134 | } else {
|
---|
[15872] | 135 | print $outhandle "GMLPlugin::read - error in file $filename\n";
|
---|
[1424] | 136 | print $outhandle "text: \"$gml\"\n";
|
---|
[840] | 137 | last;
|
---|
| 138 | }
|
---|
[4] | 139 |
|
---|
[840] | 140 | # create the section (unless this is the first section)
|
---|
[862] | 141 | if ($firstsection) {
|
---|
[863] | 142 | $firstsection = 0;
|
---|
[862] | 143 | # $tags =~ /gsdlsourcefilename\s*=\s*(?:\"([^\"]*)\")/o;
|
---|
| 144 | # $src_filename = $2 || $3;
|
---|
[840] | 145 |
|
---|
[862] | 146 | } else {
|
---|
| 147 |
|
---|
| 148 | $tags =~ s/gsdlnum\s*=\s*\"?(\d+)\"?//o;
|
---|
| 149 | if (defined $1) {
|
---|
[840] | 150 | $section .= ".$1";
|
---|
| 151 | $doc_obj->create_named_section($section);
|
---|
[862] | 152 | } else {
|
---|
[840] | 153 | $section = $doc_obj->insert_section($doc_obj->get_end_child($section));
|
---|
| 154 | }
|
---|
| 155 | }
|
---|
| 156 |
|
---|
[2267] | 157 | # add the metadata
|
---|
| 158 | # could be stored as either attributes or ....
|
---|
[863] | 159 | while ((defined $tags) && ($tags =~ s/^\s*(\S+)=\"([^\"]*)\"//o)) {
|
---|
[15872] | 160 | $doc_obj->add_utf8_metadata($section, $1, &GMLPlugin::_unescape_text($2))
|
---|
[840] | 161 | if (defined $1 and defined $2);
|
---|
[863] | 162 |
|
---|
[840] | 163 | }
|
---|
| 164 |
|
---|
[2267] | 165 | # ... or tags (xml compliant)
|
---|
| 166 | if ($text =~ m/^\s*<metadata>/)
|
---|
| 167 | {
|
---|
[2326] | 168 | my ($metadata, $tagname, $tagvalue);
|
---|
[2267] | 169 | ($metadata,$text)
|
---|
| 170 | = ($text =~ m/\s*<metadata>\s*(<.*)\s*<\/metadata>(.*)$/s);
|
---|
[2363] | 171 |
|
---|
| 172 | # note: \1 refers to 1st match within regexp, so we can
|
---|
| 173 | # handle the unescaped text here...
|
---|
[2267] | 174 | while ((defined $metadata)
|
---|
[2363] | 175 | && ($metadata =~ s/<(.*?)>(.*?)<\/\1>//s))
|
---|
[2267] | 176 | {
|
---|
[2326] | 177 | if (defined $1 && defined $2)
|
---|
| 178 | {
|
---|
| 179 | $tagname = $1;
|
---|
| 180 | $tagvalue = $2;
|
---|
| 181 |
|
---|
| 182 | # if tagname begins with '/' it will be escaped
|
---|
| 183 | $tagname =~ s/^&\#47;/\//;
|
---|
| 184 |
|
---|
[15872] | 185 | $doc_obj->add_utf8_metadata($section, $tagname, &GMLPlugin::_unescape_text($tagvalue));
|
---|
[2326] | 186 | }
|
---|
[2267] | 187 | }
|
---|
| 188 | }
|
---|
| 189 |
|
---|
[840] | 190 | # add the text
|
---|
[2267] | 191 |
|
---|
[840] | 192 | $doc_obj->add_utf8_text($section, $text)
|
---|
| 193 | if ((defined $text) && ($text ne ""));
|
---|
[4] | 194 | }
|
---|
| 195 |
|
---|
[840] | 196 | $gml = shift(@gml_sections); # get next bit of data
|
---|
[862] | 197 | last unless defined $gml;
|
---|
[863] | 198 | last if $section eq ""; # back to top level again (more than one document in gml file)
|
---|
[4] | 199 | $section = $doc_obj->get_parent_section ($section);
|
---|
[1244] | 200 | } # while (1) section level
|
---|
[8121] | 201 |
|
---|
| 202 | # add the FileFormat as the metadata
|
---|
| 203 | $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "GML");
|
---|
[4] | 204 |
|
---|
[862] | 205 | # add the associated files
|
---|
[1244] | 206 | my $assoc_files = $doc_obj->get_metadata($doc_obj->get_top_section(), "gsdlassocfile");
|
---|
[1401] | 207 | my ($assoc_file_info);
|
---|
[8121] | 208 |
|
---|
[1401] | 209 | foreach $assoc_file_info (@$assoc_files)
|
---|
| 210 | {
|
---|
[862] | 211 | my ($assoc_file, $mime_type, $dir) = split (":", $assoc_file_info);
|
---|
[1401] | 212 | my $real_dir = &util::filename_cat($parent_dir, $assoc_file),
|
---|
| 213 | my $assoc_dir = (defined $dir && $dir ne "")
|
---|
| 214 | ? &util::filename_cat($dir, $assoc_file) : $assoc_file;
|
---|
| 215 | $doc_obj->associate_file($real_dir, $assoc_dir, $mime_type);
|
---|
| 216 |
|
---|
[840] | 217 | }
|
---|
| 218 | $doc_obj->delete_metadata($doc_obj->get_top_section(), "gsdlassocfile");
|
---|
[1401] | 219 |
|
---|
[863] | 220 | # add metadata passed in from elsewhere
|
---|
| 221 | $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
|
---|
[862] | 222 |
|
---|
[1244] | 223 | # do any automatic metadata extraction
|
---|
| 224 | $self->auto_extract_metadata ($doc_obj);
|
---|
| 225 |
|
---|
| 226 | # assume the document has an OID already
|
---|
[1401] | 227 |
|
---|
[840] | 228 | # process the document
|
---|
| 229 | $processor->process($doc_obj, $file);
|
---|
[862] | 230 |
|
---|
[840] | 231 | $no_docs++;
|
---|
[9853] | 232 | last if ($maxdocs > -1 && ($total_count+$no_docs) >= $maxdocs);
|
---|
[918] | 233 | last unless defined $gml && $gml =~ /\w/;
|
---|
[1244] | 234 | } # while(1) document level
|
---|
[1401] | 235 |
|
---|
[840] | 236 | return $no_docs; # no of docs processed
|
---|
[4] | 237 | }
|
---|
| 238 |
|
---|
[1244] | 239 | sub _unescape_text {
|
---|
| 240 | my ($text) = @_;
|
---|
[4] | 241 |
|
---|
[1244] | 242 | # special characters in the gml encoding
|
---|
| 243 | $text =~ s/</</g;
|
---|
| 244 | $text =~ s/>/>/g;
|
---|
| 245 | $text =~ s/"/\"/g;
|
---|
| 246 | $text =~ s/&/&/g; # this has to be last...
|
---|
| 247 |
|
---|
| 248 | return $text;
|
---|
| 249 | }
|
---|
| 250 |
|
---|
[4] | 251 | 1;
|
---|