# plugin which process an HTML book directory package GMLPlug; use html; use BasPlug; use util; use doc; sub BEGIN { @ISA = ('BasPlug'); } sub new { my ($class) = @_; $self = new BasPlug (); return bless $self, $class; } sub is_recursive { my $self = shift (@_); return 0; # this is not a recursive plugin } sub _unescape_text { my ($text) = @_; # special characters in the gml encoding $text =~ s/<//g; $text =~ s/"/\"/g; $text =~ s/&/&/g; # this has to be last... return $text; } # return number of files processed, undef if can't process # Note that $base_dir might be "" and that $file might # include directories sub read { my $self = shift (@_); my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_; my $fullname = &util::filename_cat ($base_dir, $file); # see if this is a gml book return undef unless (-f $fullname && $fullname =~ /\.gml$/i); my ($parent_dir) = $fullname =~ /^(.*)\/[^\/]+.gml$/; # create a new document my $doc_obj = new doc (); my $section = $doc_obj->get_top_section(); print STDERR "GMLPlug: processing $file\n"; # read in the document my $gml = ""; my $line = ""; if (!open (INFILE, $fullname)) { print STDERR "GMLPlug::read - couldn't read $fullname\n"; return undef; } while (defined ($line = )) { $gml .= $line; } close (INFILE); # process the document my $firstsection = 1; while ($gml =~ /\S/) { if ($gml =~ s/^\s*]*)>(.*?)(<\/?gsdlsection)/$3/is) { my $tags = $1; $tags = "" unless defined $tags; my $text = &_unescape_text($2); # create the section (unless this is the first section) if (!$firstsection) { $tags =~ s/gsdlnum\s*=\s*\"?(\d+)\"?//i; if (defined $1) { $section .= ".$1"; $doc_obj->create_named_section($section); } else { $section = $doc_obj->insert_section($doc_obj->get_end_child($section)); } } $firstsection = 0; # add the tags while ((defined $tags) && ($tags =~ s/^\s*(\w+)\s*=\s*\"([^\"]*)\"//)) { $doc_obj->add_utf8_metadata($section, $1 , &_unescape_text($2)) if (defined $1 and defined $2); } # add the text $doc_obj->add_utf8_text($section, $text) if ((defined $text) && ($text ne "")); } elsif ($gml =~ s/^\s*<\/gsdlsection>//) { $section = $doc_obj->get_parent_section ($section); } else { print STDERR "GMLPlug::read - error in file $fullname\n"; print STDERR "text: \"$gml\"\n"; last; } } # add the associated files $assoc_files = $doc_obj->get_metadata($doc_obj->get_top_section(), "gsdlassocfile"); my ($assoc_file_info); foreach $assoc_file_info (@$assoc_files) { my ($assoc_file, $mime_type) = split (":", $assoc_file_info); $doc_obj->associate_file(&util::filename_cat($parent_dir, $assoc_file), $assoc_file, $mime_type); } $doc_obj->delete_metadata($doc_obj->get_top_section(), "gsdlassocfile"); # add metadata foreach $field (keys(%$metadata)) { # $metadata->{$field} may be an array reference if (ref ($metadata->{$field}) eq "ARRAY") { map { $doc_obj->add_metadata ($doc_obj->get_top_section(), $field, $_); } @{$metadata->{$field}}; } else { $doc_obj->add_metadata ($doc_obj->get_top_section(), $field, $metadata->{$field}); } } # assume the document has an OID # process the document $processor->process($doc_obj, $file); return 1; # processed the file } 1;