source: trunk/gsdl/perllib/muread.pm@ 627

Last change on this file since 627 was 627, checked in by rjmcnab, 25 years ago

initial revision.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 4.0 KB
Line 
1###########################################################################
2#
3# muread.pm -- read a marked-up file
4#
5# Copyright (C) 1999 DigiLib Systems Limited, NZ
6#
7# This program is free software; you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation; either version 2 of the License, or
10# (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with this program; if not, write to the Free Software
19# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20#
21###########################################################################
22
23
24package muread;
25
26use unicode;
27use multiread;
28
29sub new {
30 my ($class) = @_;
31
32 my $self = {'filename'=>"",
33 'encoding'=>"",
34 'handle'=>"",
35 'reader'=>"",
36 'buffer'=>""};
37
38 return bless $self, $class;
39}
40
41# returns a new tag with a tag name and any options
42sub parse_tag {
43 my $self = shift (@_);
44 my ($orgtagtext) = @_;
45 my $tagtext = $orgtagtext;
46 my $newtag = {};
47 my $misformed = 0;
48
49# print STDERR "parsing \"$tagtext\"\n";
50
51 # get tag name (if there is one)
52 if ($tagtext =~ /^(\w+)/) {
53 $newtag->{'_tagname'} = $1;
54 $tagtext =~ s/^(\w+)//;
55 } else {
56 print STDERR "muread::parse_tag error - no tag name found\n";
57 }
58
59 # get the tag arguments
60 while ($tagtext =~ /\S/) {
61 $tagtext =~ s/^\s+//s;
62 if ($tagtext =~ /^(\w+)\s*=\s*\"([^\"]*)\"/s) {
63 $newtag->{$1} = (defined $2) ? $2 : "";
64 $tagtext =~ s/^\w+\s*=\s*\"[^\"]*\"//s;
65
66 } else {
67 if (!$misformed) {
68 print STDERR "muread::parse_tag error - miss-formed tag <$orgtagtext>\n";
69 $misformed = 1;
70 }
71 $tagtext =~ s/^\S+//s;
72 }
73 }
74
75 return $newtag;
76}
77
78sub read_tag_content {
79 my $self = shift (@_);
80 my ($tag) = @_;
81
82 # all tags contain a _tagname except the tag for the document
83
84 my $line = "";
85 while (1) {
86 # deal with preceeding text
87 if ($self->{'buffer'} =~ /^([^<]+)</s) {
88 # add preceeding text
89 $tag->{'_contains'} = [] unless defined $tag->{'_contains'};
90 push (@{$tag->{'_contains'}}, {'_text'=>$1});
91
92 $self->{'buffer'} =~ s/^[^<]+</</s;
93 }
94
95 if ($self->{'buffer'} =~ /^<([^>\/]+)>/s) {
96 # add info from this tag
97 my $tagtext = $1;
98 my $newtag = $self->parse_tag ($tagtext);
99 push (@{$tag->{'_contains'}}, $newtag);
100 $self->{'buffer'} =~ s/^<[^>\/]+>//s;
101
102 # deal with the contents of this tag
103 $self->read_tag_content ($newtag);
104
105 } elsif ($self->{'buffer'} =~ /^<\/([^>\/]+)>/s) {
106 my $tagname = $1;
107 $self->{'buffer'} =~ s/^<\/[^>\/]+>//s;
108
109 # check that this tag is the right tag
110 if (!defined $tag->{'_tagname'} || $tag->{'_tagname'} ne $tagname) {
111 print STDERR "muread::read_tag_content error - mismatched tag </$tagname>, " .
112 "expected </$tag->{'_tagname'}>\n";
113 } else {
114 return;
115 }
116 } elsif (defined ($line = $self->{'reader'}->read_line())) {
117 $self->{'buffer'} .= $line;
118 } else {
119 if ($self->{'buffer'} =~ /\S/) {
120 print STDERR "muread::read_tag_content error - can't parse text \"$self->{'buffer'}\"\n";
121 }
122 last;
123 }
124 }
125
126 if (defined $tag->{'_tagname'}) {
127 print STDERR "muread::read_tag_content error - eof reached before closing " .
128 "tag \"$tag->{'_tagname'}\" found\n";
129 }
130}
131
132sub read_file {
133 my $self = shift (@_);
134 ($self->{'handle'}, $self->{'filename'}, $self->{'encoding'}) = @_;
135 $self->{'encoding'} = "utf8" unless defined $self->{'encoding'};
136
137 my $doc = {};
138
139 # get reader set up
140 $self->{'reader'} = new multiread ();
141 $self->{'reader'}->set_handle ($self->{'handle'});
142 $self->{'reader'}->set_encoding ($self->{'encoding'});
143
144 # read in the file
145 $self->read_tag_content ($doc);
146
147 $self->{'handle'} = "";
148 return $doc;
149}
150
1511;
Note: See TracBrowser for help on using the repository browser.