root/trunk/cstr/perllib/plugins/PrePlug.pm @ 891

Revision 891, 8.3 KB (checked in by sjboddie, 20 years ago)

Initial revision

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# PrePlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# plugin that processes simple html files like those output
27# by prescript when converting postscript to html.
28# prescript output has <p> tags separating paragraphs
29# and <!--End Of Page--> tags separating pages.
30# it may also have <!--Page No--> tags at top or bottom of
31# some or all pages.
32
33# if a .html1 version of a .html file exists then that version
34# will be used so that the author name can be extracted
35# (.html1 files were generated by software which automatically
36# extracts author names from .html files).
37
38# also looks for a .info file containing metadata (as used by
39# the cstr collection)
40
41package PrePlug;
42
43use BasPlug;
44use sorttools;
45
46sub BEGIN {
47    @ISA = ('BasPlug');
48}
49
50sub new {
51    my ($class) = @_;
52    $self = new BasPlug ();
53
54    return bless $self, $class;
55}
56
57sub is_recursive {
58    my $self = shift (@_);
59
60    return 0; # this is not a recursive plugin
61}
62
63
64# return number of files processed, undef if can't process
65# Note that $base_dir might be "" and that $file might
66# include directories
67sub read {
68    my $self = shift (@_);
69    my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
70
71    my $filename = &util::filename_cat($base_dir, $file);
72
73    return 0 if ($filename =~ /\.(html1|html1\.gz|info|gif)/);
74
75    return undef unless ($filename =~ /\.(html(\.gz)?)$/i && (-e $filename));
76
77    my $gz = 0;
78    $gz = 1 if (defined $2 && $2 eq ".gz");
79
80    my $filename1 = $filename;
81    if ($gz) {
82    $filename1 =~ s/\.html\.gz$/\.html1\.gz/;
83    } else {
84    $filename1 .= "1";
85    }
86   
87    $filename = $filename1 if (-e $filename1);
88
89    print STDERR "PrePlug: processing $filename\n" if $processor->{'verbosity'};
90
91    # create a new document
92    my $doc_obj = new doc ($file, "indexed_doc");
93
94    if ($gz) {
95    open (FILE, "zcat $filename |") || die "PrePlug::read - zcat can't open $filename\n";
96    } else {
97    open (FILE, $filename) || die "PrePlug::read - can't open $filename\n";
98    }
99    my $cursection = $doc_obj->get_top_section();
100
101    my $text = "";
102    my $line = "";
103    my $pagenumber = 1;
104    my @creators = ();
105    my $numlines = 0;
106    while (defined ($line = <FILE>)) {
107    if ($filename =~ /html1$/) {
108        while ($text =~ s/<_author_search_\([^\)]*\)>([^<]*)<\/a>/$1/i) {
109        push (@creators, $1);
110        }
111    }
112
113    # numlines sorts out documents that just have a single <!-- End of Page--> tag at the end
114    if ($line =~ /<!--End\s+Of\s+Page-->/i && $numlines < 200) {
115
116        $numlines = -1000;
117        $cursection =
118        $doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section()));
119
120        $text =~ s/<!--Page No-->\s*<p>\s*<center>\s*<b>\s*(\d+)<\/b>\s*<\/center>\s*//i;
121#       if (defined $1 && $1 != $pagenumber) {
122#       print STDERR "PrePlug Warning: Pagenumbers don't line up in $filename\n";
123#       }
124        $doc_obj->add_text ($cursection, $text);
125        $doc_obj->add_metadata ($cursection, "Title", $pagenumber);
126        $pagenumber ++;
127        $text = "";
128    } else {
129        $text .= $line;
130        $numlines ++;
131    }
132    }
133    close FILE;
134
135    if ($cursection eq $doc_obj->get_top_section()) {
136    # there weren't any <!--End Of Page--> tags
137   
138    # see if there were any <!--Page No--> tags to split on
139    if ($text =~ /<!--Page No-->/i) {
140        while ($text =~ s/^(.*?)<!--Page No-->\s*<p>\s*<center>\s*<b>\s*\d+<\/b>\s*<\/center>\s*//i) {
141        $cursection =
142            $doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section()));
143        $doc_obj->add_text ($cursection, $text);
144        $doc_obj->add_metadata ($cursection, "Title", $pagenumber);
145        $pagenumber ++;
146        }
147    }
148    # otherwise we'll just have to split pages on set number of lines
149    else {
150        my $pagetext = "";
151        my $line = "";
152        my $count = 0;
153        while (length ($text) && $text =~ s/^(.*?)(\n|$)//) {
154        $line = $1;
155        $line = "" unless defined $line;
156        $pagetext .= $line . "\n";
157        if (($count >= 50 && $line !~ /\w/) || $count == 80) {
158            $cursection =
159            $doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section()));
160            $doc_obj->add_text ($cursection, $pagetext);
161            $doc_obj->add_metadata ($cursection, "Title", $pagenumber);
162            $pagetext = "";
163            $pagenumber ++;
164            $count = 0;
165        } else {
166            $count ++;
167        }
168        }
169    }
170       
171    }
172       
173    # final section
174    if ($text =~ /\w/) {
175    $cursection =
176        $doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section()));
177   
178    $text =~ s/<!--Page No-->\s*<p>\s*<center>\s*<b>\s*(\d+)<\/b>\s*<\/center>\s*//;
179#   if (defined $1 && $1 != $pagenumber) {
180#       print STDERR "PrePlug Warning: Pagenumbers don't line up in $filename\n";
181#   }
182    $doc_obj->add_text ($cursection, $text);
183    $doc_obj->add_metadata ($cursection, "Title", $pagenumber);
184    }
185
186    # add meta data for top level of document
187    map {$doc_obj->add_metadata ($doc_obj->get_top_section(), "Creator", $_)} @creators;
188
189    my ($filesuff) = $filename =~ /^(.*?)\.html1?/;
190    if (!-e $filesuff . ".info") {
191    print STDERR "Preplug Warning: $filename has no corresponding .info file\n";
192    } else {
193    my ($dir) = $filesuff =~ /^(.*?)~?[^~]*$/;
194    open (INFO, $filesuff . ".info") || die "PrePlug::read - can't open $filesuff.info\n";
195    my $line = "";
196    while (defined ($line = <INFO>)) {
197        chop $line;
198        my ($key, $value) = $line =~ /^<([^>]*)>(.*)$/;
199        next if $key =~ /^(pages|compressedsize|\/?info|size)$/i;
200        next if $value !~ /\w/;
201        if ($key =~ /^abstract$/i) {
202        $key = "Description";
203        } elsif ($key =~ /^url$/i) {
204        $key = "Source";
205        } elsif ($key =~ /^filedate$/i) {
206        $key = "Date";
207        my ($day, $month, $year) = split /\//, $value;
208        $value = &sorttools::format_date ($day, $month, $year);
209        } elsif ($key =~ /^transferdate$/i) {
210        my ($day, $month, $year) = split /\//, $value;
211        $value = &sorttools::format_date ($day, $month, $year);
212        } elsif ($key =~ /^facsimiles$/i) {
213        $value =~ s/^\s+//;
214        $value =~ s/\s+$//;
215        my @facsimiles = split /\s+/, $value;
216        $value = join ",", @facsimiles;
217        foreach $facsimile (@facsimiles) {
218            # assume images are in the same directory
219            my $imagefile = $dir . "~" . $facsimile;
220            if (-e $imagefile) {
221            my ($imagetype) = $facsimile =~ /\.([^\.]*)$/;
222            $doc_obj->associate_file($imagefile, $facsimile, "image/" . $imagetype);
223            } else {
224            print STDERR "PrePlug: Warning - facsimile file $imagefile doesn't exist\n";
225            next;
226            }
227        }
228        } elsif ($key =~ /^figures$/i) {
229        $value =~ s/^\s+//;
230        $value =~ s/\s+$//;
231        my @figures = split /\s+/, $value;
232        foreach $figure (@figures) {
233            # assume images are in the same directory
234            my $imagefile = $dir . "~" . $figure;
235            if (-e $imagefile) {
236            my ($imagetype) = $figure =~ /\.([^\.]*)$/;
237            $doc_obj->associate_file($imagefile, $figure, "image/" . $imagetype);
238            } else {
239            print STDERR "PrePlug: Warning - figure file $imagefile doesn't exist\n";
240            next;
241            }
242            $value = join ",", @figures;
243        }
244        }
245        $doc_obj->add_metadata ($doc_obj->get_top_section(), $key, $value);
246    }
247    close INFO;
248    }
249
250    foreach $field (keys(%$metadata)) {
251    # $metadata->{$field} may be an array reference
252    if (ref ($metadata->{$field}) eq "ARRAY") {
253        map {
254        $doc_obj->add_metadata ($doc_obj->get_top_section(), $field, $_);
255        } @{$metadata->{$field}};
256    } else {
257        $doc_obj->add_metadata ($doc_obj->get_top_section(), $field, $metadata->{$field});
258    }
259    }
260
261    # add OID
262    $doc_obj->set_OID ();
263
264    # process the document
265    $processor->process($doc_obj);
266
267    return 1; # processed the file
268}
269
2701;
271
272
273
274
275
276
277
278
279
280
281
Note: See TracBrowser for help on using the browser.