source: trunk/niupepa/perllib/plugins/NPPlug.pm@ 3704

Last change on this file since 3704 was 3704, checked in by sjboddie, 21 years ago

* empty log message *

  • Property svn:keywords set to Author Date Id Revision
File size: 14.4 KB
Line 
1###########################################################################
2#
3# NPPlug.pm -- Plugin for the niupepa collection
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# The niupepa collection has a file structure as follows:
27# Each niupepa series has its own directory containing some/all
28# of the following:
29
30# meta.txt - file contains metadata to be associated with all documents
31# in series.
32# *.issue - each issue should have a .issue file which may or may not
33# contain metadata to associate with the issue. also contains the list
34# of filenames that make up the issue (i.e. one for each page). meta.txt
35# is read before *.issue so metadata in .issue files will override that in
36# meta.txt
37# *.commentary - the commentary of the niupepa series (1 per series)
38# text/*.txt/htm - text/html files of issue pages (1 per page) -
39# text files are expected to be either .htm or .txt (lower case).
40# images/*.gif - image files of issue pages (1 per page)
41# abstracts/*.abstract - html files of issue abstracts (1 per issue)
42
43
44package NPPlug;
45
46use BasPlug;
47use util;
48
# Establish inheritance at compile time: NPPlug derives from BasPlug.
BEGIN {
    our @ISA = ('BasPlug');
}
52
53use strict;
54
# Writes the plugin's usage message to STDERR. Called when the options
# given to the plugin cannot be parsed.
sub print_usage {
    my @usage_lines = (
        "\nIncorrect options passed to NPPlug, check your collect.cfg configuration file\n",
        "\n usage: plugin NPPlug [options]\n\n",
        " options:\n",
        " -create_log Creates a log file containing info about which portions\n",
        " of papers are missing\n",
        " -logfile Path of logfile (defaults to ./log.txt)\n\n",
    );

    # one print per line, exactly as many writes as there are lines
    foreach my $usage_line (@usage_lines) {
        print STDERR $usage_line;
    }
}
64
# Constructor. Builds a BasPlug object, fills in the 'create_log' and
# 'logfile' options from the plugin arguments, resets the statistics
# counters and reblesses the object into $class.
# Prints the usage message and dies if the arguments cannot be parsed.
sub new {
    my ($class) = @_;
    my $self = BasPlug->new();

    # @_ is handed over untouched, so it still starts with the class name
    my $parsed_ok = parsargv::parse(\@_,
        q^create_log^, \$self->{'create_log'},
        q^logfile/.*/./log.txt^, \$self->{'logfile'});
    unless ($parsed_ok) {
        print_usage();
        die "\n";
    }

    # series key -> whether a commentary was seen for it (reported by end())
    $self->{'commentaries'} = {};

    # counters reported in the logfile statistics
    foreach my $counter ('num_issues', 'num_text_pages',
                         'num_image_pages', 'num_abstracts') {
        $self->{$counter} = 0;
    }

    return bless $self, $class;
}
83
# NPPlug is not a recursive plugin, so this always returns 0.
sub is_recursive {
    my ($self) = @_;

    return 0;
}
89
# Called before any files are read. When the 'create_log' option is set
# this opens the logfile (shared LOGFILE handle, also written to by the
# other subs of this plugin) and writes a dated header to it.
# Dies if the logfile cannot be opened.
sub begin {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;

    # open up logfile
    # note that we append to logfile as building will otherwise
    # overwrite a file generated at import time
    if ($self->{'create_log'}) {
        # three-argument open so that the configured path is never
        # interpreted as a mode or a pipe
        open (LOGFILE, '>>', $self->{'logfile'})
            or die "NPPlug.pm: Couldn't open log file $self->{'logfile'}: $!\n";

        # localtime's month is 0-based and its year counts from 1900
        my ($mday, $mon, $year) = (localtime (time))[3, 4, 5];

        print LOGFILE "------------------------------------------------------------\n";
        print LOGFILE "Log start $mday/" . ($mon + 1) . "/" . (1900 + $year) . "\n";
        print LOGFILE "------------------------------------------------------------\n";
    }
}
108
# Called once all files have been read. When logging is enabled this
# reports every series for which no commentary was seen, writes the
# collected statistics to the logfile and closes it.
sub end {
    my $self = shift (@_);

    # nothing to report unless a logfile was requested
    return unless $self->{'create_log'};

    # a series has a true value here once its commentary was processed
    my @series = keys %{$self->{'commentaries'}};
    my @lacking = grep { !$self->{'commentaries'}->{$_} } @series;

    # record missing commentaries in logfile
    foreach my $key (@lacking) {
        print LOGFILE "Commentary missing for series $key\n";
    }

    my $numseries = scalar (@series);
    my $numcommentaries = $numseries - scalar (@lacking);

    print LOGFILE "\n\nStatistics:\n";
    print LOGFILE "series: $numseries\n";
    print LOGFILE "commentaries: $numcommentaries\n";
    print LOGFILE "issues: $self->{'num_issues'}\n";
    print LOGFILE "abstracts: $self->{'num_abstracts'}\n";
    print LOGFILE "text pages: $self->{'num_text_pages'}\n";
    print LOGFILE "image pages: $self->{'num_image_pages'}\n";

    # close logfile
    close LOGFILE;
}
138
# Processes one file from the import directory.
# Returns the number of documents processed; 0 for the text, images and
# abstracts directories and for meta.txt; undef for anything that is not
# an existing .issue or .commentary file.
# Note that $base_dir might be "" and that $file might include directories.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;

    my $filename = &util::filename_cat($base_dir, $file);

    # we don't want RecPlug to go recursing into the text, images or
    # abstracts directories
    if (-d $filename && $filename =~ /(abstracts|images|text)/) {
        return 0;
    }

    # meta.txt is read while issues are processed, not on its own
    if ($filename =~ /meta\.txt$/i) {
        return 0;
    }

    # only existing .issue and .commentary files are handled here
    if ($filename !~ /\.(issue|commentary)$/i || !(-e $filename)) {
        return undef;
    }

    $self->{'verbosity'} = $processor->{'verbosity'};
    if ($self->{'verbosity'}) {
        print STDERR "NPPlug: processing $filename\n";
    }

    # split into directory and leaf name; the series key is the leaf
    # name up to the first '.' or '_'
    my ($dir, $leaf) = $filename =~ /^(.*?)([^\/\\]*)$/;
    my ($issuekey) = $leaf =~ /^([^\.\_]*)/;

    # commentary
    return $self->process_commentary ($filename, $issuekey, $dir, $leaf, $processor)
        if $filename =~ /\.commentary$/i;

    # remember the series so that end() can report a missing commentary
    if (!defined $self->{'commentaries'}->{$issuekey}) {
        $self->{'commentaries'}->{$issuekey} = 0;
    }

    # the abstract (if any) lives in abstracts/<issue name>.abstract
    my ($stem) = $leaf =~ /^([^\.]*)\.issue/i;
    my $abstractfile = $stem . ".abstract";
    my $afile = &util::filename_cat($dir, "abstracts", $abstractfile);
    my $hasabstract = (-e $afile) ? 1 : 0;
    my $abstractOID = $hasabstract ? $stem . "abstract" : undef;

    # process the .issue file
    my %meta = ();
    my $numprocessed = 0;
    $numprocessed += $self->process_issue ($filename, $issuekey, $dir, $leaf,
                                           $abstractOID, $processor, \%meta);

    # process abstract of this issue
    $numprocessed += $self->process_abstract ($afile, $issuekey, $dir,
                                              $abstractfile, $processor, \%meta)
        if $hasabstract;

    return $numprocessed;
}
192
# Builds the document for one .issue file and hands it to $processor.
#   $filename  - full path of the .issue file
#   $issuekey  - series key, used for the cover images
#   $dir       - directory containing the .issue file
#   $file      - leaf name of the .issue file
#   $abstract  - OID of the issue's abstract document, or undef if none
#   $processor - object whose process() method receives the document
#   $meta      - hash ref that is filled with the <name>value metadata
#                found in the .issue file
# Each non-metadata line of the .issue file names one page, which
# becomes a section of the document. Always returns 1; dies if the
# .issue file cannot be opened.
sub process_issue {
    my $self = shift (@_);
    my ($filename, $issuekey, $dir, $file, $abstract, $processor, $meta) = @_;

    $self->{'num_issues'} ++;
    # explicit method call: this package defines its own new(), so the
    # indirect-object form "new doc (...)" is best avoided
    my $doc_obj = doc->new ($file, "indexed_doc");
    my $topsection = $doc_obj->get_top_section();
    $self->associate_cover_images ($doc_obj, $dir, $issuekey);
    $doc_obj->set_utf8_metadata_element ($topsection, 'Title', $self->get_title_string($file));
    $doc_obj->set_utf8_metadata_element ($topsection, 'abstract', $abstract) if defined $abstract;
    $self->set_main_metadata ($doc_obj, $dir);

    # process issue's pdf if one exists
    my ($pdffile) = $file =~ /^([^\.]*)\.issue/i;
    $pdffile .= ".pdf";
    $pdffile = &util::filename_cat($dir, "pdf", $pdffile);
    if (-e $pdffile) {
        $doc_obj->set_utf8_metadata_element ($topsection, "haspdf", "1");
        $doc_obj->associate_file($pdffile, "paper.pdf");
    } else {
        $doc_obj->set_utf8_metadata_element ($topsection, "haspdf", "0");
    }

    # lexical handle + three-argument open: the handle cannot clash with
    # other code and the filename is never interpreted as a mode
    open (my $issue_fh, '<', $filename) or die "couldn't open $filename: $!\n";
    while (defined (my $line = <$issue_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        if ($line =~ /^<([^>]*)>(.*?)\s*$/) {
            # copy the captures before calling out, $1 and $2 are globals
            my ($name, $value) = ($1, $2);
            $doc_obj->set_utf8_metadata_element ($topsection, $name, $value);
            $meta->{$name} = $value;
        } else {
            # should be a section name
            $line =~ s/^\s+//;
            $line =~ s/\s+$//;
            my ($pagenum) = $line =~ /([^_]*)$/;

            # sections are appended rather than named after the page,
            # as pagenum may be something like "cover"
            my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));

            $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);
            $self->process_text ($dir, $line, $doc_obj, $cursection);
            $self->process_images ($dir, $line, $doc_obj, $cursection);
        }
    }
    close ($issue_fh);

    # the OID of an issue is its file name without the .issue extension
    $file =~ s/\.issue//i;
    $doc_obj->set_OID ($file);
    $processor->process ($doc_obj);
    return 1;
}
246
# Associates the page images of one page with its section.
#   $dir        - directory of the series
#   $page       - page name as listed in the .issue file
#   $doc_obj    - document being built
#   $cursection - section of $doc_obj that represents the page
# Looks for images/<page>.gif (fullsize) and images/<page>_p.gif
# (preview). "hasimg" and "hasprevimg" are set to "1" or "0" accordingly;
# missing images are additionally noted in the logfile when logging is on.
sub process_images {
    my $self = shift (@_);
    my ($dir, $page, $doc_obj, $cursection) = @_;

    my $filename = &util::filename_cat ($dir, "images", $page);

    if (-e "$filename.gif") {
        $self->{'num_image_pages'} ++;
        $doc_obj->set_utf8_metadata_element ($cursection, "hasimg", "1");
        $doc_obj->set_utf8_metadata_element ($cursection, "Source", $page);
        $doc_obj->associate_file("$filename.gif", "$page.gif", "image/gif");
    } else {
        # the metadata must not depend on whether logging is enabled;
        # only the log message does
        $doc_obj->set_utf8_metadata_element ($cursection, "hasimg", "0");
        print LOGFILE "no fullsize image file for $page\n" if $self->{'create_log'};
    }

    if (-e "${filename}_p.gif") {
        $doc_obj->set_utf8_metadata_element ($cursection, "hasprevimg", "1");
        $doc_obj->set_utf8_metadata_element ($cursection, "Source", $page);
        $doc_obj->associate_file("${filename}_p.gif", "${page}_p.gif", "image/gif");
    } else {
        $doc_obj->set_utf8_metadata_element ($cursection, "hasprevimg", "0");
        print LOGFILE "no preview image file for $page\n" if $self->{'create_log'};
    }
}
272
# Adds the text of one page to its section.
#   $dir        - directory of the series
#   $page       - page name as listed in the .issue file
#   $doc_obj    - document being built
#   $cursection - section of $doc_obj that represents the page
# The text is taken from text/<page>.htm if that exists, otherwise from
# text/<page>.txt. A page without text is noted in the logfile when
# logging is on.
sub process_text {
    my $self = shift (@_);
    my ($dir, $page, $doc_obj, $cursection) = @_;

    my $stem = &util::filename_cat ($dir, "text", $page);

    # only the first existing candidate is read; .htm wins over .txt
    my $text;
    foreach my $ext ("htm", "txt") {
        next unless -e "$stem.$ext";
        $text = $self->get_text ("$stem.$ext");
        last;
    }

    unless (defined $text) {
        print LOGFILE "no txt or htm file for $page\n" if $self->{'create_log'};
        return;
    }

    $self->{'num_text_pages'} ++;
    $doc_obj->add_utf8_text ($cursection, $text);
}
292
# Builds the "Description" document for the abstract of one issue and
# hands it to $processor.
#   $filename  - full path of the .abstract file
#   $issuekey  - series key, used for the cover images
#   $dir       - directory of the series
#   $file      - leaf name of the .abstract file
#   $processor - object whose process() method receives the document
#   $meta      - hash ref of metadata collected from the issue's .issue file
# Returns 1 if a document was produced, 0 if the abstract could not be read.
sub process_abstract {
    my $self = shift (@_);
    my ($filename, $issuekey, $dir, $file, $processor, $meta) = @_;

    my $text = $self->get_text ($filename);
    if (!defined $text) {
        print LOGFILE "abstract file $filename doesn't exist\n" if $self->{'create_log'};
        return 0;
    }

    $self->{'num_abstracts'} ++;
    my $doc_obj = doc->new ($file, "indexed_doc");
    my $top = $doc_obj->get_top_section();
    $self->associate_cover_images ($doc_obj, $dir, $issuekey);
    $doc_obj->set_utf8_metadata_element ($top, 'Title', $self->get_title_string($file));
    $self->set_main_metadata ($doc_obj, $dir);

    # the abstract carries the same metadata as its issue; applied after
    # meta.txt so that it takes precedence
    foreach my $name (keys %$meta) {
        $doc_obj->set_utf8_metadata_element ($top, $name, $meta->{$name});
    }

    $doc_obj->set_utf8_metadata_element ($top, "doctype", "Description");
    $doc_obj->add_utf8_text ($top, $text);

    # OID is the file name with ".abstract" replaced by "abstract"
    $file =~ s/\.abstract//i;
    $doc_obj->set_OID ($file . "abstract");
    $processor->process ($doc_obj);
    return 1;
}
319
# Builds the "Commentary" document of a series and hands it to $processor.
#   $filename  - full path of the .commentary file
#   $issuekey  - series key; also the stem of the document's OID
#   $dir       - directory of the series
#   $file      - leaf name of the .commentary file
#   $processor - object whose process() method receives the document
# Returns 1 if a document was produced, 0 if the file could not be read.
sub process_commentary {
    my $self = shift (@_);
    my ($filename, $issuekey, $dir, $file, $processor) = @_;

    my $text = $self->get_text ($filename);
    if (!defined $text) {
        return 0;
    }

    # note that this series has its commentary (checked again in end())
    $self->{'commentaries'}->{$issuekey} = 1;

    my $doc_obj = doc->new ($file, "indexed_doc");
    my $top = $doc_obj->get_top_section();

    $self->associate_cover_images ($doc_obj, $dir, $issuekey);
    $doc_obj->set_utf8_metadata_element ($top, 'Title', "_commentary_");
    $self->set_main_metadata ($doc_obj, $dir);
    $doc_obj->set_utf8_metadata_element ($top, "doctype", "Commentary");
    $doc_obj->add_utf8_text ($top, $text);

    $doc_obj->set_OID ($issuekey . "commentary");
    $processor->process ($doc_obj);
    return 1;
}
340
# Associates the two cover images of a series, <dir>/<issuekey>on.gif and
# <dir>/<issuekey>of.gif, with $doc_obj as <issuekey>/coveron.gif and
# <issuekey>/coverof.gif. The files are not checked for existence here.
sub associate_cover_images {
    my $self = shift (@_);
    my ($doc_obj, $dir, $issuekey) = @_;

    my $cover = &util::filename_cat ($dir, $issuekey);
    foreach my $state ("on", "of") {
        $doc_obj->associate_file ($cover . $state . ".gif",
                                  $issuekey . "/cover" . $state . ".gif",
                                  "image/gif");
    }
}
349
# Reads the meta.txt file in $dir and sets the metadata it contains on
# the top section of $doc_obj. Lines are expected to be formatted
# '<metaname>metavalue'; lines without any word character are skipped and
# other lines are reported on STDERR when verbosity is on.
# Does nothing if meta.txt does not exist or cannot be opened.
sub set_main_metadata {
    my $self = shift (@_);
    my ($doc_obj, $dir) = @_;

    my $metafile = &util::filename_cat ($dir, "meta.txt");
    return unless (-e $metafile);

    # lexical handle + three-argument open: the handle is private to this
    # call and the path is never interpreted as a mode
    my $meta_fh;
    if (!open ($meta_fh, '<', $metafile)) {
        print STDERR "NPPlug: Couldn't read $metafile\n" if $self->{'verbosity'};
        return;
    }

    my $cursection = $doc_obj->get_top_section();
    while (defined (my $line = <$meta_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        if ($line =~ /<([^>]*)>(.*)$/) {
            # copy the captures before calling out, $1 and $2 are globals
            my ($name, $value) = ($1, $2);

            # note we're using set_metadata_element (not add_metadata_element)
            # this will override any previously set metadata of the same name
            $doc_obj->set_utf8_metadata_element ($cursection, $name, $value);
        } elsif ($self->{'verbosity'}) {
            print STDERR "NPPlug: Badly formatted line in $metafile\n";
            print STDERR "meta.txt lines should be formatted '<metaname>metavalue'\n";
        }
    }
    close ($meta_fh);
}
378
# Reads $filename and returns its text, or undef if the file cannot be
# opened (a warning goes to STDERR when verbosity is on).
#
# .htm, .commentary and .abstract files are treated as html: everything
# up to and including the first <body ...> tag is discarded (if there is
# no body tag the whole file is kept) and html, head, body, font and
# closing p tags are removed.
# In any other file each line without a word character is replaced by a
# "<p>" line.
# Commentaries and abstracts are returned as they are; everything else
# is passed through unicode::ascii2utf8.
sub get_text {
    my $self = shift (@_);
    my ($filename) = @_;

    # three-argument open on a lexical handle: the name is used verbatim
    # (no whitespace stripping, no mode characters) and the handle cannot
    # clash with other code
    my $fh;
    if (!open ($fh, '<', $filename)) {
        print STDERR "NPPlug: Warning: get_text() couldn't open $filename\n"
            if $self->{'verbosity'};
        return undef;
    }

    my $text = "";

    if ($filename !~ /\.(htm|commentary|abstract)$/i) {
        # plain text: empty lines become paragraph breaks
        while (defined (my $line = <$fh>)) {
            $line = "<p>\n" unless $line =~ /\w/;
            $text .= $line;
        }
        close ($fh);
        # a few extended ascii characters have snuck through
        # in some text files so we need to convert them to utf8
        return &unicode::ascii2utf8(\$text);
    }

    # html: lines seen before a body tag are kept aside and only used if
    # no body tag turns up at all
    my $savedtext = "";
    my $foundbody = 0;
    while (defined (my $line = <$fh>)) {
        if ($line =~ s/.*?<body[^>]*>//i) {
            $foundbody = 1;
        }
        $line =~ s/(<\/?html[^>]*>|<\/?head[^>]*>|<\/p>|<\/?font[^>]*>|<\/?body[^>]*>)//ig;
        if ($foundbody) {
            $text .= $line;
        } else {
            $savedtext .= $line;
        }
    }
    close ($fh);
    $text = $savedtext unless $foundbody;

    # commentaries and abstracts should already be utf8
    return $text if $filename =~ /\.(commentary|abstract)$/i;

    # a few extended ascii characters have snuck through
    # in some text files so we need to convert them to utf8
    return &unicode::ascii2utf8(\$text);
}
428
# Builds the title of an issue or abstract from its file name, which is
# expected to look like <series>_<vol>_<num>.issue (or .abstract).
# Returns "_vol_ <vol>, _num_ <num>"; a part that is missing or contains
# no word character is left out, so the result may be "".
sub get_title_string {
    my $self = shift (@_);
    my ($filename) = @_;

    $filename =~ s/\.(issue|abstract)$//i;

    # the first field (the series) is not part of the title
    my ($vol, $num) = (split (/\_/, $filename))[1, 2];

    my @parts = ();
    push (@parts, "_vol_ $vol") if defined ($vol) && $vol =~ /\w/;
    push (@parts, "_num_ $num") if defined ($num) && $num =~ /\w/;

    return join (", ", @parts);
}
443
4441;
Note: See TracBrowser for help on using the repository browser.