source: trunk/niupepa/perllib/plugins/NPPlug.pm@ 13310

Last change on this file since 13310 was 13310, checked in by nzdl, 17 years ago

modify the new method

  • Property svn:keywords set to Author Date Id Revision
File size: 15.4 KB
Line 
1###########################################################################
2#
3# NPPlug.pm -- Plugin for the niupepa collection
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# The niupepa collection has a file structure as follows:
27# Each niupepa series has its own directory containing some/all
28# of the following:
29
30# meta.txt - file contains metadata to be associated with all documents
31# in series.
32# *.issue - each issue should have a .issue file which may or may not
33# contain metadata to associate with the issue. also contains the list
34# of filenames that make up the issue (i.e. one for each page). meta.txt
35# is read before *.issue so metadata in .issue files will override that in
36# meta.txt
37# *.commentary - the commentary of the niupepa series (1 per series)
38# text/*.txt/htm - text/html files of issue pages (1 per page) -
39# text files are expected to be either .htm or .txt (lower case).
40# images/*.gif - image files of issue pages (1 per page)
41# abstracts/*.abstract - html files of issue abstracts (1 per issue)
42
43
44package NPPlug;
45
46use BasPlug;
47use util;
48use strict;
49use unicode;
50
51sub BEGIN {
52 @NPPlug::ISA = ('BasPlug');
53}
54
55
56my $arguments =
57 [
58 { 'name' => "create_log",
59 'desc' => "To create a log file",
60 'type' => "flag" },
61 { 'name' => "logfile",
62 'desc' => "The log file path",
63 'type' => "string",
64 'deft' => "./log.txt"}
65 ];
66
67my $options = { 'name' => "NPPlug",
68 'desc' => "{NPPlug.desc}",
69 'abstract' => "no",
70 'inherits' => "yes",
71 'args' => $arguments };
72
73sub new {
74 my ($class) = shift(@_);
75
76 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
77 push(@$pluginlist, $class);
78
79 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
80 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
81
82 my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);
83
84 $self->{'commentaries'} = {};
85 $self->{'num_issues'} = 0;
86 $self->{'num_text_pages'} = 0;
87 $self->{'num_image_pages'} = 0;
88 $self->{'num_abstracts'} = 0;
89 return bless $self, $class;
90}
91
92
93sub get_default_process_exp {
94 my $self = shift (@_);
95
96 # the last option is an attempt to encode the concept of an html query ...
97 return q^(?i)(\.issue)$^;
98}
99
100
101sub is_recursive {
102 my $self = shift (@_);
103
104 return 0; # this is not a recursive plugin
105}
106
107sub begin {
108 my $self = shift (@_);
109 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
110
111 # open up logfile
112 # note that we append to logfile as building will otherwise
113 # overwrite a file generated at import time
114 if ($self->{'create_log'}) {
115 open (LOGFILE, ">>$self->{'logfile'}") || die
116 "NPPlug.pm: Couldn't open log file $self->{'logfile'}\n";
117
118 my @time = localtime (time);
119
120 print LOGFILE "------------------------------------------------------------\n";
121 print LOGFILE "Log start $time[3]/$time[4]/" . (1900 + $time[5]) . "\n";
122 print LOGFILE "------------------------------------------------------------\n";
123 }
124}
125
126sub end {
127 my $self = shift (@_);
128
129 if ($self->{'create_log'}) {
130 my $numseries = 0;
131 my $numcommentaries = 0;
132
133 # record missing commentaries in logfile
134 foreach my $key (keys %{$self->{'commentaries'}}) {
135 $numseries ++;
136 if (!$self->{'commentaries'}->{$key}) {
137 print LOGFILE "Commentary missing for series $key\n";
138 } else {
139 $numcommentaries ++;
140 }
141 }
142
143 print LOGFILE "\n\nStatistics:\n";
144 print LOGFILE "series: $numseries\n";
145 print LOGFILE "commentaries: $numcommentaries\n";
146 print LOGFILE "issues: $self->{'num_issues'}\n";
147 print LOGFILE "abstracts: $self->{'num_abstracts'}\n";
148 print LOGFILE "text pages: $self->{'num_text_pages'}\n";
149 print LOGFILE "image pages: $self->{'num_image_pages'}\n";
150
151 # close logfile
152 close LOGFILE;
153 }
154}
155
156# return number of files processed, undef if can't process
157# Note that $base_dir might be "" and that $file might
158# include directories
159sub read {
160 my $self = shift (@_);
161 my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
162
163 my $filename = &util::filename_cat($base_dir, $file);
164
165 # we don't want RecPlug to go recursing into the text, images or
166 # abstracts directories
167 return 0 if (-d $filename && $filename =~ /(abstracts|images|text)/);
168
169 return 0 if $filename =~ /meta\.txt$/i;
170
171 return undef unless ($filename =~ /\.(issue|commentary|gif)$/i && (-e $filename));
172
173 $self->{'verbosity'} = $processor->{'verbosity'};
174 print STDERR "NPPlug: processing $filename\n" if $self->{'verbosity'};
175
176 my ($dir);
177 ($dir, $file) = $filename =~ /^(.*?)([^\/\\]*)$/;
178 my ($issuekey) = $file =~ /^([^\.\_]*)/;
179
180 if ($filename =~ /\.commentary$/i) {
181 # commentary
182 $self->{'num_processed'}
183 += $self->process_commentary ($filename, $issuekey, $dir, $file, $processor);
184 return 1;
185 }
186
187 if ($filename =~ /(.*)on\.gif$/i) {
188 my $root = $1;
189 # create surrogate document to store cover image (on and off) for
190 # newspaper
191
192 $issuekey =~ s/on$//;
193
194 my $doc_obj = new doc ($dir, "indexed_doc");
195 my $cover = &util::filename_cat ($dir, $issuekey);
196
197 $self->associate_cover_images ($doc_obj, $cover, "" );
198 $doc_obj->set_OID ($issuekey);
199 $processor->process ($doc_obj);
200
201 return 0;
202 }
203 if ($filename =~ /(.*)of\.gif$/i) {
204 # block it
205 return 0;
206 }
207 if ($filename =~ /\.gif$/i) {
208 # Let some other plugin have a go
209 return undef;
210 }
211
212 my $numprocessed = 0;
213 $self->{'commentaries'}->{$issuekey} = 0
214 unless defined $self->{'commentaries'}->{$issuekey};
215
216 my ($abstractfile) = $file =~ /^([^\.]*)\.issue/i;
217 my $abstractOID = $abstractfile . "abstract";
218 $abstractfile .= ".abstract";
219 my $afile = &util::filename_cat($dir, "abstracts", $abstractfile);
220 my $hasabstract = 0;
221 if (-e $afile) {$hasabstract = 1;}
222 else {$abstractOID = undef;}
223
224 # process the .issue file
225 my %meta = ();
226 $numprocessed += $self->process_issue ($filename, $issuekey, $dir, $file,
227 $abstractOID, $processor, \%meta);
228
229 # process abstract of this issue
230 if ($hasabstract) {
231 $numprocessed += $self->process_abstract ($afile, $issuekey, $dir,
232 $abstractfile, $processor, \%meta);
233 }
234
235 $self->{'num_processed'} += $numprocessed;
236
237
238
239 return 1;
240}
241
242sub process_issue {
243 my $self = shift (@_);
244 my ($filename, $issuekey, $dir, $file, $abstract, $processor, $meta) = @_;
245
246 $self->{'num_issues'} ++;
247 my $doc_obj = new doc ($file, "indexed_doc");
248 my $topsection = $doc_obj->get_top_section();
249 $self->associate_cover_images ($doc_obj, $dir, $issuekey);
250 $doc_obj->set_utf8_metadata_element ($topsection, 'Title', $self->get_title_string($file));
251 $doc_obj->set_utf8_metadata_element ($topsection, 'abstract', $abstract) if defined $abstract;
252 $self->set_main_metadata ($doc_obj, $dir);
253
254 my $com_filename = &util::filename_cat($dir,"$issuekey.commentary");
255 if (-e $com_filename) {
256 $doc_obj->set_utf8_metadata_element ($topsection, "hascom", "1");
257 }
258
259 # process issue's pdf if one exists
260 my ($pdffile) = $file =~ /^([^\.]*)\.issue/i;
261 $pdffile .= ".pdf";
262 $pdffile = &util::filename_cat($dir, "pdf", $pdffile);
263 if (-e $pdffile) {
264 $doc_obj->set_utf8_metadata_element ($topsection, "haspdf", "1");
265 $doc_obj->associate_file($pdffile, "paper.pdf");
266 } else {
267 $doc_obj->set_utf8_metadata_element ($topsection, "haspdf", "0");
268 }
269
270 open (ISSUEFILE, $filename) || die "couldn't open $filename\n";
271 my $line = "";
272 while (defined ($line = <ISSUEFILE>)) {
273 next unless $line =~ /\w/;
274 chomp $line;
275 if ($line =~ /^<([^>]*)>(.*?)\s*$/) {
276 $doc_obj->set_utf8_metadata_element ($topsection, $1, $2);
277 $meta->{$1} = $2;
278 } else {
279 # should be a section name
280 $line =~ s/^\s+//;
281 $line =~ s/\s+$//;
282 my ($pagenum) = $line =~ /([^_]*)$/;
283# $doc_obj->create_named_section($pagenum); <-- can't do this anymore as pagenum may
284# be something like "cover"
285 my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
286
287# $doc_obj->set_utf8_metadata_element($pagenum, 'Title', $pagenum);
288 $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);
289# $self->process_text ($dir, $line, $doc_obj, $pagenum);
290 $self->process_text ($dir, $line, $doc_obj, $cursection);
291# $self->process_images ($dir, $line, $doc_obj, $pagenum);
292 $self->process_images ($dir, $line, $doc_obj, $cursection);
293 }
294 }
295 $file =~ s/\.issue//i;
296 $doc_obj->set_OID ($file);
297 $processor->process ($doc_obj);
298 return 1;
299}
300
301sub process_images {
302 my $self = shift (@_);
303 my ($dir, $page, $doc_obj, $cursection) = @_;
304
305 my $filename = &util::filename_cat ($dir, "images", $page);
306
307 if (-e "$filename.gif") {
308 $self->{'num_image_pages'} ++;
309 $doc_obj->set_utf8_metadata_element ($cursection, "hasimg", "1");
310 $doc_obj->set_utf8_metadata_element ($cursection, "Source", $page);
311 $doc_obj->associate_file("$filename.gif", "$page.gif", "image/gif");
312 } elsif ($self->{'create_log'}) {
313 $doc_obj->set_utf8_metadata_element ($cursection, "hasimg", "0");
314 print LOGFILE "no fullsize image file for $page\n";
315 }
316
317 if (-e "${filename}_p.gif") {
318 $doc_obj->set_utf8_metadata_element ($cursection, "hasprevimg", "1");
319 $doc_obj->set_utf8_metadata_element ($cursection, "Source", $page);
320 $doc_obj->associate_file("${filename}_p.gif", "${page}_p.gif", "image/gif");
321 } elsif ($self->{'create_log'}) {
322 $doc_obj->set_utf8_metadata_element ($cursection, "hasprevimg", "0");
323 print LOGFILE "no preview image file for $page\n";
324 }
325}
326
327sub process_text {
328 my $self = shift (@_);
329 my ($dir, $page, $doc_obj, $cursection) = @_;
330 my ($text);
331
332 my $filename = &util::filename_cat ($dir, "text", $page);
333 if (-e "$filename.htm") {
334 $text = $self->get_text ("$filename.htm");
335 } elsif (-e "$filename.txt") {
336 $text = $self->get_text ("$filename.txt");
337 }
338
339 if (defined $text) {
340 $self->{'num_text_pages'} ++;
341 $doc_obj->add_utf8_text ($cursection, $text);
342 } elsif ($self->{'create_log'}) {
343 print LOGFILE "no txt or htm file for $page\n";
344 }
345}
346
347sub process_abstract {
348 my $self = shift (@_);
349 my ($filename, $issuekey, $dir, $file, $processor, $meta) = @_;
350
351 my $text = $self->get_text ($filename);
352 if (defined $text) {
353 $self->{'num_abstracts'} ++;
354 my $doc_obj = new doc ($file, "indexed_doc");
355 my $cursection = $doc_obj->get_top_section();
356 $self->associate_cover_images ($doc_obj, $dir, $issuekey);
357 $doc_obj->set_utf8_metadata_element ($cursection, 'Title', $self->get_title_string($file));
358 $self->set_main_metadata ($doc_obj, $dir);
359 map { $doc_obj->set_utf8_metadata_element ($cursection, $_, $meta->{$_}); } keys %$meta;
360 $doc_obj->set_utf8_metadata_element ($cursection, "doctype", "Description");
361 $doc_obj->add_utf8_text ($cursection, $text);
362 $file =~ s/\.abstract//i;
363 $doc_obj->set_OID ($file . "abstract");
364 $processor->process ($doc_obj);
365 return 1;
366 }
367
368 if ($self->{'create_log'}) {
369 print LOGFILE "abstract file $filename doesn't exist\n";
370 }
371 return 0;
372}
373
374sub process_commentary {
375 my $self = shift (@_);
376 my ($filename, $issuekey, $dir, $file, $processor) = @_;
377
378 my $text = $self->get_text ($filename);
379
380 return 0 unless defined $text;
381
382 $self->{'commentaries'}->{$issuekey} = 1;
383 my $doc_obj = new doc ($file, "indexed_doc");
384 my $cursection = $doc_obj->get_top_section();
385 $self->associate_cover_images ($doc_obj, $dir, $issuekey);
386 $doc_obj->set_utf8_metadata_element ($cursection, 'Title', "_commentary_");
387 $self->set_main_metadata ($doc_obj, $dir);
388 $doc_obj->set_utf8_metadata_element ($cursection, "doctype", "Commentary");
389 $doc_obj->add_utf8_text ($cursection, $text);
390 $doc_obj->set_OID ($issuekey . "commentary");
391 $processor->process ($doc_obj);
392 return 1;
393}
394
395sub associate_cover_images {
396 my $self = shift (@_);
397 my ($doc_obj, $dir, $issuekey) = @_;
398
399 my $cover = &util::filename_cat ($dir, $issuekey);
400 $doc_obj->associate_file("${cover}on.gif", "${issuekey}/coveron.gif", "image/gif");
401 $doc_obj->associate_file("${cover}of.gif", "${issuekey}/coverof.gif", "image/gif");
402}
403
404# reads in the meta.txt file and sets metadata
405sub set_main_metadata {
406 my $self = shift (@_);
407 my ($doc_obj, $dir) = @_;
408
409 my $metafile = &util::filename_cat ($dir, "meta.txt");
410 return unless (-e $metafile);
411
412 if (!open (METAFILE, $metafile)) {
413 print STDERR "NPPlug: Couldn't read $metafile\n" if $self->{'verbosity'};
414 return;
415 }
416
417 my $cursection = $doc_obj->get_top_section();
418 my $line = "";
419 while (defined ($line = <METAFILE>)) {
420 next unless $line =~ /\w/;
421 chomp $line;
422 if ($line =~ /<([^>]*)>(.*)$/) {
423 # note we're using set_metadata_element (not add_metadata_element)
424 # this will override any previously set metadata of the same name
425 $doc_obj->set_utf8_metadata_element ($cursection, $1, $2);
426 } elsif ($self->{'verbosity'}) {
427 print STDERR "NPPlug: Badly formatted line in $metafile\n";
428 print STDERR "meta.txt lines should be formatted '<metaname>metavalue'\n";
429 }
430 }
431}
432
433sub get_text {
434 my $self = shift (@_);
435 my ($filename) = @_;
436
437 if (open (FILE, $filename)) {
438 my $text = "";
439 my $line = "";
440 if ($filename =~ /\.(htm|commentary|abstract)$/i) {
441 my $savedtext = "";
442 my $foundbody = 0;
443 while (defined ($line = <FILE>)) {
444 if ($line =~ s/.*?<body[^>]*>//i) {
445 $foundbody = 1;
446 }
447 $line =~ s/(<\/?html[^>]*>|<\/?head[^>]*>|<\/p>|<\/?font[^>]*>|<\/?body[^>]*>)//ig;
448 if ($foundbody) {
449 $text .= $line;
450 } else {
451 $savedtext .= $line;
452 }
453 }
454 close FILE;
455 if (!$foundbody) {$text = $savedtext;}
456 if ($filename =~ /\.(commentary|abstract)$/i) {
457 # commentaries and abstracts should already be utf8
458 return $text;
459 } else {
460 # a few extended ascii characters have snuck through
461 # in some text files so we need to convert them to utf8
462 return &unicode::ascii2utf8(\$text);
463 }
464
465 } else {
466 while (defined ($line = <FILE>)) {
467 $line = "<p>\n" unless $line =~ /\w/;
468 $text .= $line;
469 }
470 close FILE;
471 # a few extended ascii characters have snuck through
472 # in some text files so we need to convert them to utf8
473 return &unicode::ascii2utf8(\$text);
474 }
475
476 } else {
477 print STDERR "NPPlug: Warning: get_text() couldn't open $filename\n"
478 if $self->{'verbosity'};
479 return undef;
480 }
481}
482
483sub get_title_string {
484 my $self = shift (@_);
485 my ($filename) = @_;
486
487 $filename =~ s/\.(issue|abstract)$//i;
488 my ($series, $vol, $num) = split /\_/, $filename;
489 my $title = "";
490 $title .= "_vol_ $vol" if defined $vol && $vol =~ /\w/;
491 if (defined $num && $num =~ /\w/) {
492 $title .= ", " if defined $vol && $vol =~ /\w/;
493 $title .= "_num_ $num";
494 }
495 return $title;
496}
497
4981;
Note: See TracBrowser for help on using the repository browser.