Context Navigation

source: trunk/niupepa/perllib/plugins/NPPlug.pm@ 13310

Last change on this file since 13310 was 13310, checked in by nzdl, 17 years ago
modify the new method
Property svn:keywords set to `Author Date Id Revision`
File size: 15.4 KB

Line
1	###########################################################################
2	#
3	# NPPlug.pm -- Plugin for the niupepa collection
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# The niupepa collection has a file structure as follows:
27	# Each niupepa series has its own directory containing some/all
28	# of the following:
29
30	# meta.txt - file contains metadata to be associated with all documents
31	# in series.
32	# *.issue - each issue should have a .issue file which may or may not
33	# contain metadata to associate with the issue. also contains the list
34	# of filenames that make up the issue (i.e. one for each page). meta.txt
35	# is read before *.issue so metadata in .issue files will override that in
36	# meta.txt
37	# *.commentary - the commentary of the niupepa series (1 per series)
38	# text/*.txt/htm - text/html files of issue pages (1 per page) -
39	# text files are expected to be either .htm or .txt (lower case).
40	# images/*.gif - image files of issue pages (1 per page)
41	# abstracts/*.abstract - html files of issue abstracts (1 per issue)
42
43
44	package NPPlug;
45
46	use BasPlug;
47	use util;
48	use strict;
49	use unicode;
50
51	sub BEGIN {
52	@NPPlug::ISA = ('BasPlug');
53	}
54
55
56	my $arguments =
57	[
58	{ 'name' => "create_log",
59	'desc' => "To create a log file",
60	'type' => "flag" },
61	{ 'name' => "logfile",
62	'desc' => "The log file path",
63	'type' => "string",
64	'deft' => "./log.txt"}
65	];
66
67	my $options = { 'name' => "NPPlug",
68	'desc' => "{NPPlug.desc}",
69	'abstract' => "no",
70	'inherits' => "yes",
71	'args' => $arguments };
72
73	sub new {
74	my ($class) = shift(@_);
75
76	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
77	push(@$pluginlist, $class);
78
79	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
80	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
81
82	my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);
83
84	$self->{'commentaries'} = {};
85	$self->{'num_issues'} = 0;
86	$self->{'num_text_pages'} = 0;
87	$self->{'num_image_pages'} = 0;
88	$self->{'num_abstracts'} = 0;
89	return bless $self, $class;
90	}
91
92
93	sub get_default_process_exp {
94	my $self = shift (@_);
95
96	# the last option is an attempt to encode the concept of an html query ...
97	return q^(?i)(\.issue)$^;
98	}
99
100
101	sub is_recursive {
102	my $self = shift (@_);
103
104	return 0; # this is not a recursive plugin
105	}
106
107	sub begin {
108	my $self = shift (@_);
109	my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
110
111	# open up logfile
112	# note that we append to logfile as building will otherwise
113	# overwrite a file generated at import time
114	if ($self->{'create_log'}) {
115	open (LOGFILE, ">>$self->{'logfile'}") \|\| die
116	"NPPlug.pm: Couldn't open log file $self->{'logfile'}\n";
117
118	my @time = localtime (time);
119
120	print LOGFILE "------------------------------------------------------------\n";
121	print LOGFILE "Log start $time[3]/$time[4]/" . (1900 + $time[5]) . "\n";
122	print LOGFILE "------------------------------------------------------------\n";
123	}
124	}
125
126	sub end {
127	my $self = shift (@_);
128
129	if ($self->{'create_log'}) {
130	my $numseries = 0;
131	my $numcommentaries = 0;
132
133	# record missing commentaries in logfile
134	foreach my $key (keys %{$self->{'commentaries'}}) {
135	$numseries ++;
136	if (!$self->{'commentaries'}->{$key}) {
137	print LOGFILE "Commentary missing for series $key\n";
138	} else {
139	$numcommentaries ++;
140	}
141	}
142
143	print LOGFILE "\n\nStatistics:\n";
144	print LOGFILE "series: $numseries\n";
145	print LOGFILE "commentaries: $numcommentaries\n";
146	print LOGFILE "issues: $self->{'num_issues'}\n";
147	print LOGFILE "abstracts: $self->{'num_abstracts'}\n";
148	print LOGFILE "text pages: $self->{'num_text_pages'}\n";
149	print LOGFILE "image pages: $self->{'num_image_pages'}\n";
150
151	# close logfile
152	close LOGFILE;
153	}
154	}
155
156	# return number of files processed, undef if can't process
157	# Note that $base_dir might be "" and that $file might
158	# include directories
159	sub read {
160	my $self = shift (@_);
161	my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
162
163	my $filename = &util::filename_cat($base_dir, $file);
164
165	# we don't want RecPlug to go recursing into the text, images or
166	# abstracts directories
167	return 0 if (-d $filename && $filename =~ /(abstracts\|images\|text)/);
168
169	return 0 if $filename =~ /meta\.txt$/i;
170
171	return undef unless ($filename =~ /\.(issue\|commentary\|gif)$/i && (-e $filename));
172
173	$self->{'verbosity'} = $processor->{'verbosity'};
174	print STDERR "NPPlug: processing $filename\n" if $self->{'verbosity'};
175
176	my ($dir);
177	($dir, $file) = $filename =~ /^(.?)([^\/\\])$/;
178	my ($issuekey) = $file =~ /^([^\.\_]*)/;
179
180	if ($filename =~ /\.commentary$/i) {
181	# commentary
182	$self->{'num_processed'}
183	+= $self->process_commentary ($filename, $issuekey, $dir, $file, $processor);
184	return 1;
185	}
186
187	if ($filename =~ /(.*)on\.gif$/i) {
188	my $root = $1;
189	# create surrogate document to store cover image (on and off) for
190	# newspaper
191
192	$issuekey =~ s/on$//;
193
194	my $doc_obj = new doc ($dir, "indexed_doc");
195	my $cover = &util::filename_cat ($dir, $issuekey);
196
197	$self->associate_cover_images ($doc_obj, $cover, "" );
198	$doc_obj->set_OID ($issuekey);
199	$processor->process ($doc_obj);
200
201	return 0;
202	}
203	if ($filename =~ /(.*)of\.gif$/i) {
204	# block it
205	return 0;
206	}
207	if ($filename =~ /\.gif$/i) {
208	# Let some other plugin have a go
209	return undef;
210	}
211
212	my $numprocessed = 0;
213	$self->{'commentaries'}->{$issuekey} = 0
214	unless defined $self->{'commentaries'}->{$issuekey};
215
216	my ($abstractfile) = $file =~ /^([^\.]*)\.issue/i;
217	my $abstractOID = $abstractfile . "abstract";
218	$abstractfile .= ".abstract";
219	my $afile = &util::filename_cat($dir, "abstracts", $abstractfile);
220	my $hasabstract = 0;
221	if (-e $afile) {$hasabstract = 1;}
222	else {$abstractOID = undef;}
223
224	# process the .issue file
225	my %meta = ();
226	$numprocessed += $self->process_issue ($filename, $issuekey, $dir, $file,
227	$abstractOID, $processor, \%meta);
228
229	# process abstract of this issue
230	if ($hasabstract) {
231	$numprocessed += $self->process_abstract ($afile, $issuekey, $dir,
232	$abstractfile, $processor, \%meta);
233	}
234
235	$self->{'num_processed'} += $numprocessed;
236
237
238
239	return 1;
240	}
241
242	sub process_issue {
243	my $self = shift (@_);
244	my ($filename, $issuekey, $dir, $file, $abstract, $processor, $meta) = @_;
245
246	$self->{'num_issues'} ++;
247	my $doc_obj = new doc ($file, "indexed_doc");
248	my $topsection = $doc_obj->get_top_section();
249	$self->associate_cover_images ($doc_obj, $dir, $issuekey);
250	$doc_obj->set_utf8_metadata_element ($topsection, 'Title', $self->get_title_string($file));
251	$doc_obj->set_utf8_metadata_element ($topsection, 'abstract', $abstract) if defined $abstract;
252	$self->set_main_metadata ($doc_obj, $dir);
253
254	my $com_filename = &util::filename_cat($dir,"$issuekey.commentary");
255	if (-e $com_filename) {
256	$doc_obj->set_utf8_metadata_element ($topsection, "hascom", "1");
257	}
258
259	# process issue's pdf if one exists
260	my ($pdffile) = $file =~ /^([^\.]*)\.issue/i;
261	$pdffile .= ".pdf";
262	$pdffile = &util::filename_cat($dir, "pdf", $pdffile);
263	if (-e $pdffile) {
264	$doc_obj->set_utf8_metadata_element ($topsection, "haspdf", "1");
265	$doc_obj->associate_file($pdffile, "paper.pdf");
266	} else {
267	$doc_obj->set_utf8_metadata_element ($topsection, "haspdf", "0");
268	}
269
270	open (ISSUEFILE, $filename) \|\| die "couldn't open $filename\n";
271	my $line = "";
272	while (defined ($line = <ISSUEFILE>)) {
273	next unless $line =~ /\w/;
274	chomp $line;
275	if ($line =~ /^<([^>])>(.?)\s*$/) {
276	$doc_obj->set_utf8_metadata_element ($topsection, $1, $2);
277	$meta->{$1} = $2;
278	} else {
279	# should be a section name
280	$line =~ s/^\s+//;
281	$line =~ s/\s+$//;
282	my ($pagenum) = $line =~ /([^_]*)$/;
283	# $doc_obj->create_named_section($pagenum); <-- can't do this anymore as pagenum may
284	# be something like "cover"
285	my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
286
287	# $doc_obj->set_utf8_metadata_element($pagenum, 'Title', $pagenum);
288	$doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);
289	# $self->process_text ($dir, $line, $doc_obj, $pagenum);
290	$self->process_text ($dir, $line, $doc_obj, $cursection);
291	# $self->process_images ($dir, $line, $doc_obj, $pagenum);
292	$self->process_images ($dir, $line, $doc_obj, $cursection);
293	}
294	}
295	$file =~ s/\.issue//i;
296	$doc_obj->set_OID ($file);
297	$processor->process ($doc_obj);
298	return 1;
299	}
300
301	sub process_images {
302	my $self = shift (@_);
303	my ($dir, $page, $doc_obj, $cursection) = @_;
304
305	my $filename = &util::filename_cat ($dir, "images", $page);
306
307	if (-e "$filename.gif") {
308	$self->{'num_image_pages'} ++;
309	$doc_obj->set_utf8_metadata_element ($cursection, "hasimg", "1");
310	$doc_obj->set_utf8_metadata_element ($cursection, "Source", $page);
311	$doc_obj->associate_file("$filename.gif", "$page.gif", "image/gif");
312	} elsif ($self->{'create_log'}) {
313	$doc_obj->set_utf8_metadata_element ($cursection, "hasimg", "0");
314	print LOGFILE "no fullsize image file for $page\n";
315	}
316
317	if (-e "${filename}_p.gif") {
318	$doc_obj->set_utf8_metadata_element ($cursection, "hasprevimg", "1");
319	$doc_obj->set_utf8_metadata_element ($cursection, "Source", $page);
320	$doc_obj->associate_file("${filename}_p.gif", "${page}_p.gif", "image/gif");
321	} elsif ($self->{'create_log'}) {
322	$doc_obj->set_utf8_metadata_element ($cursection, "hasprevimg", "0");
323	print LOGFILE "no preview image file for $page\n";
324	}
325	}
326
327	sub process_text {
328	my $self = shift (@_);
329	my ($dir, $page, $doc_obj, $cursection) = @_;
330	my ($text);
331
332	my $filename = &util::filename_cat ($dir, "text", $page);
333	if (-e "$filename.htm") {
334	$text = $self->get_text ("$filename.htm");
335	} elsif (-e "$filename.txt") {
336	$text = $self->get_text ("$filename.txt");
337	}
338
339	if (defined $text) {
340	$self->{'num_text_pages'} ++;
341	$doc_obj->add_utf8_text ($cursection, $text);
342	} elsif ($self->{'create_log'}) {
343	print LOGFILE "no txt or htm file for $page\n";
344	}
345	}
346
347	sub process_abstract {
348	my $self = shift (@_);
349	my ($filename, $issuekey, $dir, $file, $processor, $meta) = @_;
350
351	my $text = $self->get_text ($filename);
352	if (defined $text) {
353	$self->{'num_abstracts'} ++;
354	my $doc_obj = new doc ($file, "indexed_doc");
355	my $cursection = $doc_obj->get_top_section();
356	$self->associate_cover_images ($doc_obj, $dir, $issuekey);
357	$doc_obj->set_utf8_metadata_element ($cursection, 'Title', $self->get_title_string($file));
358	$self->set_main_metadata ($doc_obj, $dir);
359	map { $doc_obj->set_utf8_metadata_element ($cursection, $_, $meta->{$_}); } keys %$meta;
360	$doc_obj->set_utf8_metadata_element ($cursection, "doctype", "Description");
361	$doc_obj->add_utf8_text ($cursection, $text);
362	$file =~ s/\.abstract//i;
363	$doc_obj->set_OID ($file . "abstract");
364	$processor->process ($doc_obj);
365	return 1;
366	}
367
368	if ($self->{'create_log'}) {
369	print LOGFILE "abstract file $filename doesn't exist\n";
370	}
371	return 0;
372	}
373
374	sub process_commentary {
375	my $self = shift (@_);
376	my ($filename, $issuekey, $dir, $file, $processor) = @_;
377
378	my $text = $self->get_text ($filename);
379
380	return 0 unless defined $text;
381
382	$self->{'commentaries'}->{$issuekey} = 1;
383	my $doc_obj = new doc ($file, "indexed_doc");
384	my $cursection = $doc_obj->get_top_section();
385	$self->associate_cover_images ($doc_obj, $dir, $issuekey);
386	$doc_obj->set_utf8_metadata_element ($cursection, 'Title', "_commentary_");
387	$self->set_main_metadata ($doc_obj, $dir);
388	$doc_obj->set_utf8_metadata_element ($cursection, "doctype", "Commentary");
389	$doc_obj->add_utf8_text ($cursection, $text);
390	$doc_obj->set_OID ($issuekey . "commentary");
391	$processor->process ($doc_obj);
392	return 1;
393	}
394
395	sub associate_cover_images {
396	my $self = shift (@_);
397	my ($doc_obj, $dir, $issuekey) = @_;
398
399	my $cover = &util::filename_cat ($dir, $issuekey);
400	$doc_obj->associate_file("${cover}on.gif", "${issuekey}/coveron.gif", "image/gif");
401	$doc_obj->associate_file("${cover}of.gif", "${issuekey}/coverof.gif", "image/gif");
402	}
403
404	# reads in the meta.txt file and sets metadata
405	sub set_main_metadata {
406	my $self = shift (@_);
407	my ($doc_obj, $dir) = @_;
408
409	my $metafile = &util::filename_cat ($dir, "meta.txt");
410	return unless (-e $metafile);
411
412	if (!open (METAFILE, $metafile)) {
413	print STDERR "NPPlug: Couldn't read $metafile\n" if $self->{'verbosity'};
414	return;
415	}
416
417	my $cursection = $doc_obj->get_top_section();
418	my $line = "";
419	while (defined ($line = <METAFILE>)) {
420	next unless $line =~ /\w/;
421	chomp $line;
422	if ($line =~ /<([^>])>(.)$/) {
423	# note we're using set_metadata_element (not add_metadata_element)
424	# this will override any previously set metadata of the same name
425	$doc_obj->set_utf8_metadata_element ($cursection, $1, $2);
426	} elsif ($self->{'verbosity'}) {
427	print STDERR "NPPlug: Badly formatted line in $metafile\n";
428	print STDERR "meta.txt lines should be formatted '<metaname>metavalue'\n";
429	}
430	}
431	}
432
433	sub get_text {
434	my $self = shift (@_);
435	my ($filename) = @_;
436
437	if (open (FILE, $filename)) {
438	my $text = "";
439	my $line = "";
440	if ($filename =~ /\.(htm\|commentary\|abstract)$/i) {
441	my $savedtext = "";
442	my $foundbody = 0;
443	while (defined ($line = <FILE>)) {
444	if ($line =~ s/.?<body[^>]>//i) {
445	$foundbody = 1;
446	}
447	$line =~ s/(<\/?html[^>]>\|<\/?head[^>]>\|<\/p>\|<\/?font[^>]>\|<\/?body[^>]>)//ig;
448	if ($foundbody) {
449	$text .= $line;
450	} else {
451	$savedtext .= $line;
452	}
453	}
454	close FILE;
455	if (!$foundbody) {$text = $savedtext;}
456	if ($filename =~ /\.(commentary\|abstract)$/i) {
457	# commentaries and abstracts should already be utf8
458	return $text;
459	} else {
460	# a few extended ascii characters have snuck through
461	# in some text files so we need to convert them to utf8
462	return &unicode::ascii2utf8(\$text);
463	}
464
465	} else {
466	while (defined ($line = <FILE>)) {
467	$line = "<p>\n" unless $line =~ /\w/;
468	$text .= $line;
469	}
470	close FILE;
471	# a few extended ascii characters have snuck through
472	# in some text files so we need to convert them to utf8
473	return &unicode::ascii2utf8(\$text);
474	}
475
476	} else {
477	print STDERR "NPPlug: Warning: get_text() couldn't open $filename\n"
478	if $self->{'verbosity'};
479	return undef;
480	}
481	}
482
483	sub get_title_string {
484	my $self = shift (@_);
485	my ($filename) = @_;
486
487	$filename =~ s/\.(issue\|abstract)$//i;
488	my ($series, $vol, $num) = split /\_/, $filename;
489	my $title = "";
490	$title .= "_vol_ $vol" if defined $vol && $vol =~ /\w/;
491	if (defined $num && $num =~ /\w/) {
492	$title .= ", " if defined $vol && $vol =~ /\w/;
493	$title .= "_num_ $num";
494	}
495	return $title;
496	}
497
498	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: