Context Navigation

source: trunk/niupepa/perllib/plugins/NPPlug.pm@ 3704

Last change on this file since 3704 was 3704, checked in by sjboddie, 21 years ago
* empty log message *
Property svn:keywords set to `Author Date Id Revision`
File size: 14.4 KB

Line
1	###########################################################################
2	#
3	# NPPlug.pm -- Plugin for the niupepa collection
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# The niupepa collection has a file structure as follows:
27	# Each niupepa series has its own directory containing some/all
28	# of the following:
29
30	# meta.txt - file contains metadata to be associated with all documents
31	# in series.
32	# *.issue - each issue should have a .issue file which may or may not
33	# contain metadata to associate with the issue. also contains the list
34	# of filenames that make up the issue (i.e. one for each page). meta.txt
35	# is read before *.issue so metadata in .issue files will override that in
36	# meta.txt
37	# *.commentary - the commentary of the niupepa series (1 per series)
38	# text/*.txt/htm - text/html files of issue pages (1 per page) -
39	# text files are expected to be either .htm or .txt (lower case).
40	# images/*.gif - image files of issue pages (1 per page)
41	# abstracts/*.abstract - html files of issue abstracts (1 per issue)
42
43
44	package NPPlug;
45
46	use BasPlug;
47	use util;
48
# Set up inheritance at compile time: this package derives from BasPlug.
BEGIN {
    @ISA = qw(BasPlug);
}
52
53	use strict;
54
# Write the NPPlug usage message to STDERR: a hint that the options in
# collect.cfg were wrong, followed by a summary of the supported options.
# Takes no arguments.
sub print_usage {
    my @usage_lines = (
        "\nIncorrect options passed to NPPlug, check your collect.cfg configuration file\n",
        "\n usage: plugin NPPlug [options]\n\n",
        " options:\n",
        " -create_log Creates a log file containing info about which portions\n",
        " of papers are missing\n",
        " -logfile Path of logfile (defaults to ./log.txt)\n\n",
    );

    # emit the lines one at a time, in order
    my $printed;
    foreach my $usage_line (@usage_lines) {
        $printed = print STDERR $usage_line;
    }
    return $printed;
}
64
# Constructor.  Builds a BasPlug object, lets parsargv::parse fill in the
# 'create_log' and 'logfile' options from the argument list, initialises
# the counters used for the log statistics and reblesses the object into
# the requested class.  Prints the usage message and dies if the options
# cannot be parsed.
sub new {
    my ($class) = @_;
    my $self = BasPlug->new();

    my $options_ok = parsargv::parse(
        \@_,
        q^create_log^,           \$self->{'create_log'},
        q^logfile/.*/./log.txt^, \$self->{'logfile'},
    );
    if (!$options_ok) {
        print_usage();
        die "\n";
    }

    # series key -> 1 once a commentary has been processed for that series
    $self->{'commentaries'} = {};

    # counters reported by end() when logging is enabled
    foreach my $counter ('num_issues', 'num_text_pages',
                         'num_image_pages', 'num_abstracts') {
        $self->{$counter} = 0;
    }

    return bless $self, $class;
}
83
# Tells the caller whether this plugin recurses into directories itself.
# NPPlug never does, so the answer is always 0.
sub is_recursive {
    my ($self) = @_;

    return 0;
}
89
# Called before any files are read.  When the -create_log option is set
# this opens the log file named by $self->{'logfile'} on the package-wide
# LOGFILE handle (the other subs in this file print to that handle) and
# writes a dated banner.  Does nothing when logging is disabled.
#
# Arguments ($pluginfo, $base_dir, $processor, $maxdocs) are accepted for
# interface compatibility but are not used here.
# Dies if the log file cannot be opened.
sub begin {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;

    return unless $self->{'create_log'};

    # note that we append to logfile as building will otherwise
    # overwrite a file generated at import time
    my $logfile = $self->{'logfile'};
    open (LOGFILE, '>>', $logfile)
        or die "NPPlug.pm: Couldn't open log file $logfile: $!\n";

    # localtime returns a 0-based month and a year counted from 1900
    my ($mday, $mon, $year) = (localtime (time))[3, 4, 5];
    my $datestamp = $mday . "/" . ($mon + 1) . "/" . (1900 + $year);

    print LOGFILE "------------------------------------------------------------\n";
    print LOGFILE "Log start $datestamp\n";
    print LOGFILE "------------------------------------------------------------\n";
    return;
}
108
# Called after all files have been read.  When logging is enabled this
# reports every series for which no commentary was processed, writes the
# summary statistics and closes the LOGFILE handle.
sub end {
    my $self = shift (@_);

    if ($self->{'create_log'}) {
        my $commentary_seen = $self->{'commentaries'};
        my @series_keys = keys %{$commentary_seen};
        my $numcommentaries = 0;

        # record missing commentaries in logfile, counting the present ones
        foreach my $series (@series_keys) {
            if ($commentary_seen->{$series}) {
                $numcommentaries ++;
                next;
            }
            print LOGFILE "Commentary missing for series $series\n";
        }
        my $numseries = scalar (@series_keys);

        print LOGFILE "\n\nStatistics:\n";
        print LOGFILE "series: $numseries\n";
        print LOGFILE "commentaries: $numcommentaries\n";
        print LOGFILE "issues: $self->{'num_issues'}\n";
        print LOGFILE "abstracts: $self->{'num_abstracts'}\n";
        print LOGFILE "text pages: $self->{'num_text_pages'}\n";
        print LOGFILE "image pages: $self->{'num_image_pages'}\n";

        # close logfile
        close LOGFILE;
    }
}
138
# Entry point called for each file or directory found during import.
#
# Returns the number of documents processed, or undef if this plugin
# cannot process the file.  Note that $base_dir might be "" and that
# $file might include directories.
#
#  - directories whose path mentions abstracts, images or text, and any
#    meta.txt file, are claimed with a return value of 0 (they are read
#    indirectly while an issue is being processed)
#  - anything that is not an existing *.issue or *.commentary file is
#    rejected with undef
#  - a *.commentary file is handed to process_commentary
#  - a *.issue file is handed to process_issue and, if
#    abstracts/<issue>.abstract exists, to process_abstract as well
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;

    my $filename = &util::filename_cat($base_dir, $file);

    # we don't want RecPlug to go recursing into the text, images or
    # abstracts directories
    # NOTE(review): this matches anywhere in the path, not just the final
    # directory name -- confirm that is intended
    return 0 if (-d $filename && $filename =~ /(?:abstracts|images|text)/);

    return 0 if $filename =~ /meta\.txt$/i;

    return undef unless ($filename =~ /\.(?:issue|commentary)$/i && (-e $filename));

    $self->{'verbosity'} = $processor->{'verbosity'};
    print STDERR "NPPlug: processing $filename\n" if $self->{'verbosity'};

    # split the full path into directory part (with trailing separator)
    # and file name; either / or \ counts as a separator
    my ($dir);
    ($dir, $file) = $filename =~ /^(.*?)([^\/\\]*)$/;

    # the series key is everything before the first "." or "_"
    my ($issuekey) = $file =~ /^([^\.\_]*)/;

    if ($filename =~ /\.commentary$/i) {
        # commentary
        return $self->process_commentary ($filename, $issuekey, $dir, $file, $processor);
    }

    my $numprocessed = 0;

    # remember that this series exists so end() can report a missing
    # commentary; don't clobber a flag already set by process_commentary
    $self->{'commentaries'}->{$issuekey} = 0
        unless defined $self->{'commentaries'}->{$issuekey};

    # work out where the abstract of this issue would live
    my ($abstractfile) = $file =~ /^([^\.]*)\.issue/i;
    my $abstractOID = $abstractfile . "abstract";
    $abstractfile .= ".abstract";
    my $afile = &util::filename_cat($dir, "abstracts", $abstractfile);
    my $hasabstract = (-e $afile) ? 1 : 0;
    $abstractOID = undef unless $hasabstract;

    # process the .issue file
    my %meta = ();
    $numprocessed += $self->process_issue ($filename, $issuekey, $dir, $file,
                                           $abstractOID, $processor, \%meta);

    # process abstract of this issue
    if ($hasabstract) {
        $numprocessed += $self->process_abstract ($afile, $issuekey, $dir,
                                                  $abstractfile, $processor, \%meta);
    }

    return $numprocessed;
}
192
# Build and process the document for one issue.
#
#   $filename  - full path of the .issue file
#   $issuekey  - series key, used to locate the cover images
#   $dir       - directory containing the .issue file
#   $file      - name of the .issue file; it is also passed to the doc
#                constructor and, minus ".issue", becomes the OID
#   $abstract  - OID of the issue's abstract document, or undef
#   $processor - object whose process() method receives the document
#   $meta      - hash ref; every <name>value line found in the .issue
#                file is stored in it for the caller
#
# Each non-blank line of the .issue file is either "<name>value"
# metadata for the top section or the base name of a page, for which a
# new section is created and text and images are attached.
# Returns 1.  Dies if the .issue file cannot be opened.
sub process_issue {
    my $self = shift (@_);
    my ($filename, $issuekey, $dir, $file, $abstract, $processor, $meta) = @_;

    $self->{'num_issues'} ++;
    my $doc_obj = doc->new ($file, "indexed_doc");
    my $topsection = $doc_obj->get_top_section();
    $self->associate_cover_images ($doc_obj, $dir, $issuekey);
    $doc_obj->set_utf8_metadata_element ($topsection, 'Title', $self->get_title_string($file));
    $doc_obj->set_utf8_metadata_element ($topsection, 'abstract', $abstract) if defined $abstract;
    $self->set_main_metadata ($doc_obj, $dir);

    # process issue's pdf if one exists
    my ($pdffile) = $file =~ /^([^\.]*)\.issue/i;
    $pdffile .= ".pdf";
    $pdffile = &util::filename_cat($dir, "pdf", $pdffile);
    if (-e $pdffile) {
        $doc_obj->set_utf8_metadata_element ($topsection, "haspdf", "1");
        $doc_obj->associate_file($pdffile, "paper.pdf");
    } else {
        $doc_obj->set_utf8_metadata_element ($topsection, "haspdf", "0");
    }

    open (my $issue_fh, '<', $filename) or die "couldn't open $filename\n";
    while (defined (my $line = <$issue_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        if ($line =~ /^<([^>]*)>(.*?)\s*$/) {
            # metadata line; copy the captures before calling anything
            # else so they cannot change underneath us
            my ($name, $value) = ($1, $2);
            $doc_obj->set_utf8_metadata_element ($topsection, $name, $value);
            $meta->{$name} = $value;
        } else {
            # should be a section name
            $line =~ s/^\s+//;
            $line =~ s/\s+$//;

            # the page label is whatever follows the last underscore; it
            # may be something like "cover", so sections are appended in
            # file order rather than being named after the page
            my ($pagenum) = $line =~ /([^_]*)$/;
            my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));

            $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);
            $self->process_text ($dir, $line, $doc_obj, $cursection);
            $self->process_images ($dir, $line, $doc_obj, $cursection);
        }
    }
    close ($issue_fh);

    $file =~ s/\.issue//i;
    $doc_obj->set_OID ($file);
    $processor->process ($doc_obj);
    return 1;
}
246
# Attach the page images of $page to section $cursection of $doc_obj.
# Looks in $dir/images for "<page>.gif" (full size) and "<page>_p.gif"
# (preview).  For each one found the matching has* flag is set to "1",
# "Source" is set to the page name and the file is associated with the
# document; full-size images are also counted in num_image_pages.
#
# NOTE(review): a missing image is only flagged with "0" (and logged)
# when create_log is on; without logging no flag is written at all.
# Left as is -- confirm whether that coupling is intended.
sub process_images {
    my ($self, $dir, $page, $doc_obj, $cursection) = @_;

    my $image_base = &util::filename_cat ($dir, "images", $page);
    my $fullsize   = "$image_base.gif";
    my $preview    = "${image_base}_p.gif";

    # full-size image
    if (-e $fullsize) {
        $self->{'num_image_pages'} ++;
        $doc_obj->set_utf8_metadata_element ($cursection, "hasimg", "1");
        $doc_obj->set_utf8_metadata_element ($cursection, "Source", $page);
        $doc_obj->associate_file($fullsize, "$page.gif", "image/gif");
    }
    elsif ($self->{'create_log'}) {
        $doc_obj->set_utf8_metadata_element ($cursection, "hasimg", "0");
        print LOGFILE "no fullsize image file for $page\n";
    }

    # preview image
    if (-e $preview) {
        $doc_obj->set_utf8_metadata_element ($cursection, "hasprevimg", "1");
        $doc_obj->set_utf8_metadata_element ($cursection, "Source", $page);
        $doc_obj->associate_file($preview, "${page}_p.gif", "image/gif");
    }
    elsif ($self->{'create_log'}) {
        $doc_obj->set_utf8_metadata_element ($cursection, "hasprevimg", "0");
        print LOGFILE "no preview image file for $page\n";
    }
}
272
# Attach the text of $page to section $cursection of $doc_obj.
# The text comes from $dir/text/<page>.htm if that file exists, otherwise
# from $dir/text/<page>.txt; it is read with get_text.  A page with text
# is counted in num_text_pages; a page without text is reported in the
# log when logging is enabled.
sub process_text {
    my ($self, $dir, $page, $doc_obj, $cursection) = @_;

    my $text_base = &util::filename_cat ($dir, "text", $page);

    # only the first existing candidate is read, .htm taking precedence
    my $text;
    foreach my $candidate ("$text_base.htm", "$text_base.txt") {
        next unless -e $candidate;
        $text = $self->get_text ($candidate);
        last;
    }

    if (defined $text) {
        $self->{'num_text_pages'} ++;
        $doc_obj->add_utf8_text ($cursection, $text);
    }
    elsif ($self->{'create_log'}) {
        print LOGFILE "no txt or htm file for $page\n";
    }
}
292
# Build and process the document for the abstract of one issue.
#
#   $filename  - full path of the .abstract file
#   $issuekey  - series key, used to locate the cover images
#   $dir       - directory of the issue
#   $file      - name of the .abstract file; minus ".abstract" and with
#                "abstract" appended it becomes the OID
#   $processor - object whose process() method receives the document
#   $meta      - hash ref of metadata collected from the .issue file; all
#                of it is copied onto the abstract
#
# Returns 1 if a document was produced, 0 if the text could not be read
# (which is noted in the log when logging is enabled).
sub process_abstract {
    my ($self, $filename, $issuekey, $dir, $file, $processor, $meta) = @_;

    my $text = $self->get_text ($filename);

    unless (defined $text) {
        if ($self->{'create_log'}) {
            print LOGFILE "abstract file $filename doesn't exist\n";
        }
        return 0;
    }

    $self->{'num_abstracts'} ++;

    my $doc_obj = doc->new ($file, "indexed_doc");
    my $top = $doc_obj->get_top_section();
    $self->associate_cover_images ($doc_obj, $dir, $issuekey);
    $doc_obj->set_utf8_metadata_element ($top, 'Title', $self->get_title_string($file));
    $self->set_main_metadata ($doc_obj, $dir);

    # issue-level metadata is applied after meta.txt
    foreach my $name (keys %$meta) {
        $doc_obj->set_utf8_metadata_element ($top, $name, $meta->{$name});
    }

    $doc_obj->set_utf8_metadata_element ($top, "doctype", "Description");
    $doc_obj->add_utf8_text ($top, $text);

    (my $oid_base = $file) =~ s/\.abstract//i;
    $doc_obj->set_OID ($oid_base . "abstract");
    $processor->process ($doc_obj);
    return 1;
}
319
# Build and process the document for the commentary of a series.
#
#   $filename  - full path of the .commentary file
#   $issuekey  - series key; "<issuekey>commentary" becomes the OID
#   $dir       - directory of the series
#   $file      - name of the .commentary file
#   $processor - object whose process() method receives the document
#
# Marks the series as having a commentary.  Returns 1 if a document was
# produced, 0 if the text could not be read.
sub process_commentary {
    my ($self, $filename, $issuekey, $dir, $file, $processor) = @_;

    my $text = $self->get_text ($filename);
    if (!defined $text) {
        return 0;
    }

    # end() reports series whose flag is still 0
    $self->{'commentaries'}->{$issuekey} = 1;

    my $doc_obj = doc->new ($file, "indexed_doc");
    my $top = $doc_obj->get_top_section();

    $self->associate_cover_images ($doc_obj, $dir, $issuekey);
    $doc_obj->set_utf8_metadata_element ($top, 'Title', "_commentary_");
    $self->set_main_metadata ($doc_obj, $dir);
    $doc_obj->set_utf8_metadata_element ($top, "doctype", "Commentary");
    $doc_obj->add_utf8_text ($top, $text);

    $doc_obj->set_OID ($issuekey . "commentary");
    $processor->process ($doc_obj);

    return 1;
}
340
# Associate the two cover images of a series with $doc_obj.  The source
# files are "<dir>/<issuekey>on.gif" and "<dir>/<issuekey>of.gif"; inside
# the document they are stored as "<issuekey>/coveron.gif" and
# "<issuekey>/coverof.gif".  The files are not checked for existence here.
sub associate_cover_images {
    my ($self, $doc_obj, $dir, $issuekey) = @_;

    my $cover_prefix = &util::filename_cat ($dir, $issuekey);
    my $cover_on  = "${cover_prefix}on.gif";
    my $cover_off = "${cover_prefix}of.gif";

    $doc_obj->associate_file($cover_on, "${issuekey}/coveron.gif", "image/gif");
    $doc_obj->associate_file($cover_off, "${issuekey}/coverof.gif", "image/gif");
}
349
# Reads the meta.txt file found in $dir (if there is one) and sets every
# "<metaname>metavalue" line in it as metadata on the top section of
# $doc_obj.  Blank lines are skipped; lines in any other format are
# reported on STDERR when verbosity is set.  A missing meta.txt is not an
# error; an unreadable one is reported when verbosity is set.
sub set_main_metadata {
    my $self = shift (@_);
    my ($doc_obj, $dir) = @_;

    my $metafile = &util::filename_cat ($dir, "meta.txt");
    return unless (-e $metafile);

    my $meta_fh;
    if (!open ($meta_fh, '<', $metafile)) {
        print STDERR "NPPlug: Couldn't read $metafile\n" if $self->{'verbosity'};
        return;
    }

    my $cursection = $doc_obj->get_top_section();
    while (defined (my $line = <$meta_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        if ($line =~ /<([^>]*)>(.*)$/) {
            # copy the captures before handing them to another sub
            my ($name, $value) = ($1, $2);

            # note we're using set_metadata_element (not add_metadata_element)
            # this will override any previously set metadata of the same name
            $doc_obj->set_utf8_metadata_element ($cursection, $name, $value);
        } elsif ($self->{'verbosity'}) {
            print STDERR "NPPlug: Badly formatted line in $metafile\n";
            print STDERR "meta.txt lines should be formatted '<metaname>metavalue'\n";
        }
    }
    close ($meta_fh);
    return;
}
378
# Read $filename and return its contents prepared for indexing, or undef
# (with a warning on STDERR when verbosity is set) if the file cannot be
# opened.
#
#  - *.htm, *.commentary and *.abstract files are treated as html: the
#    html, head, body and font tags and closing </p> tags are removed,
#    and if a <body> tag is present everything before it is discarded
#  - any other file is treated as plain text: lines containing no word
#    characters are replaced by a "<p>" line
#
# Commentaries and abstracts are returned as they are; everything else is
# passed through unicode::ascii2utf8 first.
sub get_text {
    my $self = shift (@_);
    my ($filename) = @_;

    my $fh;
    if (!open ($fh, '<', $filename)) {
        print STDERR "NPPlug: Warning: get_text() couldn't open $filename\n"
            if $self->{'verbosity'};
        return undef;
    }

    my $text = "";

    if ($filename =~ /\.(?:htm|commentary|abstract)$/i) {
        my $savedtext = "";   # what we have read while no <body> has been seen
        my $foundbody = 0;
        while (defined (my $line = <$fh>)) {
            # drop the body tag along with anything before it on the line
            if ($line =~ s/.*?<body[^>]*>//i) {
                $foundbody = 1;
            }
            $line =~ s/(?:<\/?html[^>]*>|<\/?head[^>]*>|<\/p>|<\/?font[^>]*>|<\/?body[^>]*>)//ig;
            if ($foundbody) {
                $text .= $line;
            } else {
                $savedtext .= $line;
            }
        }
        close ($fh);

        # no <body> at all: keep everything we read
        if (!$foundbody) {$text = $savedtext;}

        if ($filename =~ /\.(?:commentary|abstract)$/i) {
            # commentaries and abstracts should already be utf8
            return $text;
        }

        # a few extended ascii characters have snuck through
        # in some text files so we need to convert them to utf8
        return &unicode::ascii2utf8(\$text);
    }

    # plain text: turn blank lines into paragraph breaks
    while (defined (my $line = <$fh>)) {
        $line = "<p>\n" unless $line =~ /\w/;
        $text .= $line;
    }
    close ($fh);

    # a few extended ascii characters have snuck through
    # in some text files so we need to convert them to utf8
    return &unicode::ascii2utf8(\$text);
}
428
# Build the title of an issue or abstract from its file name.
# The name is expected to look like "<series>_<vol>_<num>.issue" (or
# ".abstract"); the result is "_vol_ <vol>, _num_ <num>", leaving out
# whichever part is missing or contains no word characters.  Returns ""
# when neither part is usable.
sub get_title_string {
    my $self = shift (@_);
    my ($filename) = @_;

    # strip the extension so it cannot end up in the last component
    $filename =~ s/\.(?:issue|abstract)$//i;

    my ($series, $vol, $num) = split /\_/, $filename;
    my $has_vol = (defined $vol && $vol =~ /\w/);
    my $has_num = (defined $num && $num =~ /\w/);

    my $title = "";
    $title .= "_vol_ $vol" if $has_vol;
    if ($has_num) {
        $title .= ", " if $has_vol;
        $title .= "_num_ $num";
    }
    return $title;
}
443
444	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: