Context Navigation

source: trunk/niupepa/perllib/plugins/AbstractPlug.pm@ 1554

Last change on this file since 1554 was 1554, checked in by sjboddie, 24 years ago
* empty log message *
Property svn:keywords set to `Author Date Id Revision`
File size: 5.9 KB

Line
1	###########################################################################
2	#
3	# AbstractPlug.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# AbstractPlug processes the abstracts created for the Niupepa collection
27	# (they're word documents). Each series has just one word document that
28	# contains all the abstracts for that series.
29	# Word files should be named something like 01abstract.doc (the series number
30	# at the beginning is the important thing).
31
32	# Use this plugin (along with import.pl) to split word documents into
33	# multiple files - this is done as an initial step - instead of creating
34	# meaningful files in the archives directory this plugin will create all the
35	# split abstract files in niupepa/newabstracts. these files should then be
36	# copied into the correct place in the real import directory so that import.pl
37	# can be rerun.
38	# This is kind of an ugly way to do it but NPPlug needs to know if a matching
39	# abstract exists when it's processing an issue and it can't know that until
40	# the doc files are split up by issue.
41
42	# any archives created by this plugin are an unwanted side-effect (as I'm too
43	# lazy to override the read() function in this plugin so BasPlug::read() will
44	# create an empty gml file for each doc file we process).
45
46	package AbstractPlug;
47
48	use ConvertToPlug;
49	use util;
50
51	sub BEGIN {
52	@ISA = ('ConvertToPlug');
53	}
54
55	sub get_default_process_exp {
56	my $self = shift (@_);
57
58	return q^(?i)\.doc$^;
59	}
60
61	# do plugin specific processing of doc_obj
62	sub process {
63	my $self = shift (@_);
64	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
65
66	my ($seriesnum) = $file =~ /^(\d+)/;
67	my $dir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "newabstracts", $seriesnum);
68
69	&util::mk_all_dir ($dir) unless -d $dir;
70
71	# clean up the html we got from the conversion process
72	$self->clean_html ($textref);
73
74	# process each issue
75	$$textref =~ s/<p(?:\s[^>])?>\s\n
76	[^\n]+\n # series title line
77	<\/p>\n
78	<p(?:\s[^>])?>\s\n
79	<b>(.?)<\/b>.?\n # issue line (i.e. volume, issue number, date)
80	<\/p>\n
81	(<table[^>]>.?<\/table>)? # the table itself (may not always be defined)
82	/$self->process_issue($1, $2, $seriesnum, $dir)/isgxe;
83	}
84
85	sub process_issue {
86	my $self = shift @_;
87	my ($issue, $text, $seriesnum, $dir) = @_;
88
89	my $OID = $seriesnum . "_";
90
91	my ($volume) = $issue =~ /vol(?:ume)?\s\.?\s(\d+)/i;
92	if (defined $volume) {
93	$OID .= $volume . "_";
94	# print STDERR "volume: $volume ($issue)\n";
95	} else {
96	$OID .= "_";
97	my $outhandle = $self->{'outhandle'};
98	print $outhandle "AbstractPlug: Warning: No volume found ($issue)\n";
99	}
100
101	my ($number) = $issue =~ /(?:no\|num(?:ber)?)\s\.?\s(\d+)/i;
102	if (defined $number) {
103	$OID .= $number;
104	# print STDERR "number: $number ($issue)\n";
105	} else {
106	my $outhandle = $self->{'outhandle'};
107	print $outhandle "AbstractPlug: Warning: No number found ($issue)\n";
108	}
109
110	my $abfile = &util::filename_cat ($dir, "$OID.abstract");
111	open (ABFILE, ">$abfile") \|\| die;
112
113	# links to page numbers
114	if (defined $text) {
115	$text =~ s/(td(?:\s[^>])?>\s\n
116	<p(?:\s[^>])?>\s\n)
117	(pp?\s\.\s(\d+)(?:-\d+)?)(\s*\n)
118	/$1<a href=\"_httpdocument_&cl=_cgiargcl_&d=${OID}\.$3&gg=prev\">$2<\/a>$4/isgx;
119
120	print ABFILE $text;
121	} else {
122	# some don't have any abstract information - we'll just use
123	# the issue line to prevent a completely empty page
124	print ABFILE $issue;
125	}
126	close ABFILE;
127	}
128
129	sub clean_html {
130	my $self = shift (@_);
131	my ($textref) = @_;
132
133	$$textref =~ s/^.?<body[^>]>//is; # remove html headers
134	$$textref =~ s/(<div[^>]+>\|<\/div>)//isg; # don't really need divs either
135	$$textref =~ s/\s*?border=\"\d+\"//igs; # many tables have borders that we don't want
136	$$textref =~ s/\s*?(row\|col)span=\"1\"//igs; # rowspan\|colspan=1 don't seem real useful
137	$$textref =~ s/\s*?line\-height:[^;]+;//igs; # don't really want hard-coded line heights either
138	$$textref =~ s/(<td)/$1 valign=top/igs; # like to valign tables
139
140	# convert macron characters to _amn_ type macros
141
142	# the following characters aren't actually what they should be for utf-8 macron
143	# characters
144	# conversion to utf-8 by wvHtml doesn't appear to correctly encode macrons.
145	$$textref =~ s/\xC3\xA2/_amn_/g;
146	$$textref =~ s/\xC3\xA7/_emn_/g;
147	$$textref =~ s/\xC3\xB4/_omn_/g;
148	$$textref =~ s/\xC3\xBB/_umn_/g;
149	$$textref =~ s/\xC3\x94/_Omn_/g;
150
151	# there might also be some umlauts used in some places
152	$$textref =~ s/\xC3\xAF/_imn_/g;
153
154	# pound sign
155	$$textref =~ s/\xC2\xA3/&\#163/g;
156
157	# check if we've missed any (if this warning is triggered then the corresponding
158	# character(s) should be added to the above lists
159	if ($$textref =~ /([^\x00-\x7F])/) {
160	my $outhandle = $self->{'outhandle'};
161	print $outhandle "AbstractPlug: Warning: multibyte character found which ";
162	print $outhandle "could not be processed ($1)\n";
163	}
164
165	$$textref =~ s/\n+/\n/g; # remove all those extra blank lines
166	}
167
168
169	1;
170

Note: See TracBrowser for help on using the repository browser.

Download in other formats: