Context Navigation

source: trunk/niupepa/perllib/plugins/AbstractPlug.pm@ 11745

Last change on this file since 11745 was 2614, checked in by sjboddie, 23 years ago
* empty log message *
Property svn:keywords set to `Author Date Id Revision`
File size: 6.0 KB

Line
1	###########################################################################
2	#
3	# AbstractPlug.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26
27	#***************
28
29	# don't bother using this anymore - simply split up abstracts manually with
30	# the help of the format_abstract.pl script
31
32	#***************
33
34
35	# AbstractPlug processes the abstracts created for the Niupepa collection
36	# (they're word documents). Each series has just one word document that
37	# contains all the abstracts for that series.
38	# Word files should be named something like 01abstract.doc (the series number
39	# at the beginning is the important thing).
40
41	# Use this plugin (along with import.pl) to split word documents into
42	# multiple files - this is done as an initial step - instead of creating
43	# meaningful files in the archives directory this plugin will create all the
44	# split abstract files in niupepa/newabstracts. these files should then be
45	# copied into the correct place in the real import directory so that import.pl
46	# can be rerun.
47	# This is kind of an ugly way to do it but NPPlug needs to know if a matching
48	# abstract exists when it's processing an issue and it can't know that until
49	# the doc files are split up by issue.
50
51	# any archives created by this plugin are an unwanted side-effect (as I'm too
52	# lazy to override the read() function in this plugin so BasPlug::read() will
53	# create an empty gml file for each doc file we process).
54
55	package AbstractPlug;
56
57	use ConvertToPlug;
58	use util;
59
60	sub BEGIN {
61	@ISA = ('ConvertToPlug');
62	}
63
64	sub get_default_process_exp {
65	my $self = shift (@_);
66
67	return q^(?i)\.doc$^;
68	}
69
70	# do plugin specific processing of doc_obj
71	sub process {
72	my $self = shift (@_);
73	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
74
75	my ($seriesnum) = $file =~ /^(\d+)/;
76	my $dir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "newabstracts", $seriesnum);
77
78	&util::mk_all_dir ($dir) unless -d $dir;
79
80	# clean up the html we got from the conversion process
81	$self->clean_html ($textref);
82
83	# process each issue
84	$$textref =~ s/<p(?:\s[^>])?>\s\n
85	[^\n]+\n # series title line
86	<\/p>\n
87	<p(?:\s[^>])?>\s\n
88	<b>(.?)<\/b>.?\n # issue line (i.e. volume, issue number, date)
89	<\/p>\n
90	(<table[^>]>.?<\/table>)? # the table itself (may not always be defined)
91	/$self->process_issue($1, $2, $seriesnum, $dir)/isgxe;
92	}
93
94	sub process_issue {
95	my $self = shift @_;
96	my ($issue, $text, $seriesnum, $dir) = @_;
97
98	my $OID = $seriesnum . "_";
99
100	my ($volume) = $issue =~ /vol(?:ume)?\s\.?\s(\d+)/i;
101	if (defined $volume) {
102	$OID .= $volume . "_";
103	# print STDERR "volume: $volume ($issue)\n";
104	} else {
105	$OID .= "_";
106	my $outhandle = $self->{'outhandle'};
107	print $outhandle "AbstractPlug: Warning: No volume found ($issue)\n";
108	}
109
110	my ($number) = $issue =~ /(?:no\|num(?:ber)?)\s\.?\s(\d+)/i;
111	if (defined $number) {
112	$OID .= $number;
113	# print STDERR "number: $number ($issue)\n";
114	} else {
115	my $outhandle = $self->{'outhandle'};
116	print $outhandle "AbstractPlug: Warning: No number found ($issue)\n";
117	}
118
119	my $abfile = &util::filename_cat ($dir, "$OID.abstract");
120	open (ABFILE, ">$abfile") \|\| die;
121
122	# links to page numbers
123	if (defined $text) {
124	$text =~ s/(td(?:\s[^>])?>\s\n
125	<p(?:\s[^>])?>\s\n)
126	(pp?\s\.\s(\d+)(?:-\d+)?)(\s*\n)
127	/$1<a href=\"_httpdocument_&cl=_cgiargcl_&d=${OID}\.$3&gg=prev\">$2<\/a>$4/isgx;
128
129	print ABFILE $text;
130	} else {
131	# some don't have any abstract information - we'll just use
132	# the issue line to prevent a completely empty page
133	print ABFILE $issue;
134	}
135	close ABFILE;
136	}
137
138	sub clean_html {
139	my $self = shift (@_);
140	my ($textref) = @_;
141
142	$$textref =~ s/^.?<body[^>]>//is; # remove html headers
143	$$textref =~ s/(<div[^>]+>\|<\/div>)//isg; # don't really need divs either
144	$$textref =~ s/\s*?border=\"\d+\"//igs; # many tables have borders that we don't want
145	$$textref =~ s/\s*?(row\|col)span=\"1\"//igs; # rowspan\|colspan=1 don't seem real useful
146	$$textref =~ s/\s*?line\-height:[^;]+;//igs; # don't really want hard-coded line heights either
147	$$textref =~ s/(<td)/$1 valign=top/igs; # like to valign tables
148
149	# convert macron characters to _amn_ type macros
150
151	# the following characters aren't actually what they should be for utf-8 macron
152	# characters
153	# conversion to utf-8 by wvHtml doesn't appear to correctly encode macrons.
154	$$textref =~ s/\xC3\xA2/_amn_/g;
155	$$textref =~ s/\xC3\xA7/_emn_/g;
156	$$textref =~ s/\xC3\xB4/_omn_/g;
157	$$textref =~ s/\xC3\xBB/_umn_/g;
158	$$textref =~ s/\xC3\x94/_Omn_/g;
159
160	# there might also be some umlauts used in some places
161	$$textref =~ s/\xC3\xAF/_imn_/g;
162
163	# pound sign
164	$$textref =~ s/\xC2\xA3/&\#163/g;
165
166	# check if we've missed any (if this warning is triggered then the corresponding
167	# character(s) should be added to the above lists
168	if ($$textref =~ /([^\x00-\x7F])/) {
169	my $outhandle = $self->{'outhandle'};
170	print $outhandle "AbstractPlug: Warning: multibyte character found which ";
171	print $outhandle "could not be processed ($1)\n";
172	}
173
174	$$textref =~ s/\n+/\n/g; # remove all those extra blank lines
175	}
176
177
178	1;
179

Note: See TracBrowser for help on using the repository browser.

Download in other formats: