source: trunk/niupepa/perllib/plugins/AbstractPlug.pm@ 11745

Last change on this file since 11745 was 2614, checked in by sjboddie, 23 years ago

* empty log message *

  • Property svn:keywords set to Author Date Id Revision
File size: 6.0 KB
Line 
1###########################################################################
2#
3# AbstractPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26
27#***************
28
29# don't bother using this anymore - simply split up abstracts manually with
30# the help of the format_abstract.pl script
31
32#***************
33
34
35# AbstractPlug processes the abstracts created for the Niupepa collection
36# (they're word documents). Each series has just one word document that
37# contains all the abstracts for that series.
38# Word files should be named something like 01abstract.doc (the series number
39# at the beginning is the important thing).
40
41# Use this plugin (along with import.pl) to split word documents into
42# multiple files - this is done as an initial step - instead of creating
43# meaningful files in the archives directory this plugin will create all the
44# split abstract files in niupepa/newabstracts. these files should then be
45# copied into the correct place in the real import directory so that import.pl
46# can be rerun.
47# This is kind of an ugly way to do it but NPPlug needs to know if a matching
48# abstract exists when it's processing an issue and it can't know that until
49# the doc files are split up by issue.
50
51# any archives created by this plugin are an unwanted side-effect (as I'm too
52# lazy to override the read() function in this plugin so BasPlug::read() will
53# create an empty gml file for each doc file we process).
54
55package AbstractPlug;
56
57use ConvertToPlug;
58use util;
59
60sub BEGIN {
61 @ISA = ('ConvertToPlug');
62}
63
64sub get_default_process_exp {
65 my $self = shift (@_);
66
67 return q^(?i)\.doc$^;
68}
69
70# do plugin specific processing of doc_obj
71sub process {
72 my $self = shift (@_);
73 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
74
75 my ($seriesnum) = $file =~ /^(\d+)/;
76 my $dir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "newabstracts", $seriesnum);
77
78 &util::mk_all_dir ($dir) unless -d $dir;
79
80 # clean up the html we got from the conversion process
81 $self->clean_html ($textref);
82
83 # process each issue
84 $$textref =~ s/<p(?:\s[^>]*)?>\s*\n
85 [^\n]+\n # series title line
86 <\/p>\n
87 <p(?:\s[^>]*)?>\s*\n
88 <b>(.*?)<\/b>.*?\n # issue line (i.e. volume, issue number, date)
89 <\/p>\n
90 (<table[^>]*>.*?<\/table>)? # the table itself (may not always be defined)
91 /$self->process_issue($1, $2, $seriesnum, $dir)/isgxe;
92}
93
94sub process_issue {
95 my $self = shift @_;
96 my ($issue, $text, $seriesnum, $dir) = @_;
97
98 my $OID = $seriesnum . "_";
99
100 my ($volume) = $issue =~ /vol(?:ume)?\s*\.?\s*(\d+)/i;
101 if (defined $volume) {
102 $OID .= $volume . "_";
103# print STDERR "volume: $volume ($issue)\n";
104 } else {
105 $OID .= "_";
106 my $outhandle = $self->{'outhandle'};
107 print $outhandle "AbstractPlug: Warning: No volume found ($issue)\n";
108 }
109
110 my ($number) = $issue =~ /(?:no|num(?:ber)?)\s*\.?\s*(\d+)/i;
111 if (defined $number) {
112 $OID .= $number;
113# print STDERR "number: $number ($issue)\n";
114 } else {
115 my $outhandle = $self->{'outhandle'};
116 print $outhandle "AbstractPlug: Warning: No number found ($issue)\n";
117 }
118
119 my $abfile = &util::filename_cat ($dir, "$OID.abstract");
120 open (ABFILE, ">$abfile") || die;
121
122 # links to page numbers
123 if (defined $text) {
124 $text =~ s/(td(?:\s[^>]*)?>\s*\n
125 <p(?:\s[^>]*)?>\s*\n)
126 (pp?\s*\.\s*(\d+)(?:-\d+)?)(\s*\n)
127 /$1<a href=\"_httpdocument_&cl=_cgiargcl_&d=${OID}\.$3&gg=prev\">$2<\/a>$4/isgx;
128
129 print ABFILE $text;
130 } else {
131 # some don't have any abstract information - we'll just use
132 # the issue line to prevent a completely empty page
133 print ABFILE $issue;
134 }
135 close ABFILE;
136}
137
138sub clean_html {
139 my $self = shift (@_);
140 my ($textref) = @_;
141
142 $$textref =~ s/^.*?<body[^>]*>//is; # remove html headers
143 $$textref =~ s/(<div[^>]+>|<\/div>)//isg; # don't really need divs either
144 $$textref =~ s/\s*?border=\"\d+\"//igs; # many tables have borders that we don't want
145 $$textref =~ s/\s*?(row|col)span=\"1\"//igs; # rowspan|colspan=1 don't seem real useful
146 $$textref =~ s/\s*?line\-height:[^;]+;//igs; # don't really want hard-coded line heights either
147 $$textref =~ s/(<td)/$1 valign=top/igs; # like to valign tables
148
149 # convert macron characters to _amn_ type macros
150
151 # the following characters aren't actually what they should be for utf-8 macron
152 # characters
153 # conversion to utf-8 by wvHtml doesn't appear to correctly encode macrons.
154 $$textref =~ s/\xC3\xA2/_amn_/g;
155 $$textref =~ s/\xC3\xA7/_emn_/g;
156 $$textref =~ s/\xC3\xB4/_omn_/g;
157 $$textref =~ s/\xC3\xBB/_umn_/g;
158 $$textref =~ s/\xC3\x94/_Omn_/g;
159
160 # there might also be some umlauts used in some places
161 $$textref =~ s/\xC3\xAF/_imn_/g;
162
163 # pound sign
164 $$textref =~ s/\xC2\xA3/&\#163/g;
165
166 # check if we've missed any (if this warning is triggered then the corresponding
167 # character(s) should be added to the above lists
168 if ($$textref =~ /([^\x00-\x7F])/) {
169 my $outhandle = $self->{'outhandle'};
170 print $outhandle "AbstractPlug: Warning: multibyte character found which ";
171 print $outhandle "could not be processed ($1)\n";
172 }
173
174 $$textref =~ s/\n+/\n/g; # remove all those extra blank lines
175}
176
177
1781;
179
Note: See TracBrowser for help on using the repository browser.