source: trunk/niupepa/perllib/plugins/AbstractPlug.pm@ 1554

Last change on this file since 1554 was 1554, checked in by sjboddie, 24 years ago

* empty log message *

  • Property svn:keywords set to Author Date Id Revision
File size: 5.9 KB
Line 
1###########################################################################
2#
3# AbstractPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# AbstractPlug processes the abstracts created for the Niupepa collection
27# (they're word documents). Each series has just one word document that
28# contains all the abstracts for that series.
29# Word files should be named something like 01abstract.doc (the series number
30# at the beginning is the important thing).
31
32# Use this plugin (along with import.pl) to split word documents into
33# multiple files - this is done as an initial step - instead of creating
34# meaningful files in the archives directory this plugin will create all the
35# split abstract files in niupepa/newabstracts. these files should then be
36# copied into the correct place in the real import directory so that import.pl
37# can be rerun.
38# This is kind of an ugly way to do it but NPPlug needs to know if a matching
39# abstract exists when it's processing an issue and it can't know that until
40# the doc files are split up by issue.
41
42# any archives created by this plugin are an unwanted side-effect (as I'm too
43# lazy to override the read() function in this plugin so BasPlug::read() will
44# create an empty gml file for each doc file we process).
45
46package AbstractPlug;
47
48use ConvertToPlug;
49use util;
50
51sub BEGIN {
52 @ISA = ('ConvertToPlug');
53}
54
55sub get_default_process_exp {
56 my $self = shift (@_);
57
58 return q^(?i)\.doc$^;
59}
60
61# do plugin specific processing of doc_obj
62sub process {
63 my $self = shift (@_);
64 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
65
66 my ($seriesnum) = $file =~ /^(\d+)/;
67 my $dir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "newabstracts", $seriesnum);
68
69 &util::mk_all_dir ($dir) unless -d $dir;
70
71 # clean up the html we got from the conversion process
72 $self->clean_html ($textref);
73
74 # process each issue
75 $$textref =~ s/<p(?:\s[^>]*)?>\s*\n
76 [^\n]+\n # series title line
77 <\/p>\n
78 <p(?:\s[^>]*)?>\s*\n
79 <b>(.*?)<\/b>.*?\n # issue line (i.e. volume, issue number, date)
80 <\/p>\n
81 (<table[^>]*>.*?<\/table>)? # the table itself (may not always be defined)
82 /$self->process_issue($1, $2, $seriesnum, $dir)/isgxe;
83}
84
85sub process_issue {
86 my $self = shift @_;
87 my ($issue, $text, $seriesnum, $dir) = @_;
88
89 my $OID = $seriesnum . "_";
90
91 my ($volume) = $issue =~ /vol(?:ume)?\s*\.?\s*(\d+)/i;
92 if (defined $volume) {
93 $OID .= $volume . "_";
94# print STDERR "volume: $volume ($issue)\n";
95 } else {
96 $OID .= "_";
97 my $outhandle = $self->{'outhandle'};
98 print $outhandle "AbstractPlug: Warning: No volume found ($issue)\n";
99 }
100
101 my ($number) = $issue =~ /(?:no|num(?:ber)?)\s*\.?\s*(\d+)/i;
102 if (defined $number) {
103 $OID .= $number;
104# print STDERR "number: $number ($issue)\n";
105 } else {
106 my $outhandle = $self->{'outhandle'};
107 print $outhandle "AbstractPlug: Warning: No number found ($issue)\n";
108 }
109
110 my $abfile = &util::filename_cat ($dir, "$OID.abstract");
111 open (ABFILE, ">$abfile") || die;
112
113 # links to page numbers
114 if (defined $text) {
115 $text =~ s/(td(?:\s[^>]*)?>\s*\n
116 <p(?:\s[^>]*)?>\s*\n)
117 (pp?\s*\.\s*(\d+)(?:-\d+)?)(\s*\n)
118 /$1<a href=\"_httpdocument_&cl=_cgiargcl_&d=${OID}\.$3&gg=prev\">$2<\/a>$4/isgx;
119
120 print ABFILE $text;
121 } else {
122 # some don't have any abstract information - we'll just use
123 # the issue line to prevent a completely empty page
124 print ABFILE $issue;
125 }
126 close ABFILE;
127}
128
129sub clean_html {
130 my $self = shift (@_);
131 my ($textref) = @_;
132
133 $$textref =~ s/^.*?<body[^>]*>//is; # remove html headers
134 $$textref =~ s/(<div[^>]+>|<\/div>)//isg; # don't really need divs either
135 $$textref =~ s/\s*?border=\"\d+\"//igs; # many tables have borders that we don't want
136 $$textref =~ s/\s*?(row|col)span=\"1\"//igs; # rowspan|colspan=1 don't seem real useful
137 $$textref =~ s/\s*?line\-height:[^;]+;//igs; # don't really want hard-coded line heights either
138 $$textref =~ s/(<td)/$1 valign=top/igs; # like to valign tables
139
140 # convert macron characters to _amn_ type macros
141
142 # the following characters aren't actually what they should be for utf-8 macron
143 # characters
144 # conversion to utf-8 by wvHtml doesn't appear to correctly encode macrons.
145 $$textref =~ s/\xC3\xA2/_amn_/g;
146 $$textref =~ s/\xC3\xA7/_emn_/g;
147 $$textref =~ s/\xC3\xB4/_omn_/g;
148 $$textref =~ s/\xC3\xBB/_umn_/g;
149 $$textref =~ s/\xC3\x94/_Omn_/g;
150
151 # there might also be some umlauts used in some places
152 $$textref =~ s/\xC3\xAF/_imn_/g;
153
154 # pound sign
155 $$textref =~ s/\xC2\xA3/&\#163/g;
156
157 # check if we've missed any (if this warning is triggered then the corresponding
158 # character(s) should be added to the above lists
159 if ($$textref =~ /([^\x00-\x7F])/) {
160 my $outhandle = $self->{'outhandle'};
161 print $outhandle "AbstractPlug: Warning: multibyte character found which ";
162 print $outhandle "could not be processed ($1)\n";
163 }
164
165 $$textref =~ s/\n+/\n/g; # remove all those extra blank lines
166}
167
168
1691;
170
Note: See TracBrowser for help on using the repository browser.