source: trunk/niupepa/perllib/plugins/AbstractPlug.pm@ 1551

Last change on this file since 1551 was 1551, checked in by sjboddie, 24 years ago

New plugin for doing the splitting up of abstract word files

  • Property svn:keywords set to Author Date Id Revision
File size: 5.6 KB
Line 
1###########################################################################
2#
3# AbstractPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# AbstractPlug processes the abstracts created for the Niupepa collection
27# (they're word documents). Each series has just one word document that
28# contains all the abstracts for that series.
29# Word files should be named something like 01abstract.doc (the series number
30# at the beginning is the important thing).
31
32# Use this plugin (along with import.pl) to split word documents into
33# multiple files - this is done as an initial step - instead of creating
34# meaningful files in the archives directory this plugin will create all the
35# split abstract files in niupepa/newabstracts. These files should then be
36# copied into the correct place in the real import directory so that import.pl
37# can be rerun.
38# This is kind of an ugly way to do it but NPPlug needs to know if a matching
39# abstract exists when it's processing an issue and it can't know that until
40# the doc files are split up by issue.
41
42# any archives created by this plugin are an unwanted side-effect (as I'm too
43# lazy to override the read() function in this plugin so BasPlug::read() will
44# create an empty gml file for each doc file we process).
45
46package AbstractPlug;
47
48use ConvertToPlug;
49use util;
50
# Establish inheritance at compile time: this package derives from
# ConvertToPlug.
BEGIN {
    our @ISA = qw(ConvertToPlug);
}
54
# Return the default process expression as a pattern string: a
# case-insensitive match on names ending in ".doc".
sub get_default_process_exp {
    my ($self) = @_;

    my $doc_file_exp = '(?i)\.doc$';
    return $doc_file_exp;
}
60
61# do plugin specific processing of doc_obj
# Split the converted HTML of one series document into per-issue abstract
# files.
#
# Arguments (after $self):
#   $textref  - reference to the HTML text; it is modified in place (cleaned
#               up, and each matched issue is replaced by whatever
#               process_issue() returns)
#   $file     - name of the file being processed; its leading digits are
#               taken as the series number
#   $pluginfo, $base_dir, $metadata, $doc_obj - accepted but not used here
#
# Output goes to $ENV{GSDLCOLLECTDIR}/newabstracts/<series number>, which is
# created if it does not exist yet.
#
# Returns the number of issues matched (a false value when none matched).
# If $file does not start with a series number a warning is printed to the
# plugin's outhandle and 0 is returned without writing anything; previously
# the undefined series number produced mis-named files directly inside
# newabstracts.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    my ($seriesnum) = $file =~ /^(\d+)/;
    unless (defined $seriesnum) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "AbstractPlug: Warning: No series number found at start of file name ($file), file skipped\n";
        return 0;
    }

    my $dir = util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "newabstracts", $seriesnum);
    util::mk_all_dir($dir) unless -d $dir;

    # clean up the html we got from the conversion process
    $self->clean_html ($textref);

    # process each issue: $1 is the whole block for the issue (series title
    # paragraph, issue paragraph and optional table), $2 is the bold text of
    # the issue line
    my $issues_found = ($$textref =~ s/(<p(?:\s[^>]*)?>\s*\n
            [^\n]+\n                        # series title line
            <\/p>\n
            <p(?:\s[^>]*)?>\s*\n
            <b>(.*?)<\/b>.*?\n              # issue line (i.e. volume, issue number, date)
            <\/p>\n
            (?:<table[^>]*>.*?<\/table>)?)  # the table itself (may not always be defined)
        /$self->process_issue($1, $2, $seriesnum, $dir)/isgxe);

    return $issues_found;
}
84
# Write the abstract for a single issue to its own file.
#
# Arguments (after $self):
#   $text      - HTML for this issue
#   $issue     - text of the issue line; the volume and issue numbers are
#                extracted from it
#   $seriesnum - series number, used as the first part of the OID
#   $dir       - directory the abstract file is written to
#
# The OID is built as <series>_<volume>_<number>; a missing volume or number
# leaves its part empty and prints a warning to the plugin's outhandle.
# Page references of the form "p. N" or "pp. N-M" at the start of a table
# cell paragraph are wrapped in a link to <OID>.N. The result is written to
# <dir>/<OID>.abstract.
#
# Dies (with the file name and system error) if the file cannot be opened
# or closed. Returns 1 on success.
sub process_issue {
    my $self = shift @_;
    my ($text, $issue, $seriesnum, $dir) = @_;

    my $outhandle = $self->{'outhandle'};
    my $OID = $seriesnum . "_";

    my ($volume) = $issue =~ /vol(?:ume)?\s*\.?\s*(\d+)/i;
    if (defined $volume) {
        $OID .= $volume . "_";
    } else {
        # keep the separator so the number still ends up in third position
        $OID .= "_";
        print $outhandle "AbstractPlug: Warning: No volume found ($issue)\n";
    }

    my ($number) = $issue =~ /(?:no|num(?:ber)?)\s*\.?\s*(\d+)/i;
    if (defined $number) {
        $OID .= $number;
    } else {
        print $outhandle "AbstractPlug: Warning: No number found ($issue)\n";
    }

    # links to page numbers
    $text =~ s/(td(?:\s[^>]*)?>\s*\n
            <p(?:\s[^>]*)?>\s*\n)
            (pp?\s*\.\s*(\d+)(?:-\d+)?)(\s*\n)
        /$1<a href=\"_httpdocument_&cl=_cgiargcl_&d=${OID}\.$3\">$2<\/a>$4/isgx;

    my $abfile = util::filename_cat($dir, "$OID.abstract");

    # three-argument open so that odd characters in the path cannot be
    # mistaken for part of the open mode
    open(my $abfh, '>', $abfile)
        or die "AbstractPlug: couldn't open $abfile for writing: $!\n";
    print $abfh $text;
    # buffered write errors only show up when the handle is closed
    close($abfh)
        or die "AbstractPlug: couldn't close $abfile: $!\n";

    return 1;
}
121
# Tidy up, in place, the HTML produced by the conversion process.
#
# Arguments (after $self):
#   $textref - reference to the HTML text to clean
#
# Removes everything up to and including the opening <body> tag, div tags,
# table border attributes, rowspan/colspan="1" attributes and hard-coded
# line heights; rewrites known multibyte sequences to _xmn_ style macros
# (and the pound sign to a numeric character reference); prints a warning
# to the plugin's outhandle if any non-ASCII character is left over; and
# finally collapses runs of newlines.
#
# Returns the result of the final newline substitution.
sub clean_html {
    my $self = shift (@_);
    my ($textref) = @_;

    $$textref =~ s/^.*?<body[^>]*>//is;           # remove html headers
    $$textref =~ s/(<div[^>]+>|<\/div>)//isg;     # don't really need divs either
    $$textref =~ s/\s*?border=\"\d+\"//igs;       # many tables have borders that we don't want
    $$textref =~ s/\s*?(row|col)span=\"1\"//igs;  # rowspan|colspan=1 don't seem real useful
    $$textref =~ s/\s*?line\-height:[^;]+;//igs;  # don't really want hard-coded line heights either

    # Byte sequences found in the converted text and what they are replaced
    # with. The sequences aren't actually what they should be for utf-8
    # macron characters: conversion to utf-8 by wvHtml doesn't appear to
    # correctly encode macrons.
    my @replacements = (
        [ "\xC3\xA2", '_amn_' ],
        [ "\xC3\xA7", '_emn_' ],
        [ "\xC3\xB4", '_omn_' ],
        [ "\xC3\xBB", '_umn_' ],
        [ "\xC3\x94", '_Omn_' ],

        # there might also be some umlauts used in some places
        [ "\xC3\xAF", '_imn_' ],

        # pound sign; the terminating ';' matters, without it "£5" would
        # turn into the character reference "&#1635"
        [ "\xC2\xA3", '&#163;' ],
    );
    for my $pair (@replacements) {
        my ($bytes, $replacement) = @$pair;
        $$textref =~ s/\Q$bytes\E/$replacement/g;
    }

    # check if we've missed any (if this warning is triggered then the
    # corresponding character(s) should be added to the above list)
    if ($$textref =~ /([^\x00-\x7F])/) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "AbstractPlug: Warning: multibyte character found which ";
        print $outhandle "could not be processed ($1)\n";
    }

    return $$textref =~ s/\n+/\n/g; # remove all those extra blank lines
}
159
160
1611;
162
Note: See TracBrowser for help on using the repository browser.