1 | ###########################################################################
|
---|
2 | #
|
---|
3 | # AbstractPlug.pm --
|
---|
4 | # A component of the Greenstone digital library software
|
---|
5 | # from the New Zealand Digital Library Project at the
|
---|
6 | # University of Waikato, New Zealand.
|
---|
7 | #
|
---|
8 | # Copyright (C) 1999 New Zealand Digital Library Project
|
---|
9 | #
|
---|
10 | # This program is free software; you can redistribute it and/or modify
|
---|
11 | # it under the terms of the GNU General Public License as published by
|
---|
12 | # the Free Software Foundation; either version 2 of the License, or
|
---|
13 | # (at your option) any later version.
|
---|
14 | #
|
---|
15 | # This program is distributed in the hope that it will be useful,
|
---|
16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
18 | # GNU General Public License for more details.
|
---|
19 | #
|
---|
20 | # You should have received a copy of the GNU General Public License
|
---|
21 | # along with this program; if not, write to the Free Software
|
---|
22 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
23 | #
|
---|
24 | ###########################################################################
|
---|
25 |
|
---|
26 | # AbstractPlug processes the abstracts created for the Niupepa collection
|
---|
27 | # (they're word documents). Each series has just one word document that
|
---|
28 | # contains all the abstracts for that series.
|
---|
29 | # Word files should be named something like 01abstract.doc (the series number
|
---|
30 | # at the beginning is the important thing).
|
---|
31 |
|
---|
32 | # Use this plugin (along with import.pl) to split word documents into
|
---|
33 | # multiple files - this is done as an initial step - instead of creating
|
---|
34 | # meaningful files in the archives directory this plugin will create all the
|
---|
35 | # split abstract files in niupepa/newabstracts. these files should then be
|
---|
36 | # copied into the correct place in the real import directory so that import.pl
|
---|
37 | # can be rerun.
|
---|
38 | # This is kind of an ugly way to do it but NPPlug needs to know if a matching
|
---|
39 | # abstract exists when it's processing an issue and it can't know that until
|
---|
40 | # the doc files are split up by issue.
|
---|
41 |
|
---|
42 | # any archives created by this plugin are an unwanted side-effect (as I'm too
|
---|
43 | # lazy to override the read() function in this plugin so BasPlug::read() will
|
---|
44 | # create an empty gml file for each doc file we process).
|
---|
45 |
|
---|
46 | package AbstractPlug;
|
---|
47 |
|
---|
48 | use ConvertToPlug;
|
---|
49 | use util;
|
---|
50 |
|
---|
51 | sub BEGIN {
|
---|
52 | @ISA = ('ConvertToPlug');
|
---|
53 | }
|
---|
54 |
|
---|
55 | sub get_default_process_exp {
|
---|
56 | my $self = shift (@_);
|
---|
57 |
|
---|
58 | return q^(?i)\.doc$^;
|
---|
59 | }
|
---|
60 |
|
---|
61 | # do plugin specific processing of doc_obj
|
---|
62 | sub process {
|
---|
63 | my $self = shift (@_);
|
---|
64 | my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
|
---|
65 |
|
---|
66 | my ($seriesnum) = $file =~ /^(\d+)/;
|
---|
67 | my $dir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "newabstracts", $seriesnum);
|
---|
68 |
|
---|
69 | &util::mk_all_dir ($dir) unless -d $dir;
|
---|
70 |
|
---|
71 | # clean up the html we got from the conversion process
|
---|
72 | $self->clean_html ($textref);
|
---|
73 |
|
---|
74 | # process each issue
|
---|
75 | $$textref =~ s/<p(?:\s[^>]*)?>\s*\n
|
---|
76 | [^\n]+\n # series title line
|
---|
77 | <\/p>\n
|
---|
78 | <p(?:\s[^>]*)?>\s*\n
|
---|
79 | <b>(.*?)<\/b>.*?\n # issue line (i.e. volume, issue number, date)
|
---|
80 | <\/p>\n
|
---|
81 | (<table[^>]*>.*?<\/table>)? # the table itself (may not always be defined)
|
---|
82 | /$self->process_issue($1, $2, $seriesnum, $dir)/isgxe;
|
---|
83 | }
|
---|
84 |
|
---|
85 | sub process_issue {
|
---|
86 | my $self = shift @_;
|
---|
87 | my ($issue, $text, $seriesnum, $dir) = @_;
|
---|
88 |
|
---|
89 | my $OID = $seriesnum . "_";
|
---|
90 |
|
---|
91 | my ($volume) = $issue =~ /vol(?:ume)?\s*\.?\s*(\d+)/i;
|
---|
92 | if (defined $volume) {
|
---|
93 | $OID .= $volume . "_";
|
---|
94 | # print STDERR "volume: $volume ($issue)\n";
|
---|
95 | } else {
|
---|
96 | $OID .= "_";
|
---|
97 | my $outhandle = $self->{'outhandle'};
|
---|
98 | print $outhandle "AbstractPlug: Warning: No volume found ($issue)\n";
|
---|
99 | }
|
---|
100 |
|
---|
101 | my ($number) = $issue =~ /(?:no|num(?:ber)?)\s*\.?\s*(\d+)/i;
|
---|
102 | if (defined $number) {
|
---|
103 | $OID .= $number;
|
---|
104 | # print STDERR "number: $number ($issue)\n";
|
---|
105 | } else {
|
---|
106 | my $outhandle = $self->{'outhandle'};
|
---|
107 | print $outhandle "AbstractPlug: Warning: No number found ($issue)\n";
|
---|
108 | }
|
---|
109 |
|
---|
110 | my $abfile = &util::filename_cat ($dir, "$OID.abstract");
|
---|
111 | open (ABFILE, ">$abfile") || die;
|
---|
112 |
|
---|
113 | # links to page numbers
|
---|
114 | if (defined $text) {
|
---|
115 | $text =~ s/(td(?:\s[^>]*)?>\s*\n
|
---|
116 | <p(?:\s[^>]*)?>\s*\n)
|
---|
117 | (pp?\s*\.\s*(\d+)(?:-\d+)?)(\s*\n)
|
---|
118 | /$1<a href=\"_httpdocument_&cl=_cgiargcl_&d=${OID}\.$3&gg=prev\">$2<\/a>$4/isgx;
|
---|
119 |
|
---|
120 | print ABFILE $text;
|
---|
121 | } else {
|
---|
122 | # some don't have any abstract information - we'll just use
|
---|
123 | # the issue line to prevent a completely empty page
|
---|
124 | print ABFILE $issue;
|
---|
125 | }
|
---|
126 | close ABFILE;
|
---|
127 | }
|
---|
128 |
|
---|
129 | sub clean_html {
|
---|
130 | my $self = shift (@_);
|
---|
131 | my ($textref) = @_;
|
---|
132 |
|
---|
133 | $$textref =~ s/^.*?<body[^>]*>//is; # remove html headers
|
---|
134 | $$textref =~ s/(<div[^>]+>|<\/div>)//isg; # don't really need divs either
|
---|
135 | $$textref =~ s/\s*?border=\"\d+\"//igs; # many tables have borders that we don't want
|
---|
136 | $$textref =~ s/\s*?(row|col)span=\"1\"//igs; # rowspan|colspan=1 don't seem real useful
|
---|
137 | $$textref =~ s/\s*?line\-height:[^;]+;//igs; # don't really want hard-coded line heights either
|
---|
138 | $$textref =~ s/(<td)/$1 valign=top/igs; # like to valign tables
|
---|
139 |
|
---|
140 | # convert macron characters to _amn_ type macros
|
---|
141 |
|
---|
142 | # the following characters aren't actually what they should be for utf-8 macron
|
---|
143 | # characters
|
---|
144 | # conversion to utf-8 by wvHtml doesn't appear to correctly encode macrons.
|
---|
145 | $$textref =~ s/\xC3\xA2/_amn_/g;
|
---|
146 | $$textref =~ s/\xC3\xA7/_emn_/g;
|
---|
147 | $$textref =~ s/\xC3\xB4/_omn_/g;
|
---|
148 | $$textref =~ s/\xC3\xBB/_umn_/g;
|
---|
149 | $$textref =~ s/\xC3\x94/_Omn_/g;
|
---|
150 |
|
---|
151 | # there might also be some umlauts used in some places
|
---|
152 | $$textref =~ s/\xC3\xAF/_imn_/g;
|
---|
153 |
|
---|
154 | # pound sign
|
---|
155 | $$textref =~ s/\xC2\xA3/&\#163/g;
|
---|
156 |
|
---|
157 | # check if we've missed any (if this warning is triggered then the corresponding
|
---|
158 | # character(s) should be added to the above lists
|
---|
159 | if ($$textref =~ /([^\x00-\x7F])/) {
|
---|
160 | my $outhandle = $self->{'outhandle'};
|
---|
161 | print $outhandle "AbstractPlug: Warning: multibyte character found which ";
|
---|
162 | print $outhandle "could not be processed ($1)\n";
|
---|
163 | }
|
---|
164 |
|
---|
165 | $$textref =~ s/\n+/\n/g; # remove all those extra blank lines
|
---|
166 | }
|
---|
167 |
|
---|
168 |
|
---|
169 | 1;
|
---|
170 |
|
---|