source: trunk/cic-hcap/perllib/plugins/CICWordPlug.pm@ 13108

Last change on this file since 13108 was 13108, checked in by mdewsnip, 18 years ago

Now removes superscript tags from non-place references.

  • Property svn:keywords set to Author Date Id Revision
File size: 5.0 KB
Line 
1###########################################################################
2#
3# CICWordPlug.pm
4#
5# Copyright (C) 2006 New Zealand Digital Library Project
6#
7# This program is free software; you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation; either version 2 of the License, or
10# (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with this program; if not, write to the Free Software
19# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20#
21###########################################################################
22
23package CICWordPlug;
24
25
26use ConvertToPlug;
27use strict;
28no strict 'refs';
29
30
# Establish inheritance at compile time: CICWordPlug is a ConvertToPlug.
BEGIN {
    @CICWordPlug::ISA = qw(ConvertToPlug);
}
34
35
# This plugin declares no arguments of its own.
my $arguments = [];

# Plugin description record; new() appends it to the caller's "OptList".
my $options = {
    'name'     => 'CICWordPlug',
    'desc'     => '{CICWordPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
43
44
# Returns the default file-name pattern for this plugin, as a regular
# expression string: names ending in ".doc", matched case-insensitively.
sub get_default_process_exp
{
    my $doc_suffix_exp = q{(?i)\.doc$};
    return $doc_suffix_exp;
}
49
50
# Constructor.
#
# Arguments:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the ConvertToPlug constructor
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to its
#                      "ArgList" entry and its option record to "OptList"
#
# Returns the object built by ConvertToPlug, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # Explicit method-call syntax: the indirect form "new ConvertToPlug(...)"
    # is ambiguous in a package that defines its own new().
    my $self = ConvertToPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
63
64
# Records the file name, document processor and GLI flag on the plugin object
# (add_reference_document() and convert_post_process() read them back later),
# then hands the unchanged argument list to the parent class's read() and
# returns whatever that call returns.
sub read
{
    my ($self, $pluginfo, $base_dir, $file, $metadata,
        $processor, $maxdocs, $total_count, $gli) = @_;

    @{$self}{'file', 'processor', 'gli'} = ($file, $processor, $gli);

    return $self->SUPER::read($pluginfo, $base_dir, $file, $metadata,
                              $processor, $maxdocs, $total_count, $gli);
}
76
77
# Scans the converted HTML file for bibliography entries and passes each one
# to add_reference_document() as a separate reference record.
#
# Arguments:
#   $conv_filename - path of the converted HTML file to scan
#
# Only lines that come after a line starting with "<p " and before a line
# starting with "</p>" are examined. Lines that are entirely bold are skipped.
# Each remaining line is tried against the known reference layouts; a line
# that contains word characters but matches no layout is reported (STDERR and
# the fail log) and still stored, with an empty title and author.
#
# Returns the result of closing the file, or nothing if it cannot be opened.
sub convert_post_process
{
    my $self = shift(@_);
    my $conv_filename = shift(@_);
    my $fail_log_handle = $self->{'failhandle'};

    # Three-argument open on a lexical handle: the file name can never be
    # interpreted as an open mode, and a failure is reported rather than
    # silently yielding zero references.
    my $html_in;
    if (!open($html_in, '<', $conv_filename)) {
        my $open_error = "Warning: Bibliography -- Could not open $conv_filename: $!\n";
        print STDERR $open_error;
        print {$fail_log_handle} $open_error if (defined $fail_log_handle);
        return;
    }

    my $in_record = 0;
    my $num_records = 0;
    while (my $line = <$html_in>) {
        # Is this the end of a record?
        if ($line =~ /^<\/p>/) {
            $in_record = 0;
        }
        # Is this the start of a possible record?
        if ($line =~ /^<p /) {
            $in_record = 1;
        }
        elsif ($in_record == 1) {
            # Ignore lines completely in bold
            next if ($line =~ /^<b>(.*)<\/b>$/);

            # Tag-free copy, used only to decide whether an unparsable line
            # has any real content worth reporting
            my $line_sans_tags = $line;
            $line_sans_tags =~ s/<.*?>//g;

            # Type 1: No author, title in double quotes
            if ($line =~ /^&ldquo;(.*)&rdquo;/) {
                $num_records++;
                $self->add_reference_document($num_records, $line, $1, "");
            }

            # Type 2: Author, title in italics or quotes
            # ($1 and $2 come from whichever alternative matched)
            elsif ($line =~ /^(.*)<i>(.*)<\/i>/
                   || $line =~ /^(.*)&quot;(.*)&quot;/
                   || $line =~ /^(.*)&ldquo;(.*)&rdquo;/) {
                $num_records++;
                $self->add_reference_document($num_records, $line, $2, $1);
            }

            # Warning: Could not parse record
            elsif ($line_sans_tags =~ /\w/) {
                print STDERR "<ProcessingError n='Bibliography' p='CICWordPlug' r='Could not parse reference: $line'>\n" if ($self->{'gli'});
                print STDERR "Warning: Bibliography -- Could not parse reference: $line\n";
                print {$fail_log_handle} "Warning: Bibliography -- Could not parse reference: $line\n" if (defined $fail_log_handle);
                $num_records++;
                $self->add_reference_document($num_records, $line, "", "");
            }
        }
    }
    return close($html_in);
}
127
128
# Builds a separate document object for one bibliography reference and passes
# it to the document processor stored on the plugin by read().
#
# Arguments:
#   $reference_id     - number of the reference; the document's OID is "r<id>"
#   $reference        - the full reference line
#   $reference_title  - extracted title (may be empty)
#   $reference_author - extracted author text (may be empty)
#
# Empty values are not stored (see new_metadata_entry). Each call increments
# the plugin's 'num_processed' counter.
sub add_reference_document
{
    my $self = shift(@_);
    my ($reference_id, $reference, $reference_title, $reference_author) = @_;

    # Explicit method-call syntax: "new doc(...)" is indirect object syntax,
    # whose parse is ambiguous in a package that defines its own new().
    my $reference_doc_obj = doc->new($self->{'file'} . "-", "indexed_doc");
    $reference_doc_obj->set_OID("r$reference_id");

    new_metadata_entry($reference_doc_obj, "DocumentType", "Reference");
    new_metadata_entry($reference_doc_obj, "Reference", $reference);
    new_metadata_entry($reference_doc_obj, "ReferenceAuthor", $reference_author);
    new_metadata_entry($reference_doc_obj, "ReferenceTitle", $reference_title);

    # The reference itself lives in the metadata; the body is placeholder text
    $reference_doc_obj->add_utf8_text($reference_doc_obj->get_top_section(), "Some dummy text.");
    $self->{'processor'}->process($reference_doc_obj);
    $self->{'num_processed'}++;
}
148
149
# Adds one metadata value to the top section of a document object.
#
# Arguments:
#   $doc_obj        - document object to receive the metadata
#   $metadata_name  - metadata name; spaces are replaced by underscores
#   $metadata_value - metadata value; nothing is added when undefined or empty
#
# Before storing, the value is passed through unicode::ascii2utf8, '#' link
# markers are stripped and <sup>/</sup> tags are removed.
sub new_metadata_entry
{
    my ($doc_obj, $metadata_name, $metadata_value) = (@_);

    # Don't bother with empty metadata
    return if (!defined $metadata_value || $metadata_value eq "");

    # Spaces aren't allowed in metadata names
    $metadata_name =~ s/ /_/g;

    # Convert the value to UTF-8.
    # NOTE(review): presumably assumes single-byte (ISO 8859-1) input; the
    # encoding of the converted HTML is not visible here -- confirm.
    $metadata_value = unicode::ascii2utf8(\$metadata_value);

    # Remove '#' characters around links, keeping the link text and everything
    # around it (previously the whole value was replaced by the text between
    # the first pair of '#', discarding the rest of the reference)
    $metadata_value =~ s/\#(.*?)\#/$1/g;

    # Remove any superscript tags
    $metadata_value =~ s/<\/?sup>//g;

    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $metadata_name, $metadata_value);
}
174
175
1761;
177
Note: See TracBrowser for help on using the repository browser.