source: trunk/cic-hcap/perllib/plugins/CICWordPlug.pm@ 13109

Last change on this file since 13109 was 13109, checked in by mdewsnip, 18 years ago

Now escapes square brackets in metadata so Greenstone doesn't try to treat the text as metadata elements.

  • Property svn:keywords set to Author Date Id Revision
File size: 5.2 KB
Line 
1###########################################################################
2#
3# CICWordPlug.pm
4#
5# Copyright (C) 2006 New Zealand Digital Library Project
6#
7# This program is free software; you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation; either version 2 of the License, or
10# (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with this program; if not, write to the Free Software
19# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20#
21###########################################################################
22
23package CICWordPlug;
24
25
26use ConvertToPlug;
27use strict;
28no strict 'refs';
29
30
# Set up inheritance at compile time: CICWordPlug derives from ConvertToPlug.
BEGIN {
    @CICWordPlug::ISA = qw(ConvertToPlug);
}
34
35
# This plugin declares no arguments of its own (empty list).
my $arguments = [];

# Option record for this plugin; new() appends it to the caller's "OptList".
# The description is a "{CICWordPlug.desc}" placeholder string
# (presumably resolved from a resource bundle elsewhere -- confirm).
my $options = {
    'name'     => "CICWordPlug",
    'desc'     => "{CICWordPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
43
44
# Returns the default file-matching expression for this plugin as a string:
# a case-insensitive pattern for names ending in ".doc".
sub get_default_process_exp
{
    my $doc_suffix_exp = '(?i)\.doc$';
    return $doc_suffix_exp;
}
49
50
# Constructor.
#
# Parameters:
#   $class           - class name the new object is blessed into.
#   $pluginlist      - array ref; $class is appended to it.
#   $inputargs       - passed through unchanged to ConvertToPlug's constructor.
#   $hashArgOptLists - hash ref; this plugin's arguments are appended to its
#                      "ArgList" entry and its option record to "OptList".
#
# Returns the object built by ConvertToPlug, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    # Explicit Class->method call: the indirect-object form ("new Class(...)")
    # is parsed by heuristics that depend on which subs and packages are
    # already known at compile time.
    my $self = ConvertToPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
63
64
# Records the current file name, document processor and GLI flag on the
# object (add_reference_document and convert_post_process read them back),
# then hands the same eight arguments to the parent class's read().
#
# Returns whatever the parent's read() returns.
sub read
{
    my ($self, @read_args) = @_;

    # Positions 2, 4 and 7 are $file, $processor and $gli respectively.
    @{$self}{'file', 'processor', 'gli'} = @read_args[2, 4, 7];

    # Always forward exactly eight arguments, padding with undef if needed.
    return $self->SUPER::read(@read_args[0 .. 7]);
}
76
77
# Scans the converted HTML file line by line and creates one "Reference"
# document (via add_reference_document) for each bibliography entry found.
#
# Parameters:
#   $conv_filename - path of the converted HTML file to scan.
#
# A line starting with "<p " switches record mode on and a line starting
# with "</p>" switches it off. While in record mode, each line is treated
# as one candidate reference:
#   * lines entirely wrapped in <b>...</b> are skipped;
#   * Type 1: the line starts with a title in &ldquo;...&rdquo; (no author);
#   * Type 2: some leading text (taken as the author) followed by a title
#     in <i>...</i>, &quot;...&quot; or &ldquo;...&rdquo;;
#   * anything else that still contains a word character once tags are
#     stripped is reported as unparseable, but is still added as a
#     reference with empty title and author.
#
# If the file cannot be opened a warning is printed to STDERR and no
# references are added.
sub convert_post_process
{
    my $self = shift(@_);
    my $conv_filename = shift(@_);
    my $fail_log_handle = $self->{'failhandle'};

    # Three-argument open on a lexical handle: the filename is never
    # interpreted for mode characters, and a failure is reported instead of
    # being silently treated as an empty file.
    my $html_in;
    if (!open($html_in, '<', $conv_filename)) {
        print STDERR "Warning: Bibliography -- Could not open $conv_filename: $!\n";
        return;
    }

    my $in_record = 0;
    my $num_records = 0;
    while (my $line = <$html_in>) {
        # Is this the end of a record?
        if ($line =~ /^<\/p>/) {
            $in_record = 0;
        }
        # Is this the start of a possible record?
        if ($line =~ /^<p /) {
            $in_record = 1;
        }
        elsif ($in_record == 1) {
            # Ignore lines completely in bold
            next if ($line =~ /^<b>(.*)<\/b>$/);

            my $line_sans_tags = $line;
            $line_sans_tags =~ s/<.*?>//g;

            # Type 1: No author, title in double quotes
            if ($line =~ /^&ldquo;(.*)&rdquo;/) {
                my $title = $1;
                $num_records++;
                $self->add_reference_document($num_records, $line, $title, "");
            }

            # Type 2: Author, title in italics or quotes
            elsif ($line =~ /^(.*)<i>(.*)<\/i>/ || $line =~ /^(.*)&quot;(.*)&quot;/ || $line =~ /^(.*)&ldquo;(.*)&rdquo;/) {
                # Copy the captures straight away; they belong to whichever
                # alternative matched.
                my ($author, $title) = ($1, $2);
                $num_records++;
                $self->add_reference_document($num_records, $line, $title, $author);
            }

            # Warning: Could not parse record
            elsif ($line_sans_tags =~ /\w/) {
                print STDERR "<ProcessingError n='Bibliography' p='CICWordPlug' r='Could not parse reference: $line'>\n" if ($self->{'gli'});
                print STDERR "Warning: Bibliography -- Could not parse reference: $line\n";
                print $fail_log_handle "Warning: Bibliography -- Could not parse reference: $line\n";
                $num_records++;
                $self->add_reference_document($num_records, $line, "", "");
            }
        }
    }
    close($html_in);
}
127
128
# Builds a stand-alone document for one bibliography reference and passes it
# to the document processor stored on the object by read().
#
# Parameters:
#   $reference_id     - record number; the document's OID is "r" followed by it.
#   $reference        - full reference text, stored as "Reference" metadata.
#   $reference_title  - stored as "ReferenceTitle" metadata (skipped if empty).
#   $reference_author - stored as "ReferenceAuthor" metadata (skipped if empty).
#
# Also increments $self->{'num_processed'}.
#
# NOTE(review): convert_post_process restarts its record counter at 1 for
# every converted file, so OIDs such as "r1" look like they could repeat
# across source files -- confirm whether that matters for this collection.
sub add_reference_document
{
    my ($self, $reference_id, $reference, $reference_title, $reference_author) = @_;

    # Explicit Class->method call. This package defines its own new(), so the
    # indirect-object form ("new doc(...)") only parses as a method call when
    # the doc package happens to be loaded already at compile time.
    my $reference_doc_obj = doc->new($self->{'file'} . "-", "indexed_doc");
    $reference_doc_obj->set_OID("r$reference_id");
    new_metadata_entry($reference_doc_obj, "DocumentType", "Reference");
    new_metadata_entry($reference_doc_obj, "Reference", $reference);
    new_metadata_entry($reference_doc_obj, "ReferenceAuthor", $reference_author);
    new_metadata_entry($reference_doc_obj, "ReferenceTitle", $reference_title);

    # The document gets a fixed placeholder body; the reference itself lives
    # entirely in the metadata set above.
    $reference_doc_obj->add_utf8_text($reference_doc_obj->get_top_section(), "Some dummy text.");
    $self->{'processor'}->process($reference_doc_obj);
    $self->{'num_processed'}++;
}
148
149
# Cleans up one metadata name/value pair and attaches it to the top section
# of $doc_obj. Plain function, not a method.
#
# Parameters:
#   $doc_obj        - document object that receives the metadata.
#   $metadata_name  - metadata name; spaces are replaced with underscores.
#   $metadata_value - metadata value; nothing is added when it is empty.
#
# Returns nothing for an empty value, otherwise the result of the
# add_utf8_metadata call.
sub new_metadata_entry
{
    my ($doc_obj, $metadata_name, $metadata_value) = @_;

    # Empty values are not worth storing
    return unless (length $metadata_value);

    # Metadata names must not contain spaces
    (my $clean_name = $metadata_name) =~ tr/ /_/;

    # Run the value through unicode::ascii2utf8 (the original comment says the
    # input is ISO 8859-1 encoded and is being converted to UTF-8)
    $metadata_value = unicode::ascii2utf8(\$metadata_value);

    # Strip the '#' characters that wrap links.
    # NOTE(review): on a match the WHOLE value is replaced by the text between
    # the first pair of '#' characters, so anything outside them is dropped --
    # confirm that is intended for values that merely contain a link.
    $metadata_value = $1 if ($metadata_value =~ /\#(.*?)\#/);

    # Square brackets become character entities so Greenstone doesn't try to
    # treat the text as metadata elements
    $metadata_value =~ s/\[/&\#91;/g;
    $metadata_value =~ s/\]/&\#93;/g;

    # Drop superscript tags: opening tags first, then closing tags
    $metadata_value =~ s/\<sup\>//g;
    $metadata_value =~ s/\<\/sup\>//g;

    my $top_section = $doc_obj->get_top_section();
    $doc_obj->add_utf8_metadata($top_section, $clean_name, $metadata_value);
}
178
179
1801;
181
Note: See TracBrowser for help on using the repository browser.