source: trunk/cic-hcap/perllib/plugins/CICWordPlug.pm@ 12170

Last change on this file since 12170 was 12170, checked in by mdewsnip, 18 years ago

First cut at a new plugin for processing the non-place references.

  • Property svn:keywords set to Author Date Id Revision
File size: 4.3 KB
Line 
1###########################################################################
2#
3# CICWordPlug.pm
4#
5# Copyright (C) 2006 New Zealand Digital Library Project
6#
7# This program is free software; you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation; either version 2 of the License, or
10# (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with this program; if not, write to the Free Software
19# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20#
21###########################################################################
22
23package CICWordPlug;
24
25
26use ConvertToPlug;
27use strict;
28no strict 'refs';
29
30
# Set up inheritance at compile time: CICWordPlug is a subclass of
# ConvertToPlug.
BEGIN {
    @CICWordPlug::ISA = ('ConvertToPlug');
}


# This plugin declares no arguments of its own; the (empty) list is still
# appended to the shared "ArgList" by new().
my $arguments = [];

# Option block that new() appends to the shared "OptList".
# NOTE(review): 'desc' looks like a key into a strings resource rather than
# literal text -- confirm against the plugin framework.
my $options = {
    'name'     => 'CICWordPlug',
    'desc'     => '{CICWordPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
43
44
# Returns, as a string, the regular expression selecting the files this
# plugin handles by default: names ending in ".doc", matched
# case-insensitively.
sub get_default_process_exp
{
    my $word_file_pattern = '(?i)\.doc$';
    return $word_file_pattern;
}
49
50
# Constructor.
#
# Arguments:
#   $class           - class name being constructed
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the ConvertToPlug constructor
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" array refs
#                      receive this plugin's arguments and option block
#
# Returns the object built by ConvertToPlug, re-blessed into $class.
sub new {
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@{$pluginlist}, $class);

    # Register this plugin's own arguments and options before the parent
    # constructor is called with the same lists.
    if (defined $arguments) {
        push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
    }
    if (defined $options) {
        push(@{ $hashArgOptLists->{"OptList"} }, $options);
    }

    my $self = ConvertToPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless($self, $class);
}
63
64
# Records the file name, document processor and GLI flag on the plugin
# object (they are used later when reference documents are created), then
# delegates the actual reading to the parent class.
#
# Takes the same eight arguments as the parent's read() and returns
# whatever the parent's read() returns.
sub read
{
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $metadata,
        $processor, $maxdocs, $total_count, $gli) = @_;

    # Stash what add_reference_document() needs later on.
    @{$self}{ 'file', 'processor', 'gli' } = ($file, $processor, $gli);

    return $self->SUPER::read($pluginfo, $base_dir, $file, $metadata,
                              $processor, $maxdocs, $total_count, $gli);
}
76
77
# Scans the converted HTML file for reference records and passes each one
# found to add_reference_document().
#
# Arguments:
#   $self          - the plugin object
#   $conv_filename - path of the HTML file to scan
#
# Lines are only examined while "inside a record": a line starting with
# "<p " opens a record and a line starting with "</p>" closes it. The
# opening "<p " line itself is never parsed as a reference. Records are
# numbered from 1 in the order they are found.
#
# Returns the result of closing the input file, or nothing (false) if the
# file could not be opened.
sub convert_post_process
{
    my $self = shift(@_);
    my $conv_filename = shift(@_);

    # Three-argument open with a lexical handle: the file name is taken
    # literally (no mode characters or surrounding whitespace are
    # interpreted), and a failure is reported instead of silently yielding
    # no records.
    my $html_in;
    if (!open($html_in, '<', $conv_filename)) {
        print STDERR "CICWordPlug: could not open $conv_filename for reading: $!\n";
        return;
    }

    my $in_record = 0;
    my $num_records = 0;
    while (my $line = <$html_in>) {
        # Is this the end of a record?
        if ($line =~ /^<\/p>/) {
            $in_record = 0;
        }

        # Is this the start of a possible record?
        if ($line =~ /^<p /) {
            $in_record = 1;
        }
        elsif ($in_record == 1) {
            # Type 1: No author, title in double quotes
            if ($line =~ /^&ldquo;(.*)&rdquo;/) {
                my $title = $1;
                $num_records++;
                $self->add_reference_document($num_records, $line, $title, "");
            }

            # Type 2: Author, title in italics or quotes
            elsif ($line =~ /^(.*)<i>(.*)<\/i>/ || $line =~ /^(.*)&quot;(.*)&quot;/ || $line =~ /^(.*)&ldquo;(.*)&rdquo;/) {
                # Copy the captures straight away so later matches cannot
                # clobber them.
                my ($author, $title) = ($1, $2);
                $num_records++;
                $self->add_reference_document($num_records, $line, $title, $author);
            }

            # Error: Could not parse record
            else {
                # Only complain when something other than markup is left.
                my $line_sans_tags = $line;
                $line_sans_tags =~ s/<.*?>//g;
                if ($line_sans_tags =~ /\w/) {
                    print "Unknown type: $line";
                }
            }
        }
    }

    return close($html_in);
}
119
120
# Builds a separate document for one reference and sends it to the
# processor stored on the plugin object by read().
#
# Arguments:
#   $self             - the plugin object
#   $reference_id     - record number; the document's OID becomes "r<id>"
#   $reference        - reference text, stored as the document's text
#   $reference_title  - stored as "Title" metadata (skipped when empty)
#   $reference_author - stored as "Author" metadata (skipped when empty)
#
# Also increments the plugin's 'num_processed' counter.
sub add_reference_document
{
    my ($self, $reference_id, $reference, $reference_title, $reference_author) = @_;

    my $ref_doc = 'doc'->new($self->{'file'} . "-", "indexed_doc");
    $ref_doc->set_OID("r" . $reference_id);

    # Every reference is tagged with a fixed document type.
    new_metadata_entry($ref_doc, "DocumentType", "Reference");
    new_metadata_entry($ref_doc, "Author", $reference_author);
    new_metadata_entry($ref_doc, "Title", $reference_title);

    my $top_section = $ref_doc->get_top_section();
    $ref_doc->add_utf8_text($top_section, $reference);

    $self->{'processor'}->process($ref_doc);
    $self->{'num_processed'}++;
}
139
140
# Adds one metadata name/value pair to the top section of a document.
#
# Arguments:
#   $doc_obj        - document object receiving the metadata
#   $metadata_name  - metadata name; spaces are replaced by underscores
#   $metadata_value - metadata value; nothing is added when it is empty
#
# Plain function, not a method: there is no $self argument.
sub new_metadata_entry
{
    my ($doc_obj, $metadata_name, $metadata_value) = @_;

    # Empty values are not worth recording
    if ($metadata_value eq "") {
        return;
    }

    # Metadata names must not contain spaces
    $metadata_name =~ tr/ /_/;

    # Re-encode the value as UTF-8
    # NOTE(review): presumably the input is ISO 8859-1 -- confirm against
    # unicode::ascii2utf8.
    $metadata_value = unicode::ascii2utf8(\$metadata_value);

    # When the value contains text wrapped in '#' characters, keep only the
    # text between the first such pair
    $metadata_value = $1 if ($metadata_value =~ /\#(.*?)\#/);

    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $metadata_name, $metadata_value);
}
161
162
1631;
164
Note: See TracBrowser for help on using the repository browser.