source: main/trunk/greenstone2/perllib/plugouts/OpenAIGPTsHTMLPlugout.pm

Last change on this file was 38986, checked in by davidb, 3 weeks ago

Updates to reflect its new Perl package name

File size: 8.6 KB
Line 
1###########################################################################
2#
3# OpenAIGPTsHTMLPlugout.pm -- the plugout module to output docs in a form
4# suitable for ingest into OpenAI's GPTs capability
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2006 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# The role of this plugout is essentially to turn a raw doc_obj
29# representation into a decent to look at HTML file based on the
30# metadata and text, along with all the associate files
31
32# Approach taken is to encode an XML Transform which changes
33# Greenstone's doc.xml format into HTML
34
35package OpenAIGPTsHTMLPlugout;
36
37use strict;
38no strict 'refs';
39no strict 'subs';
40
41eval {require bytes};
42use util;
43use FileUtils;
44use GreenstoneXMLPlugout;
45use docprint;
46
# Establish inheritance at compile time.
#
# The @ISA array has to be the one belonging to this file's own package,
# OpenAIGPTsHTMLPlugout: Perl resolves inherited methods (and SUPER:: calls
# made from subs compiled in this package) through that package's @ISA.
# Populating @ISA under any other package name leaves this class with no
# parent.
sub BEGIN {
    @OpenAIGPTsHTMLPlugout::ISA = ('GreenstoneXMLPlugout');
}
50
# Arguments this plugout adds on top of those it inherits. Only one is
# declared here: the XSLT stylesheet, which defaults to gsdom2gpts.xsl.
my $arguments = [
    {
        'name'      => "xslt_file",
        'desc'      => "{BasPlugout.xslt_file}",
        'type'      => "string",
        'reqd'      => "no",
        'deft'      => "gsdom2gpts.xsl",
        'hiddengli' => "no",
    },
];

# Description record for this plugout; new() appends it to the caller's
# "OptList", and appends the entries of $arguments to the "ArgList".
my $options = {
    'name'     => "OpenAIGPTsHTMLPlugout",
    'desc'     => "{OpenAIGPTsHTMLPlugout.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
65
# Constructor.
#
# Parameters:
#   $class           - class name the new object is blessed into
#   $plugoutlist     - array ref; $class is appended to it
#   $inputargs       - passed through untouched to the parent constructor
#   $hashArgOptLists - hash ref; this plugout's $arguments are appended to
#                      its "ArgList" entry and $options to its "OptList"
#
# Returns the object built by GreenstoneXMLPlugout's constructor, re-blessed
# into $class. No plugout-specific initialisation is performed, so the same
# result is returned whether or not the object is in 'info_only' mode.
sub new {
    my ($class) = shift (@_);
    my ($plugoutlist, $inputargs, $hashArgOptLists) = @_;
    push(@$plugoutlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit Class->method form: the indirect-object spelling
    # ("new Class(...)") is ambiguous to the parser, particularly inside a
    # package that defines its own new().
    my $self = GreenstoneXMLPlugout->new($plugoutlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
83
84
# Name of the output file for a document: the document's OID followed by
# the ".html" extension.
#
# Parameters:
#   $doc_obj - document object; only its get_OID() method is used
sub get_doc_xml_filename {
    my ($self, $doc_obj) = @_;

    return $doc_obj->get_OID() . ".html";
}
95
96# Note: This type of grouping (into dirs) is different to the base
97# class's 'group_size'. 'group_size' is about concatenating multiple
98# Greenstone documents into one file (useful for small docs/records
99# such as MARC). The 'grouped_into_dirs' variables provide
100# the ability for multiple, separate documents to be saved into the
101# same archives/export folder.
102
# State for the "grouped into dirs" layout.
#   $grouped_into_dirs_root        - directory-name prefix; when undefined,
#                                    get_new_doc_dir() falls back to one
#                                    flat directory per OID
#   $grouped_into_dirs_doc_count   - number of documents assigned so far
#   $max_docs_per_grouped_dir      - capacity of each group directory
#   $grouped_into_dirs_group_count - index of the group most recently used
my $grouped_into_dirs_root = "group-";
my $grouped_into_dirs_doc_count = 0;

my $max_docs_per_grouped_dir = 10;
my $grouped_into_dirs_group_count = 0;


# Choose (and create if necessary) the directory a new document is saved in.
#
# Parameters:
#   $working_info - not used by this implementation
#   $working_dir  - directory under which the document directory is created
#   $OID          - document identifier; only used for the flat layout
#
# Returns the directory name relative to $working_dir, always ending in
# '.dir'. Dies if the directory does not exist after trying to create it.
sub get_new_doc_dir
{
    my $self = shift (@_);
    my ($working_info, $working_dir, $OID) = @_;

    my $doc_dir;

    if (defined $grouped_into_dirs_root) {
        # Derive the group from the zero-based position of this document,
        # so that every group -- including the first -- receives exactly
        # $max_docs_per_grouped_dir documents.
        $grouped_into_dirs_group_count
            = int($grouped_into_dirs_doc_count / $max_docs_per_grouped_dir);
        $grouped_into_dirs_doc_count++;

        # The prefix is concatenated rather than embedded in the format
        # string, so a '%' in it could never be read as a conversion.
        $doc_dir = $grouped_into_dirs_root
            . sprintf("%04d", $grouped_into_dirs_group_count);
    }
    else {

        # A slimmed down version of BasePlugout::get_new_doc_dir()
        # which creates a flat file structure, rather than a nested one

        my $doc_dir_rest = $OID;

        # remove any \ and / from the OID
        $doc_dir_rest =~ s/[\\\/]//g;

        # Remove ":" if we are on Windows OS, as otherwise they get
        # confused with the drive letters
        if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i)
        {
            $doc_dir_rest =~ s/\://g;
        }

        $doc_dir = $doc_dir_rest;
    }

    my $doc_dir_name = $doc_dir . '.dir';
    my $full_doc_dir = FileUtils::filenameConcatenate($working_dir, $doc_dir_name);

    if (!FileUtils::directoryExists($full_doc_dir))
    {
        FileUtils::makeAllDirectories($full_doc_dir);

        # Check that the directory really is there now, so a failed
        # creation is reported here instead of on a later write.
        if (!FileUtils::directoryExists($full_doc_dir))
        {
            die("Error! Failed to create directory for document: $doc_dir\n");
        }
    }

    return $doc_dir_name;
}
163
164
165
# Work out which directory the current document belongs in, opening a new
# one when a group boundary is reached.
#
# If this plugout is being used with grouped_into_dirs, then how
# get_group_doc_dir() needs to operate is different. In fact it is simpler
# than the super-class implementation, because (due to the prefix
# manipulation of gsdlassocfiles) it is safe for associated files to be
# saved in the same directory as other documents.
#
# Parameters:
#   $doc_obj - the document about to be written
#
# Reads  $self->{'group_size'}, {'gs_count'} and {'gs_doc_dir'}.
# Writes $self->{'new_doc_dir'}, {'gs_doc_dir'} and {'group_position'}.
#
# Returns the document directory, or nothing (undef in scalar context) when
# closing the previous group's output fails.
sub get_group_doc_dir {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    if (!defined $grouped_into_dirs_root) {
        # Not grouping into dirs: defer to the super-class. The document
        # must be handed on, as a bare call would pass it no document at all.
        my $super_doc_dir = $self->SUPER::get_group_doc_dir($doc_obj);
        return $super_doc_dir;
    }

    my $groupsize = $self->{'group_size'};
    my $gs_count  = $self->{'gs_count'};

    my $open_new_file = (($gs_count % $groupsize) == 0);

    if (!$open_new_file && defined($self->{'gs_doc_dir'})) {
        # Still inside the current group: keep using its directory
        $self->{'new_doc_dir'} = 0;
        return $self->{'gs_doc_dir'};
    }

    # Opening a new file: first we close off the old output
    if ($gs_count > 0)
    {
        return if (!$self->close_group_output());
    }

    # this will create the directory
    my $doc_dir = $self->get_doc_dir($doc_obj);
    $self->{'new_doc_dir'}    = 1;
    $self->{'gs_doc_dir'}     = $doc_dir;
    $self->{'group_position'} = 1;

    return $doc_dir;
}
216
217
218
219
# Rewrite the text of one section, and then of each of its sub-sections,
# into plain text wrapped in a <div> tagged with the document's OID.
#
# For the section's text this:
#   1. replaces every "_httpdocimg_/" prefix with "<oid>-";
#   2. removes every <style>...</style> block, replaces each remaining tag
#      with a space, and collapses whitespace;
#   3. inserts an empty <span fromGSDocId="<oid>"> marker after each chunk
#      matched by the ten-repetition pattern below;
#   4. stores the result back in the section, wrapped in
#      <div gsdocid="<oid>"> with a marker at the start and at the end.
#
# Parameters:
#   $doc_obj - document object (uses _lookup_section() and get_OID())
#   $section - section identifier to process
#
# Returns nothing useful. A section that cannot be looked up is skipped.
sub recursive_process_section_content
{
    my $self = shift (@_);
    my ($doc_obj, $section) = @_;

    my $section_ptr = $doc_obj->_lookup_section ($section);
    return unless defined $section_ptr;

    my $oid = $doc_obj->get_OID();

    my $text = $section_ptr->{'text'};
    $text = '' unless defined $text;

    $text =~ s/_httpdocimg_\//$oid-/g;

    # Turn into text. The style removal must be global: otherwise only the
    # first <style> block goes, and the CSS of any later block survives the
    # tag stripping below and ends up in the text.
    $text =~ s/<style[^>]*>.*?<\/style>//sig;
    $text =~ s/<[^>]*>/ /g;
    # NOTE(review): this only collapses whitespace runs that end in a
    # literal space; runs made solely of newlines/tabs are left as they
    # are -- confirm that is intended.
    $text =~ s/\s+ / /mg;

    # Because \s* may match nothing, this pattern is satisfied by ten
    # whitespace-separated words, and also by a shorter tail holding at
    # least ten non-whitespace characters.
    my $gsdoc_marker = "<span fromGSDocId=\"$oid\"></span>";
    $text =~ s/((?:[^\s]+\s*){10})/$1$gsdoc_marker/sg;

    $section_ptr->{'text'} = "<div gsdocid=\"$oid\">$gsdoc_marker$text$gsdoc_marker</div>";

    # work through all the sub-sections
    foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
        $self->recursive_process_section_content($doc_obj, "$section.$subsection");
    }
}
252
# Rewrite the text of every section of a document by starting the
# recursive section processing at the document's top section.
#
# Parameters:
#   $doc_obj - document object (uses get_top_section())
sub process_content
{
    my ($self, $doc_obj) = @_;

    $self->recursive_process_section_content(
        $doc_obj,
        scalar $doc_obj->get_top_section()
    );
}
261
262
# Give every associated file a name prefixed with "<oid>-", rewrite the
# document's section text to match, then let the super-class do the actual
# associated-file processing.
#
# Parameters:
#   $doc_obj - document object
#   $doc_dir - passed through unchanged to the super-class
#   $handle  - passed through unchanged to the super-class
sub process_assoc_files {
    my ($self, $doc_obj, $doc_dir, $handle) = @_;

    my $assoc_files = $doc_obj->get_assoc_files();
    my $oid         = $doc_obj->get_OID();

    # Same records as before, with only the associated filename changed
    my @renamed_assoc_files = map {
        my ($real_full_filename, $assoc_file, $mime_type, $section) = @$_;
        [ $real_full_filename, "$oid-$assoc_file", $mime_type, $section ];
    } @$assoc_files;

    # Writing straight into the object's field is not the cleanest way to
    # do this, but it gets the job done
    $doc_obj->{'associated_files'} = \@renamed_assoc_files;

    # The content elements need processing too, as links to associated
    # images have now changed
    $self->process_content($doc_obj);

    $self->SUPER::process_assoc_files($doc_obj, $doc_dir, $handle);
}
290
# Write the opening of the output document: the XML declaration followed by
# the opening <Archive> tag.
#
# Parameters:
#   $outhandle - filehandle to print to
#   $doc_oid   - accepted but not used here
#
# An (X)HTML DOCTYPE and <html xmlns=...> wrapper were tried at some point
# and are kept below, disabled, for reference:
#   <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
#   <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/strict.dtd">
#   <html xmlns="http://www.w3.org/TR/xhtml1/strict">
sub output_xml_header {
    my ($self, $outhandle, $doc_oid) = @_;

    my $xml_declaration = '<?xml version="1.0" encoding="utf-8" standalone="no"?>';

    print {$outhandle} $xml_declaration . "\n";
    print {$outhandle} "<Archive>\n";
}
307
# Write the closing </Archive> tag that ends the output document.
#
# Parameters:
#   $outhandle - filehandle to print to
sub output_xml_footer {
    my ($self, $outhandle) = @_;

    print {$outhandle} "</Archive>\n";
}
314
315
316
3171;
318
Note: See TracBrowser for help on using the repository browser.