Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

OpenAIGPTsHTMLPlugout.pm

Last change on this file was 38986, checked in by davidb, 3 weeks ago
Updates to reflect its new Perl package name
File size: 8.6 KB

Line
1	###########################################################################
2	#
# OpenAIGPTsHTMLPlugout.pm -- the plugout module to output docs in a form
# suitable for ingest into OpenAI's GPTs capability
5	#
6	# A component of the Greenstone digital library software
7	# from the New Zealand Digital Library Project at the
8	# University of Waikato, New Zealand.
9	#
10	# Copyright (C) 2006 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
# The role of this plugout is essentially to turn a raw doc_obj
# representation into a decent-looking HTML file based on the
# metadata and text, along with all the associated files
31
32	# Approach taken is to encode an XML Transform which changes
33	# Greenstone's doc.xml format into HTML
34
35	package OpenAIGPTsHTMLPlugout;
36
37	use strict;
38	no strict 'refs';
39	no strict 'subs';
40
41	eval {require bytes};
42	use util;
43	use FileUtils;
44	use GreenstoneXMLPlugout;
45	use docprint;
46
# Register the superclass under this module's actual package name
# (OpenAIGPTsHTMLPlugout). Without this, inherited methods and the
# SUPER:: calls made further down in this file cannot be resolved.
BEGIN {
    @OpenAIGPTsHTMLPlugout::ISA = ('GreenstoneXMLPlugout');
}
50
# Arguments this plugout adds on top of the ones it inherits: a single
# optional 'xslt_file' string argument defaulting to gsdom2gpts.xsl.
my $arguments = [
    {
        name      => 'xslt_file',
        desc      => '{BasPlugout.xslt_file}',
        type      => 'string',
        reqd      => 'no',
        deft      => 'gsdom2gpts.xsl',
        hiddengli => 'no',
    },
];

# Description record for this plugout; it carries the argument list above.
my $options = {
    name     => 'OpenAIGPTsHTMLPlugout',
    desc     => '{OpenAIGPTsHTMLPlugout.desc}',
    abstract => 'no',
    inherits => 'yes',
    args     => $arguments,
};
65
# Constructor.
#
# Arguments:
#   $plugoutlist     - array ref; this class's name is appended to it
#   $inputargs       - handed on unchanged to the GreenstoneXMLPlugout
#                      constructor
#   $hashArgOptLists - hash ref; this plugout's argument and option records
#                      are appended to its "ArgList" and "OptList" entries
#
# Returns the object produced by the GreenstoneXMLPlugout constructor,
# re-blessed into $class.
sub new {
    my $class = shift(@_);
    my ($plugoutlist, $inputargs, $hashArgOptLists) = @_;

    push(@$plugoutlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit Class->method syntax; the indirect-object form
    # ("new GreenstoneXMLPlugout(...)") is ambiguous to the parser,
    # particularly inside a package that defines its own new().
    my $self = GreenstoneXMLPlugout->new($plugoutlist, $inputargs, $hashArgOptLists);

    # This plugout has no set-up of its own, so the object is returned the
    # same way whether or not it was built in 'info_only' mode.
    return bless $self, $class;
}
83
84
# Returns the output file name for a document: the value reported by the
# document's get_OID() with ".html" appended.
sub get_doc_xml_filename {
    my ($self, $doc_obj) = @_;

    return $doc_obj->get_OID() . ".html";
}
95
# Note: This type of grouping (into dirs) is different to the base
# class's 'group_size'. 'group_size' is about concatenating multiple
# Greenstone documents into one file (useful for small docs/records
# such as MARC). The 'grouped_into_dirs' variables provide the
# ability to save multiple, separate documents into the same
# archives/export folder.

# Prefix of each group directory name; when undefined, one flat directory
# per document is used instead
my $grouped_into_dirs_root = "group-";
# Number of documents that have been allocated to a group directory so far
my $grouped_into_dirs_doc_count = 0;

# Number of documents placed in a group directory before starting the next
my $max_docs_per_grouped_dir = 10;
# Index of the group directory currently being filled
my $grouped_into_dirs_group_count = 0;


# Chooses the directory a new document is saved into, creating it beneath
# $working_dir when it does not exist yet.
#
# Arguments:
#   $working_info - not used by this implementation
#   $working_dir  - directory under which the document directory lives
#   $OID          - document identifier; only used when directory grouping
#                   is switched off ($grouped_into_dirs_root undefined)
#
# Returns the directory name relative to $working_dir (it always ends in
# ".dir"). Dies if the directory is still missing after trying to create it.
sub get_new_doc_dir
{
    my $self = shift (@_);
    my ($working_info, $working_dir, $OID) = @_;

    my $doc_dir;

    if (defined $grouped_into_dirs_root) {
        # Move on to the next group only once the current one already holds
        # its full quota. Testing before counting this document means every
        # group, including the first, receives $max_docs_per_grouped_dir
        # documents.
        if ($grouped_into_dirs_doc_count > 0
            && ($grouped_into_dirs_doc_count % $max_docs_per_grouped_dir) == 0) {
            $grouped_into_dirs_group_count++;
        }
        $grouped_into_dirs_doc_count++;

        # The root is concatenated rather than embedded in the format
        # string, so a '%' in the root cannot be read as a directive
        $doc_dir = $grouped_into_dirs_root . sprintf("%04d", $grouped_into_dirs_group_count);
    }
    else {
        # A slimmed down version of BasePlugout::get_new_doc_dir()
        # which creates a flat file structure, rather than a nested one
        $doc_dir = $OID;

        # remove any \ and / from the OID
        $doc_dir =~ s/[\\\/]//g;

        # Remove ":" if we are on Windows OS, as otherwise they get
        # confused with the drive letters
        if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
            $doc_dir =~ s/\://g;
        }
    }

    my $doc_dir_name = $doc_dir . '.dir';
    my $full_doc_dir = FileUtils::filenameConcatenate($working_dir, $doc_dir_name);

    if (!FileUtils::directoryExists($full_doc_dir)) {
        FileUtils::makeAllDirectories($full_doc_dir);

        # Check the outcome by looking for the directory again, so that a
        # failed creation is actually reported
        if (!FileUtils::directoryExists($full_doc_dir)) {
            die("Error! Failed to create directory for document: $doc_dir\n");
        }
    }

    return $doc_dir_name;
}
163
164
165
# Returns the directory the current document (or group of documents) is
# written to.
#
# If this plugout is being used with grouped_into_dirs, then how
# get_group_doc_dir() needs to operate is different. In fact it is
# simpler than the super-class implementation, because (due to the
# prefix manipulation of gsdlassocfiles) it is safe for associated
# files to be saved in the same directory as other documents.
#
# Arguments:
#   $doc_obj - the document being output
#
# Side effects: updates $self->{'new_doc_dir'}, and, when a new output file
# is started, $self->{'gs_doc_dir'} and $self->{'group_position'}.
#
# Returns the document directory, or nothing if closing the previous
# group's output fails.
sub get_group_doc_dir {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $doc_dir = undef;

    if (defined $grouped_into_dirs_root) {

        my $groupsize = $self->{'group_size'};
        my $gs_count = $self->{'gs_count'};

        my $open_new_file = (($gs_count % $groupsize) == 0);

        # opening a new file
        if ($open_new_file || !defined($self->{'gs_doc_dir'})) {
            # first we close off the old output
            if ($gs_count > 0) {
                return if (!$self->close_group_output());
            }

            # this will create the directory
            $doc_dir = $self->get_doc_dir($doc_obj);
            $self->{'new_doc_dir'} = 1;
            $self->{'gs_doc_dir'} = $doc_dir;
            $self->{'group_position'} = 1;
        }
        else {
            # still within the current group: reuse its directory
            $doc_dir = $self->{'gs_doc_dir'};
            $self->{'new_doc_dir'} = 0;
        }
    }
    else {
        # Hand the document on to the inherited implementation; calling it
        # with no argument would leave it without a document to work on
        $doc_dir = $self->SUPER::get_group_doc_dir($doc_obj);
    }

    return $doc_dir;
}
216
217
218
219
# Rewrites the text of $section, and of every section below it, in place.
#
# For each section the stored text is:
#   1. updated so that "_httpdocimg_/" image prefixes become "<OID>-"
#   2. stripped of <style> elements and then of all remaining tags
#   3. given an empty <span fromGSDocId="<OID>"> marker after every ten
#      whitespace-separated words
#   4. wrapped, together with a leading and a trailing marker, in a
#      <div gsdocid="<OID>"> element
#
# Arguments:
#   $doc_obj - the document whose sections are rewritten
#   $section - identifier of the section to start from
#
# Does nothing when $doc_obj has no such section.
sub recursive_process_section_content
{
    my $self = shift (@_);
    my ($doc_obj, $section) = @_;

    my $section_ptr = $doc_obj->_lookup_section($section);
    return unless defined $section_ptr;

    my $oid = $doc_obj->get_OID();

    my $text = $section_ptr->{'text'};
    $text = '' unless defined $text;

    $text =~ s/_httpdocimg_\//$oid-/g;

    # Turn into text. Every <style> element goes first, contents included
    # (non-greedy, so only up to its own closing tag), otherwise the CSS
    # rules would survive the tag stripping as body text
    $text =~ s/<style\b[^>]*>.*?<\/style>//gsi;
    $text =~ s/<[^>]*>/ /g;
    # collapse a run of whitespace that ends in a space into one space
    $text =~ s/\s+ / /mg;

    my $gsdoc_marker = "<span fromGSDocId=\"$oid\"></span>";

    # Each counted word has to be followed by whitespace. This stops the
    # pattern from backtracking into one long word and counting pieces of
    # it as separate words, which would plant a marker mid-word
    $text =~ s/((?:\S+\s+){10})/$1$gsdoc_marker/g;

    $section_ptr->{'text'} = "<div gsdocid=\"$oid\">$gsdoc_marker$text$gsdoc_marker</div>";

    # work through all the sub-sections
    foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
        $self->recursive_process_section_content($doc_obj, "$section.$subsection");
    }
}
252
# Rewrites the text of every section of $doc_obj by starting
# recursive_process_section_content() at the document's top section.
sub process_content
{
    my ($self, $doc_obj) = @_;

    my $root_section = $doc_obj->get_top_section();

    $self->recursive_process_section_content($doc_obj, $root_section);
}
261
262
# Prefixes the stored name of every associated file with "<OID>-", rewrites
# the section text to match, and then lets the superclass do the actual
# processing of the associated files.
#
# Arguments:
#   $doc_obj - the document whose associated files are processed
#   $doc_dir - passed through to the superclass implementation
#   $handle  - passed through to the superclass implementation
sub process_assoc_files {
    my ($self, $doc_obj, $doc_dir, $handle) = @_;

    my $assoc_files = $doc_obj->get_assoc_files();
    my $doc_id = $doc_obj->get_OID();

    # Each record is [real filename, stored name, mime type, section];
    # only the stored name changes
    my @renamed_records;
    for my $record (@$assoc_files) {
        my @fields = @{$record}[0 .. 3];
        $fields[1] = "$doc_id-$fields[1]";
        push(@renamed_records, \@fields);
    }

    # Not the cleanest way to do this (it reaches into the document object
    # directly), but it gets the job done
    $doc_obj->{'associated_files'} = \@renamed_records;

    # The content needs processing as well, because links to associated
    # images have now changed
    $self->process_content($doc_obj);

    $self->SUPER::process_assoc_files($doc_obj, $doc_dir, $handle);
}
290
# Writes the start of an output file to $outhandle: the XML declaration
# followed by the opening <Archive> tag. No DOCTYPE or <html> element is
# written. $doc_oid is accepted but not used.
sub output_xml_header {
    my ($self, $outhandle, $doc_oid) = @_;

    print {$outhandle} qq{<?xml version="1.0" encoding="utf-8" standalone="no"?>\n};

    print {$outhandle} qq{<Archive>\n};
}
307
# Writes the end of an output file to $outhandle: the closing </Archive>
# tag that matches the one opened by output_xml_header().
sub output_xml_footer {
    my ($self, $outhandle) = @_;

    print {$outhandle} qq{</Archive>\n};
}
314
315
316
317	1;
318

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/plugouts/OpenAIGPTsHTMLPlugout.pm

Download in other formats: