root/main/trunk/greenstone2/perllib/docprint.pm @ 32532

Revision 32532, 4.2 KB (checked in by ak19, 21 months ago)

Correction to changes: recursive call in docprint should still be passing around options map. Otherwise, when using GS SQL Plugout, we end up with meta and text of subsections always still being written out to doc.xml

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# docprint.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This is used to output an XML representation of a doc_obj - this will be
27# Greenstone XML format.
28# This is used by GreenstoneXMLPlugout and doc.pm
29
30package docprint;
31
32use constant OUTPUT_NONE => 0;
33use constant OUTPUT_META_ONLY => 1;
34use constant OUTPUT_TEXT_ONLY => 2;
35use constant OUTPUT_ALL => 3;
36
37use strict;
38
39sub get_section_xml {
40    return &get_section_xml_from_root(@_);
41}
42
43sub get_section_xml_from_root {
44    my ($doc_obj, $options) = @_;
45    return &recursive_get_section_xml($doc_obj, $doc_obj->get_top_section(), $options);
46}
47
48sub recursive_get_section_xml {   
49    my ($doc_obj, $section, $options) = @_;
50   
51    # 'output' can be OUTPUT_ALL|OUTPUT_META_ONLY|OUTPUT_TEXT_ONLY|OUTPUT_NONE
52    # If not provided, it defaults to OUTPUT_ALL.
53    # If OUTPUT_ALL, the metadata and full text both go into doc.xml
54    # If OUTPUT_META_ONLY, the metadata goes into doc.xml and full text goes elsewhere (mysql db).
55    # If OUTPUT_TEXT_ONLY, the full text goes into doc.xml and metadata goes elsewhere (mysql db).
56    # If OUTPUT_NONE, the full text and metadata goes elsewhere (mysql db)
57    # In the last 3 cases, an XML comment is left behind as a 'breadcrumb' to indicate
58    # that the "missing" doc information is stored elsewhere.
59    if(!defined $options) {
60    $options = { 'output' => OUTPUT_ALL };
61    }
62   
63    my $section_ptr = $doc_obj->_lookup_section ($section);
64    return "" unless defined $section_ptr;
65
66    my $all_text = "<Section>\n";
67    $all_text .= "  <Description>\n";
68
69    # scalar comparisons on a constant is allowed (but not string evaluation of scalars)
70    # https://www.perlmonks.org/?node_id=559456
71    if($options->{'output'} == OUTPUT_ALL || $options->{'output'} == OUTPUT_META_ONLY) {
72    # output metadata
73    foreach my $data (@{$section_ptr->{'metadata'}}) {
74        my $escaped_value = &escape_text($data->[1]);
75        $all_text .= '    <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
76    }
77    } else {
78    $all_text .= "<!-- metadata is stored elsewhere (MySQL database) -->\n";
79    }
80
81    $all_text .= "  </Description>\n";
82
83    # output the text
84    $all_text .= "  <Content>";
85    if($options->{'output'} == OUTPUT_ALL || $options->{'output'} == OUTPUT_TEXT_ONLY) {
86    $all_text .= &escape_text($section_ptr->{'text'});
87    } else {
88    $all_text .= "<!-- full text is stored elsewhere (MySQL database) -->\n";
89    }
90    $all_text .= "</Content>\n";
91   
92    # output all the subsections
93    foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
94    $all_text .= &recursive_get_section_xml($doc_obj, "$section.$subsection", $options);
95    }
96   
97    $all_text .=  "</Section>\n";
98
99    # make sure no nasty control characters have snuck through
100    # (XML::Parser will barf on anything it doesn't consider to be
101    # valid UTF-8 text, including things like \c@, \cC etc.)
102    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
103
104    return $all_text;
105}
106
107sub escape_text {
108    my ($text) = @_;
109    # special characters in the xml encoding
110    $text =~ s/&&/& &/g;
111    $text =~ s/&/&amp;/g; # this has to be first...
112    $text =~ s/</&lt;/g;
113    $text =~ s/>/&gt;/g;
114    $text =~ s/\"/&quot;/g;
115
116    return $text;
117}
118
1191;
Note: See TracBrowser for help on using the browser.