root/main/trunk/greenstone2/perllib/docprint.pm @ 32573

Revision 32573, 5.0 KB (checked in by ak19, 20 months ago)

1. Caching the prepared SQL insert statements inside the gssql class rather than in the GS SQL Plugout. 2. docprint.pm provides an escape_textref function besides the existing escape_text function, to deal with large text (e.g. fulltxt). The old escape_text function has been rewritten to internally use the escape_textref function.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# docprint.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This is used to output an XML representation of a doc_obj - this will be
27# Greenstone XML format.
28# This is used by GreenstoneXMLPlugout and doc.pm
29# It's also used by GreenstoneSQLPlugout to output a doc_obj's meta and/or
30# fulltxt to an SQL db, while subroutine  unescape_text() is used by
31# GreenstoneSQLPlugin for unescaping txt read back in.
32
33package docprint;
34
35use constant OUTPUT_NONE => 0;
36use constant OUTPUT_META_ONLY => 1;
37use constant OUTPUT_TEXT_ONLY => 2;
38use constant OUTPUT_ALL => 3;
39
40use strict;
41
42sub get_section_xml {
43    return &get_section_xml_from_root(@_);
44}
45
46sub get_section_xml_from_root {
47    my ($doc_obj, $options) = @_;
48    return &recursive_get_section_xml($doc_obj, $doc_obj->get_top_section(), $options);
49}
50
51sub recursive_get_section_xml {   
52    my ($doc_obj, $section, $options) = @_;
53   
54    # 'output' can be OUTPUT_ALL|OUTPUT_META_ONLY|OUTPUT_TEXT_ONLY|OUTPUT_NONE
55    # If not provided, it defaults to OUTPUT_ALL.
56    # If OUTPUT_ALL, the metadata and full text both go into doc.xml
57    # If OUTPUT_META_ONLY, the metadata goes into doc.xml and full text goes elsewhere (mysql db).
58    # If OUTPUT_TEXT_ONLY, the full text goes into doc.xml and metadata goes elsewhere (mysql db).
59    # If OUTPUT_NONE, the full text and metadata goes elsewhere (mysql db)
60    # In the last 3 cases, an XML comment is left behind as a 'breadcrumb' to indicate
61    # that the "missing" doc information is stored elsewhere.
62    if(!defined $options) {
63    $options = { 'output' => OUTPUT_ALL };
64    }
65   
66    my $section_ptr = $doc_obj->_lookup_section ($section);
67    return "" unless defined $section_ptr;
68
69    my $all_text = "<Section>\n";
70    $all_text .= "  <Description>\n";
71
72    # scalar comparisons on a constant is allowed (but not string evaluation of scalars)
73    # https://www.perlmonks.org/?node_id=559456
74    if($options->{'output'} == OUTPUT_ALL || $options->{'output'} == OUTPUT_META_ONLY) {
75    # output metadata
76    foreach my $data (@{$section_ptr->{'metadata'}}) {
77        my $escaped_value = &escape_text($data->[1]);
78        $all_text .= '    <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
79    }
80    } else {
81    $all_text .= "<!-- metadata is stored elsewhere (MySQL database) -->\n";
82    }
83
84    $all_text .= "  </Description>\n";
85
86    # output the text
87    $all_text .= "  <Content>";
88    if($options->{'output'} == OUTPUT_ALL || $options->{'output'} == OUTPUT_TEXT_ONLY) {
89    $all_text .= &escape_text($section_ptr->{'text'});
90    } else {
91    $all_text .= "<!-- full text is stored elsewhere (MySQL database) -->\n";
92    }
93    $all_text .= "</Content>\n";
94   
95    # output all the subsections
96    foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
97    $all_text .= &recursive_get_section_xml($doc_obj, "$section.$subsection", $options);
98    }
99   
100    $all_text .=  "</Section>\n";
101
102    # make sure no nasty control characters have snuck through
103    # (XML::Parser will barf on anything it doesn't consider to be
104    # valid UTF-8 text, including things like \c@, \cC etc.)
105    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
106
107    return $all_text;
108}
109
110# pass by ref version: gets a ref to a string and returns ref to the modified string
111# so use this for large strings (fulltext)
112sub escape_textref {
113    my ($textref) = @_;
114    # special characters in the xml encoding
115    $$textref =~ s/&&/& &/g;
116    $$textref =~ s/&/&amp;/g; # this has to be first...
117    $$textref =~ s/</&lt;/g;
118    $$textref =~ s/>/&gt;/g;
119    $$textref =~ s/\"/&quot;/g;
120
121    return $textref;
122}
123
124# pass by value - the behaviour of the escape_text method used so far is to pass copies of strings
125sub escape_text {
126    my ($text) = @_;
127    my $textref = &escape_textref(\$text);
128    return $$textref;
129}
130
131# used by GreenstoneSQLPlugin when reading back from sqldb
132sub unescape_text {
133    my ($text) = @_;
134    # special characters in the xml encoding
135    $text =~ s/& &/&&/g;
136    $text =~ s/&amp;/&/g; # this has to be first...
137    $text =~ s/&lt;/</g;
138    $text =~ s/&gt;/>/g;
139    $text =~ s/&quot;/"/g;
140
141    return $text;
142}
143
1441;
Note: See TracBrowser for help on using the browser.