source: main/trunk/greenstone2/perllib/docprint.pm@ 32575

Last change on this file since 32575 was 32575, checked in by ak19, 5 years ago
  1. gssql now does fetching all rows internally upon select. With this the statement and database handles have been hidden away in the gssql.pm class. Hopefully this makes the GreenstoneSQLPlugin and GreenstoneSQLPlugout code easier to read and follow. 2. new method docprint::unescape_textref() takes a textref and returns a ref to the modified text. This method is now used internally by the older docprint::unescape_text() variant of the method. unescape_textref(), like the recently added escape_textref(), should hopefully do what I think it does. Then it can be used to pass large strings, like fulltext in particular, by ref instead of value.
  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.2 KB
Line 
1###########################################################################
2#
3# docprint.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This is used to output an XML representation of a doc_obj - this will be
27# Greenstone XML format.
28# This is used by GreenstoneXMLPlugout and doc.pm
29# It's also used by GreenstoneSQLPlugout to output a doc_obj's meta and/or
30# fulltxt to an SQL db, while subroutine unescape_text() is used by
31# GreenstoneSQLPlugin for unescaping txt read back in.
32
33package docprint;
34
35use constant OUTPUT_NONE => 0;
36use constant OUTPUT_META_ONLY => 1;
37use constant OUTPUT_TEXT_ONLY => 2;
38use constant OUTPUT_ALL => 3;
39
40use strict;
41
42sub get_section_xml {
43 return &get_section_xml_from_root(@_);
44}
45
46sub get_section_xml_from_root {
47 my ($doc_obj, $options) = @_;
48 return &recursive_get_section_xml($doc_obj, $doc_obj->get_top_section(), $options);
49}
50
51sub recursive_get_section_xml {
52 my ($doc_obj, $section, $options) = @_;
53
54 # 'output' can be OUTPUT_ALL|OUTPUT_META_ONLY|OUTPUT_TEXT_ONLY|OUTPUT_NONE
55 # If not provided, it defaults to OUTPUT_ALL.
56 # If OUTPUT_ALL, the metadata and full text both go into doc.xml
57 # If OUTPUT_META_ONLY, the metadata goes into doc.xml and full text goes elsewhere (mysql db).
58 # If OUTPUT_TEXT_ONLY, the full text goes into doc.xml and metadata goes elsewhere (mysql db).
59 # If OUTPUT_NONE, the full text and metadata goes elsewhere (mysql db)
60 # In the last 3 cases, an XML comment is left behind as a 'breadcrumb' to indicate
61 # that the "missing" doc information is stored elsewhere.
62 if(!defined $options) {
63 $options = { 'output' => OUTPUT_ALL };
64 }
65
66 my $section_ptr = $doc_obj->_lookup_section ($section);
67 return "" unless defined $section_ptr;
68
69 my $all_text = "<Section>\n";
70 $all_text .= " <Description>\n";
71
72 # scalar comparisons on a constant is allowed (but not string evaluation of scalars)
73 # https://www.perlmonks.org/?node_id=559456
74 if($options->{'output'} == OUTPUT_ALL || $options->{'output'} == OUTPUT_META_ONLY) {
75 # output metadata
76 foreach my $data (@{$section_ptr->{'metadata'}}) {
77 my $escaped_value = &escape_text($data->[1]);
78 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
79 }
80 } else {
81 $all_text .= "<!-- metadata is stored elsewhere (MySQL database) -->\n";
82 }
83
84 $all_text .= " </Description>\n";
85
86 # output the text
87 $all_text .= " <Content>";
88 if($options->{'output'} == OUTPUT_ALL || $options->{'output'} == OUTPUT_TEXT_ONLY) {
89 $all_text .= &escape_text($section_ptr->{'text'});
90 } else {
91 $all_text .= "<!-- full text is stored elsewhere (MySQL database) -->\n";
92 }
93 $all_text .= "</Content>\n";
94
95 # output all the subsections
96 foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
97 $all_text .= &recursive_get_section_xml($doc_obj, "$section.$subsection", $options);
98 }
99
100 $all_text .= "</Section>\n";
101
102 # make sure no nasty control characters have snuck through
103 # (XML::Parser will barf on anything it doesn't consider to be
104 # valid UTF-8 text, including things like \c@, \cC etc.)
105 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
106
107 return $all_text;
108}
109
110# pass by ref version: gets a ref to a string and returns ref to the modified string
111# so use this for large strings (fulltext)
112sub escape_textref {
113 my ($textref) = @_;
114 # special characters in the xml encoding
115 $$textref =~ s/&&/& &/g;
116 $$textref =~ s/&/&amp;/g; # this has to be first...
117 $$textref =~ s/</&lt;/g;
118 $$textref =~ s/>/&gt;/g;
119 $$textref =~ s/\"/&quot;/g;
120
121 return $textref;
122}
123
124# pass by value - the behaviour of the escape_text method used so far is to pass copies of strings
125sub escape_text {
126 my ($text) = @_;
127 my $textref = &escape_textref(\$text);
128 return $$textref;
129}
130
131# Unescape variants are used by GreenstoneSQLPlugin when reading text back from SQL db
132# Pass by ref version
133sub unescape_textref {
134 my ($textref) = @_;
135 # special characters in the xml encoding
136 $$textref =~ s/& &/&&/g;
137 $$textref =~ s/&amp;/&/g; # this has to be first...
138 $$textref =~ s/&lt;/</g;
139 $$textref =~ s/&gt;/>/g;
140 $$textref =~ s/&quot;/"/g;
141
142 return $textref;
143}
144
145# Pass by value version
146sub unescape_text {
147 my ($text) = @_;
148 my $textref = &unescape_textref(\$text);
149 return $$textref;
150}
151
1521;
Note: See TracBrowser for help on using the repository browser.