source: main/trunk/greenstone2/perllib/docprint.pm

Last change on this file was 34221, checked in by ak19, 4 years ago

Undid the change of converting tabstops to their entities in docprint.pm (which has gone back to removing them now) and moved this conversion into TextPlugin.pm after all. In case this has an unforeseen effect, wanted to break as little as possible. Also, only want pre tags to preserve tabs and other html can be cleaned of this. TextPlugin definitely adds pre tags when converting txt to html, so it makes sense to always preserve tabstops there, whereas it doesn't make sense to assume the same need in all cases where html is produced as they may not contain pre tags.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.3 KB
Line 
1###########################################################################
2#
3# docprint.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This is used to output an XML representation of a doc_obj - this will be
27# Greenstone XML format.
28# This is used by GreenstoneXMLPlugout and doc.pm
29# It's also used by GreenstoneSQLPlugout to output a doc_obj's meta and/or
30# fulltxt to an SQL db, while subroutine unescape_text() is used by
31# GreenstoneSQLPlugin for unescaping txt read back in.
32
33package docprint;
34
35use constant OUTPUT_NONE => 0;
36use constant OUTPUT_META_ONLY => 1;
37use constant OUTPUT_TEXT_ONLY => 2;
38use constant OUTPUT_ALL => 3;
39
40use strict;
41
42sub get_section_xml {
43 return &get_section_xml_from_root(@_);
44}
45
46sub get_section_xml_from_root {
47 my ($doc_obj, $options) = @_;
48 return &recursive_get_section_xml($doc_obj, $doc_obj->get_top_section(), $options);
49}
50
51sub recursive_get_section_xml {
52 my ($doc_obj, $section, $options) = @_;
53
54 # 'output' can be OUTPUT_ALL|OUTPUT_META_ONLY|OUTPUT_TEXT_ONLY|OUTPUT_NONE
55 # If not provided, it defaults to OUTPUT_ALL.
56 # If OUTPUT_ALL, the metadata and full text both go into doc.xml
57 # If OUTPUT_META_ONLY, the metadata goes into doc.xml and full text goes elsewhere (mysql db).
58 # If OUTPUT_TEXT_ONLY, the full text goes into doc.xml and metadata goes elsewhere (mysql db).
59 # If OUTPUT_NONE, the full text and metadata goes elsewhere (mysql db)
60 # In the last 3 cases, an XML comment is left behind as a 'breadcrumb' to indicate
61 # that the "missing" doc information is stored elsewhere.
62 if(!defined $options) {
63 $options = { 'output' => OUTPUT_ALL };
64 }
65
66 my $section_ptr = $doc_obj->_lookup_section ($section);
67 return "" unless defined $section_ptr;
68
69 my $all_text = "<Section>\n";
70 $all_text .= " <Description>\n";
71
72 # scalar comparisons on a constant is allowed (but not string evaluation of scalars)
73 # https://www.perlmonks.org/?node_id=559456
74 if($options->{'output'} == OUTPUT_ALL || $options->{'output'} == OUTPUT_META_ONLY) {
75 # output metadata
76 foreach my $data (@{$section_ptr->{'metadata'}}) {
77 my $escaped_value = &escape_text($data->[1]);
78 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
79 }
80 } else {
81 $all_text .= "<!-- metadata is stored elsewhere (MySQL database) -->\n";
82 }
83
84 $all_text .= " </Description>\n";
85
86 # output the text
87 $all_text .= " <Content>";
88 if($options->{'output'} == OUTPUT_ALL || $options->{'output'} == OUTPUT_TEXT_ONLY) {
89 $all_text .= &escape_text($section_ptr->{'text'});
90 } else {
91 $all_text .= "<!-- full text is stored elsewhere (MySQL database) -->\n";
92 }
93 $all_text .= "</Content>\n";
94
95 # output all the subsections
96 foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
97 $all_text .= &recursive_get_section_xml($doc_obj, "$section.$subsection", $options);
98 }
99
100 $all_text .= "</Section>\n";
101
102 # make sure no nasty control characters have snuck through
103 # (XML::Parser will barf on anything it doesn't consider to be
104 # valid UTF-8 text, including things like \c@, \cC etc.)
105 # and the tab character too (x09)
106
107 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
108
109 return $all_text;
110}
111
112# pass by ref version: gets a ref to a string and returns ref to the modified string
113# so use this for large strings (fulltext)
114sub escape_textref {
115 my ($textref) = @_;
116 # special characters in the xml encoding
117 $$textref =~ s/&&/& &/g;
118 $$textref =~ s/&/&amp;/g; # this has to be first...
119 $$textref =~ s/</&lt;/g;
120 $$textref =~ s/>/&gt;/g;
121 $$textref =~ s/\"/&quot;/g;
122
123 return $textref;
124}
125
126# pass by value - the behaviour of the escape_text method used so far is to pass copies of strings
127sub escape_text {
128 my ($text) = @_;
129 my $textref = &escape_textref(\$text);
130 return $$textref;
131}
132
133# Unescape variants are used by GreenstoneSQLPlugin when reading text back from SQL db
134# Pass by ref version
135sub unescape_textref {
136 my ($textref) = @_;
137 # special characters in the xml encoding
138 $$textref =~ s/& &/&&/g;
139 $$textref =~ s/&amp;/&/g; # this has to be first...
140 $$textref =~ s/&lt;/</g;
141 $$textref =~ s/&gt;/>/g;
142 $$textref =~ s/&quot;/"/g;
143
144 return $textref;
145}
146
147# Pass by value version
148sub unescape_text {
149 my ($text) = @_;
150 my $textref = &unescape_textref(\$text);
151 return $$textref;
152}
153
1541;
Note: See TracBrowser for help on using the repository browser.