source: main/trunk/greenstone2/perllib/docprint.pm@ 34220

Last change on this file since 34220 was 34220, checked in by ak19, 4 years ago
  1. TextPlugin takes care to preserve whitespace formatting when converting txt to html, by nesting text in pre tags. In a recent carefully tabspaced txt file, the final document produced by GS had lost all these tabs. It turns out that this was done to allow XMLParser not to choke on control chars. Have encoded tabs as entities as they're going into doc.xml and ultimately html context instead of tabs being destructively removed. 2. TextPlugin now skips opening punctuation too, not just spaces before setting the title meta to the first non-newline sequence of content.
  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.2 KB
Line 
1###########################################################################
2#
3# docprint.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This is used to output an XML representation of a doc_obj - this will be
27# Greenstone XML format.
28# This is used by GreenstoneXMLPlugout and doc.pm
29# It's also used by GreenstoneSQLPlugout to output a doc_obj's meta and/or
30# fulltxt to an SQL db, while subroutine unescape_text() is used by
31# GreenstoneSQLPlugin for unescaping txt read back in.
32
33package docprint;
34
35use constant OUTPUT_NONE => 0;
36use constant OUTPUT_META_ONLY => 1;
37use constant OUTPUT_TEXT_ONLY => 2;
38use constant OUTPUT_ALL => 3;
39
40use strict;
41
42sub get_section_xml {
43 return &get_section_xml_from_root(@_);
44}
45
46sub get_section_xml_from_root {
47 my ($doc_obj, $options) = @_;
48 return &recursive_get_section_xml($doc_obj, $doc_obj->get_top_section(), $options);
49}
50
51sub recursive_get_section_xml {
52 my ($doc_obj, $section, $options) = @_;
53
54 # 'output' can be OUTPUT_ALL|OUTPUT_META_ONLY|OUTPUT_TEXT_ONLY|OUTPUT_NONE
55 # If not provided, it defaults to OUTPUT_ALL.
56 # If OUTPUT_ALL, the metadata and full text both go into doc.xml
57 # If OUTPUT_META_ONLY, the metadata goes into doc.xml and full text goes elsewhere (mysql db).
58 # If OUTPUT_TEXT_ONLY, the full text goes into doc.xml and metadata goes elsewhere (mysql db).
59 # If OUTPUT_NONE, the full text and metadata goes elsewhere (mysql db)
60 # In the last 3 cases, an XML comment is left behind as a 'breadcrumb' to indicate
61 # that the "missing" doc information is stored elsewhere.
62 if(!defined $options) {
63 $options = { 'output' => OUTPUT_ALL };
64 }
65
66 my $section_ptr = $doc_obj->_lookup_section ($section);
67 return "" unless defined $section_ptr;
68
69 my $all_text = "<Section>\n";
70 $all_text .= " <Description>\n";
71
72 # scalar comparisons on a constant is allowed (but not string evaluation of scalars)
73 # https://www.perlmonks.org/?node_id=559456
74 if($options->{'output'} == OUTPUT_ALL || $options->{'output'} == OUTPUT_META_ONLY) {
75 # output metadata
76 foreach my $data (@{$section_ptr->{'metadata'}}) {
77 my $escaped_value = &escape_text($data->[1]);
78 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
79 }
80 } else {
81 $all_text .= "<!-- metadata is stored elsewhere (MySQL database) -->\n";
82 }
83
84 $all_text .= " </Description>\n";
85
86 # output the text
87 $all_text .= " <Content>";
88 if($options->{'output'} == OUTPUT_ALL || $options->{'output'} == OUTPUT_TEXT_ONLY) {
89 $all_text .= &escape_text($section_ptr->{'text'});
90 } else {
91 $all_text .= "<!-- full text is stored elsewhere (MySQL database) -->\n";
92 }
93 $all_text .= "</Content>\n";
94
95 # output all the subsections
96 foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
97 $all_text .= &recursive_get_section_xml($doc_obj, "$section.$subsection", $options);
98 }
99
100 $all_text .= "</Section>\n";
101
102 # make sure no nasty control characters have snuck through
103 # (XML::Parser will barf on anything it doesn't consider to be
104 # valid UTF-8 text, including things like \c@, \cC etc.)
105 # Will treat tab chars, \x09, as a special case right after this
106 $all_text =~ s/[\x00-\x08\x0B\x0C\x0E-\x1F]//g;
107
108 # $all_text gets written out into an xml context and represents the html version of a doc,
109 # allowing the use of html entities for the tab character (&#09;)
110 # Tabs (ASCII \x09) may be meaningful spacing in such cases whether the html emanated from a
111 # text file, original html or other doc. Particularly when tabs are nested in <pre> tags.
112 # Instead of removing tabs, replacing tabs with their entity reference will allow <pre> tags
113 # to continue preserving any tabs in the final html display.
114 # Hopefully with this, XML::Parser will not choke on tabs, and we get tab stop spaces preserved
115 # in the html output.
116 # This may be the best location to do this replacement and not in TextPlugin, because an html
117 # source doc may contain <pre> elements with tab stops, so then HTMLPlugin would have to do the
118 # replacement too.
119 $all_text =~ s/\x09/&#09;/g;
120
121 return $all_text;
122}
123
124# pass by ref version: gets a ref to a string and returns ref to the modified string
125# so use this for large strings (fulltext)
126sub escape_textref {
127 my ($textref) = @_;
128 # special characters in the xml encoding
129 $$textref =~ s/&&/& &/g;
130 $$textref =~ s/&/&amp;/g; # this has to be first...
131 $$textref =~ s/</&lt;/g;
132 $$textref =~ s/>/&gt;/g;
133 $$textref =~ s/\"/&quot;/g;
134
135 return $textref;
136}
137
138# pass by value - the behaviour of the escape_text method used so far is to pass copies of strings
139sub escape_text {
140 my ($text) = @_;
141 my $textref = &escape_textref(\$text);
142 return $$textref;
143}
144
145# Unescape variants are used by GreenstoneSQLPlugin when reading text back from SQL db
146# Pass by ref version
147sub unescape_textref {
148 my ($textref) = @_;
149 # special characters in the xml encoding
150 $$textref =~ s/& &/&&/g;
151 $$textref =~ s/&amp;/&/g; # this has to be first...
152 $$textref =~ s/&lt;/</g;
153 $$textref =~ s/&gt;/>/g;
154 $$textref =~ s/&quot;/"/g;
155
156 return $textref;
157}
158
159# Pass by value version
160sub unescape_text {
161 my ($text) = @_;
162 my $textref = &unescape_textref(\$text);
163 return $$textref;
164}
165
1661;
Note: See TracBrowser for help on using the repository browser.