source: main/trunk/greenstone2/perllib/docprint.pm@ 32536

Last change on this file since 32536 was 32536, checked in by ak19, 6 years ago

First commit to do with reading back in from the SQL DB. This commit introduces the new GreenstoneSQLPlugin for this purpose, which should ideally only be used during buildcol (but its init(), deinit() and read() methods are also called on import.pl). The new plugin works with GreenstoneSQLPlugout which wrote meta and txt to the SQL DB. Lots of TODOs and questions still here, some debug statements too. Also have to run some decisions by Dr Bainbridge. There are many hardcoded values which still have to be parameterised (not always completely sure how) and still have to test the 2 cases of sending just meta and just fulltxt to db. Next commit will tidy some things up.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 4.7 KB
Line 
1###########################################################################
2#
3# docprint.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This is used to output an XML representation of a doc_obj - this will be
27# Greenstone XML format.
28# This is used by GreenstoneXMLPlugout and doc.pm
29# It's also used by GreenstoneSQLPlugout to output a doc_obj's meta and/or
30# fulltxt to an SQL db, while subroutine unescape_text() is used by
31# GreenstoneSQLPlugin for unescaping txt read back in.
32
33package docprint;
34
35use constant OUTPUT_NONE => 0;
36use constant OUTPUT_META_ONLY => 1;
37use constant OUTPUT_TEXT_ONLY => 2;
38use constant OUTPUT_ALL => 3;
39
40use strict;
41
42sub get_section_xml {
43 return &get_section_xml_from_root(@_);
44}
45
46sub get_section_xml_from_root {
47 my ($doc_obj, $options) = @_;
48 return &recursive_get_section_xml($doc_obj, $doc_obj->get_top_section(), $options);
49}
50
51sub recursive_get_section_xml {
52 my ($doc_obj, $section, $options) = @_;
53
54 # 'output' can be OUTPUT_ALL|OUTPUT_META_ONLY|OUTPUT_TEXT_ONLY|OUTPUT_NONE
55 # If not provided, it defaults to OUTPUT_ALL.
56 # If OUTPUT_ALL, the metadata and full text both go into doc.xml
57 # If OUTPUT_META_ONLY, the metadata goes into doc.xml and full text goes elsewhere (mysql db).
58 # If OUTPUT_TEXT_ONLY, the full text goes into doc.xml and metadata goes elsewhere (mysql db).
59 # If OUTPUT_NONE, the full text and metadata goes elsewhere (mysql db)
60 # In the last 3 cases, an XML comment is left behind as a 'breadcrumb' to indicate
61 # that the "missing" doc information is stored elsewhere.
62 if(!defined $options) {
63 $options = { 'output' => OUTPUT_ALL };
64 }
65
66 my $section_ptr = $doc_obj->_lookup_section ($section);
67 return "" unless defined $section_ptr;
68
69 my $all_text = "<Section>\n";
70 $all_text .= " <Description>\n";
71
72 # scalar comparisons on a constant is allowed (but not string evaluation of scalars)
73 # https://www.perlmonks.org/?node_id=559456
74 if($options->{'output'} == OUTPUT_ALL || $options->{'output'} == OUTPUT_META_ONLY) {
75 # output metadata
76 foreach my $data (@{$section_ptr->{'metadata'}}) {
77 my $escaped_value = &escape_text($data->[1]);
78 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
79 }
80 } else {
81 $all_text .= "<!-- metadata is stored elsewhere (MySQL database) -->\n";
82 }
83
84 $all_text .= " </Description>\n";
85
86 # output the text
87 $all_text .= " <Content>";
88 if($options->{'output'} == OUTPUT_ALL || $options->{'output'} == OUTPUT_TEXT_ONLY) {
89 $all_text .= &escape_text($section_ptr->{'text'});
90 } else {
91 $all_text .= "<!-- full text is stored elsewhere (MySQL database) -->\n";
92 }
93 $all_text .= "</Content>\n";
94
95 # output all the subsections
96 foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
97 $all_text .= &recursive_get_section_xml($doc_obj, "$section.$subsection", $options);
98 }
99
100 $all_text .= "</Section>\n";
101
102 # make sure no nasty control characters have snuck through
103 # (XML::Parser will barf on anything it doesn't consider to be
104 # valid UTF-8 text, including things like \c@, \cC etc.)
105 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
106
107 return $all_text;
108}
109
110sub escape_text {
111 my ($text) = @_;
112 # special characters in the xml encoding
113 $text =~ s/&&/& &/g;
114 $text =~ s/&/&amp;/g; # this has to be first...
115 $text =~ s/</&lt;/g;
116 $text =~ s/>/&gt;/g;
117 $text =~ s/\"/&quot;/g;
118
119 return $text;
120}
121
122# used by GreenstoneSQLPlugin when reading back from sqldb
123sub unescape_text {
124 my ($text) = @_;
125 # special characters in the xml encoding
126 $text =~ s/& &/&&/g;
127 $text =~ s/&amp;/&/g; # this has to be first...
128 $text =~ s/&lt;/</g;
129 $text =~ s/&gt;/>/g;
130 $text =~ s/&quot;/"/g;
131
132 return $text;
133}
134
1351;
Note: See TracBrowser for help on using the repository browser.