source: main/trunk/greenstone2/perllib/docprint.pm@ 32527

Last change on this file since 32527 was 32518, checked in by ak19, 6 years ago

Untested first attempt at MySQLPlugout that is meant to write metadata and/or fulltext to a mysql db rather than to doc.xml. We're switching over to using perl mysql libs, dbd (mysql) for drivers and dbi (mysql), for db access. However, I still want to commit this initial attempt at manually doing database operations in perl.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 4.1 KB
RevLine 
[782]1###########################################################################
2#
[13170]3# docprint.pm --
[782]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
[13170]8# Copyright (C) 2006 New Zealand Digital Library Project
[782]9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[13170]26# This is used to output an XML representation of a doc_obj - this will be
[17747]27# Greenstone XML format.
28# This is used by GreenstoneXMLPlugout and doc.pm
[782]29
30package docprint;
31
[32514]32use constant OUTPUT_NONE => 0;
[32512]33use constant OUTPUT_META_ONLY => 1;
34use constant OUTPUT_TEXT_ONLY => 2;
35use constant OUTPUT_ALL => 3;
36
[13170]37use strict;
[782]38
[13170]39sub get_section_xml {
[32512]40 return &get_section_xml_from_root(@_);
41}
42
43sub get_section_xml_from_root {
44 my ($doc_obj, $options) = @_;
[32514]45 return &recursive_get_section_xml($doc_obj, $doc_obj->get_top_section(), $options);
[32512]46}
47
48sub recursive_get_section_xml {
49 my ($doc_obj, $section, $options) = @_;
[13170]50
[32514]51 # 'output' can be OUTPUT_ALL|OUTPUT_META_ONLY|OUTPUT_TEXT_ONLY|OUTPUT_NONE
[32512]52 # If not provided, it defaults to OUTPUT_ALL.
53 # If OUTPUT_ALL, the metadata and full text both go into doc.xml
54 # If OUTPUT_META_ONLY, the metadata goes into doc.xml and full text goes elsewhere (mysql db).
55 # If OUTPUT_TEXT_ONLY, the full text goes into doc.xml and metadata goes elsewhere (mysql db).
[32514]56 # If OUTPUT_NONE, the full text and metadata goes elsewhere (mysql db)
57 # In the last 3 cases, an XML comment is left behind as a 'breadcrumb' to indicate
58 # that the "missing" doc information is stored elsewhere.
[32512]59 if(!defined $options) {
[32514]60 $options = { 'output' => OUTPUT_ALL };
[32512]61 }
62
[13170]63 my $section_ptr = $doc_obj->_lookup_section ($section);
64 return "" unless defined $section_ptr;
[782]65
[13170]66 my $all_text = "<Section>\n";
67 $all_text .= " <Description>\n";
[32512]68
[32518]69 # scalar comparisons on a constant is allowed (but not string evaluation of scalars)
70 # https://www.perlmonks.org/?node_id=559456
[32512]71 if($options->{'output'} == OUTPUT_ALL || $options->{'output'} == OUTPUT_META_ONLY) {
72 # output metadata
73 foreach my $data (@{$section_ptr->{'metadata'}}) {
74 my $escaped_value = &escape_text($data->[1]);
75 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
76 }
77 } else {
78 $all_text .= "<!-- metadata is stored elsewhere (MySQL database) -->\n";
[13170]79 }
[782]80
[13170]81 $all_text .= " </Description>\n";
[782]82
[13170]83 # output the text
84 $all_text .= " <Content>";
[32512]85 if($options->{'output'} == OUTPUT_ALL || $options->{'output'} == OUTPUT_TEXT_ONLY) {
86 $all_text .= &escape_text($section_ptr->{'text'});
87 } else {
88 $all_text .= "<!-- full text is stored elsewhere (MySQL database) -->\n";
89 }
[13170]90 $all_text .= "</Content>\n";
91
92 # output all the subsections
93 foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
[32512]94 $all_text .= &recursive_get_section_xml($doc_obj, "$section.$subsection");
[782]95 }
96
[13170]97 $all_text .= "</Section>\n";
98
99 # make sure no nasty control characters have snuck through
100 # (XML::Parser will barf on anything it doesn't consider to be
101 # valid UTF-8 text, including things like \c@, \cC etc.)
102 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
103
104 return $all_text;
[782]105}
106
[13170]107sub escape_text {
108 my ($text) = @_;
109 # special characters in the xml encoding
[19214]110 $text =~ s/&&/& &/g;
[13170]111 $text =~ s/&/&amp;/g; # this has to be first...
112 $text =~ s/</&lt;/g;
113 $text =~ s/>/&gt;/g;
114 $text =~ s/\"/&quot;/g;
[782]115
[13170]116 return $text;
117}
118
[782]1191;
Note: See TracBrowser for help on using the repository browser.