###########################################################################
#
# docprint.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 2006 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# This is used to output an XML representation of a doc_obj - this will be
# Greenstone XML format.
# This is used by GreenstoneXMLPlugout and doc.pm

package docprint;

use strict;

# Values accepted for $options->{'output'}; they select which parts of a
# section are written inline into the generated XML.
use constant OUTPUT_NONE      => 0;
use constant OUTPUT_META_ONLY => 1;
use constant OUTPUT_TEXT_ONLY => 2;
use constant OUTPUT_ALL       => 3;

# get_section_xml($doc_obj, $options)
#
# Public entry point. Forwards its arguments unchanged to
# get_section_xml_from_root and returns that sub's result.
sub get_section_xml {
    return get_section_xml_from_root(@_);
}

# get_section_xml_from_root($doc_obj, $options)
#
# Serialises the whole document, starting at the section returned by
# $doc_obj->get_top_section().
#
#   $doc_obj - object providing get_top_section() and _lookup_section()
#   $options - optional hash ref; see recursive_get_section_xml
#
# Returns the XML text as a string.
sub get_section_xml_from_root {
    my ($doc_obj, $options) = @_;

    return recursive_get_section_xml($doc_obj, $doc_obj->get_top_section(),
                                     $options);
}

# recursive_get_section_xml($doc_obj, $section, $options)
#
# Builds the XML for $section and, recursively, for every subsection named
# in its 'subsection_order' list.
#
#   $doc_obj - object providing _lookup_section()
#   $section - identifier of the section to serialise
#   $options - optional hash ref; only the 'output' key is read here
#
# Returns the XML text, or "" when _lookup_section() yields undef.
sub recursive_get_section_xml {
    my ($doc_obj, $section, $options) = @_;

    # 'output' can be OUTPUT_ALL|OUTPUT_META_ONLY|OUTPUT_TEXT_ONLY|OUTPUT_NONE
    # If not provided, it defaults to OUTPUT_ALL.
    # If OUTPUT_ALL, the metadata and full text both go into doc.xml
    # If OUTPUT_META_ONLY, the metadata goes into doc.xml and full text goes
    #   elsewhere (mysql db).
    # If OUTPUT_TEXT_ONLY, the full text goes into doc.xml and metadata goes
    #   elsewhere (mysql db).
    # If OUTPUT_NONE, the full text and metadata goes elsewhere (mysql db)
    # In the last 3 cases, an XML comment is left behind as a 'breadcrumb' to
    # indicate that the "missing" doc information is stored elsewhere.
    if (!defined $options) {
        $options = { 'output' => OUTPUT_ALL };
    }

    my $section_ptr = $doc_obj->_lookup_section($section);
    return "" unless defined $section_ptr;

    my $all_text = "<Section>\n";
    $all_text .= "  <Description>\n";

    if (   $options->{'output'} == OUTPUT_ALL
        || $options->{'output'} == OUTPUT_META_ONLY) {
        # output metadata: each entry is a [name, value] pair; only the
        # value is escaped
        foreach my $data (@{ $section_ptr->{'metadata'} }) {
            my $escaped_value = escape_text($data->[1]);
            $all_text .= '    <Metadata name="' . $data->[0] . '">'
                       . $escaped_value . "</Metadata>\n";
        }
    }
    else {
        # NOTE(review): breadcrumb wording reconstructed - confirm against
        # whatever reads these files back.
        $all_text .= "<!-- metadata is stored elsewhere (MySQL database) -->\n";
    }

    $all_text .= "  </Description>\n";

    # output the text
    $all_text .= "  <Content>";
    if (   $options->{'output'} == OUTPUT_ALL
        || $options->{'output'} == OUTPUT_TEXT_ONLY) {
        $all_text .= escape_text($section_ptr->{'text'});
    }
    else {
        # NOTE(review): breadcrumb wording reconstructed - confirm.
        $all_text .= "<!-- full text is stored elsewhere (MySQL database) -->\n";
    }
    $all_text .= "</Content>\n";

    # output all the subsections; $options must be handed down, otherwise
    # every subsection silently reverts to OUTPUT_ALL
    foreach my $subsection (@{ $section_ptr->{'subsection_order'} }) {
        $all_text .= recursive_get_section_xml($doc_obj,
                                               "$section.$subsection",
                                               $options);
    }

    $all_text .= "</Section>\n";

    # make sure no nasty control characters have snuck through
    # (XML::Parser will barf on anything it doesn't consider to be
    # valid UTF-8 text, including things like \c@, \cC etc.)
    # Tab (\x09 is removed too), \x0A and \x0D handling is as per the class:
    # only \x0A and \x0D survive below \x20.
    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;

    return $all_text;
}

# escape_text($text)
#
# Returns a copy of $text with the XML special characters &, <, > and "
# replaced by their predefined entities.
sub escape_text {
    my ($text) = @_;

    # special characters in the xml encoding
    $text =~ s/&&/& &/g;
    $text =~ s/&/&amp;/g;    # this has to be first...
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/\"/&quot;/g;

    return $text;
}

1;