source: gs2-extensions/parallel-building/trunk/src/perllib/dbutil/stdoutxml.pm@ 27915

Last change on this file since 27915 was 27915, checked in by jmt12, 11 years ago

A new PlugOut that doesn't write any intermediate files (bar those directly associated with a document) and instead sends them to STDOUT in XML form (and InfoDBEntry elements) allowing proper parsing up in the Reduce phase in Hadoop.

File size: 5.4 KB
Line 
1###############################################################################
2#
3# dbutil::stdoutxml -- A dbutil implementation that, hopefully unsurprisingly,
4# writes infodb stuff as XML to STDOUT. This will be used as part of the Hadoop
5# process - where this output will be passed through to the reduce phase to be
6# merged with other documents.
7#
8# A component of the Greenstone digital library software from the New Zealand
9# Digital Library Project at the University of Waikato, New Zealand.
10#
11# Copyright (C) 2009
12#
13# This program is free software; you can redistribute it and/or modify it under
14# the terms of the GNU General Public License as published by the Free Software
15# Foundation; either version 2 of the License, or (at your option) any later
16# version.
17#
18# This program is distributed in the hope that it will be useful, but WITHOUT
19# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
20# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
21# details.
22#
23# You should have received a copy of the GNU General Public License along with
24# this program; if not, write to the Free Software Foundation, Inc., 675 Mass
25# Ave, Cambridge, MA 02139, USA.
26#
27###############################################################################
28
29package dbutil::stdoutxml;
30
31# Pragma
32use strict;
33
34# Perl built-in modules
35use Time::HiRes qw( gettimeofday );
36
37# Greenstone modules
38use FileUtils;
39
40# -----------------------------------------------------------------------------
41# STDOUT XML IMPLEMENTATION
42# -----------------------------------------------------------------------------
# File-scoped state shared by the functions in this module:
#   $debug - when true, the traced functions report their calls on STDERR
#   $type  - value written into the type attribute of every <InfoDBEntry>;
#            overwritten by open_infodb_write_handle()
my ($debug, $type) = (0, 'doc');
45
## @function open_infodb_write_handle()
#
# Output goes straight to STDOUT, so there is no real handle to open. The
# first argument is remembered in $type so that later writes can tag their
# <InfoDBEntry> elements with it.
#
# @param  $infodb_file_path  value stored in the file-scoped $type
# @param  $opt_append        only echoed in the debug trace
# @return 1, which callers receive in place of a handle
#
sub open_infodb_write_handle
{
  my ($infodb_file_path, $opt_append) = @_;
  print STDERR "stdoutxml::open_infodb_write_handle(\"$infodb_file_path\", \"$opt_append\")\n" if $debug;
  # Keep the value around; the write functions read it back from $type
  $type = $infodb_file_path;
  return 1;
}
62## open_infodb_write_handle() ##
63
## @function close_infodb_write_handle()
#
# No handle was ever opened, so there is nothing to close. The only effect is
# an optional trace line on STDERR when $debug is set.
#
# @param $infodb_handle  accepted but never used
#
sub close_infodb_write_handle
{
  my ($infodb_handle) = @_;
  print STDERR "stdoutxml::close_infodb_write_handle()\n" if $debug;
}
76## close_infodb_write_handle() ##
77
78
## @function delete_infodb_entry()
#
# Not implemented for this backend. The reminder is written to STDERR, not
# STDOUT: STDOUT carries the <InfoDBEntry> XML stream, and a stray diagnostic
# line there would end up mixed into the data.
#
# @param  $infodb_handle  ignored
# @param  $infodb_key     ignored
# @return the result of the print call (true on success)
#
sub delete_infodb_entry
{
  my ($infodb_handle, $infodb_key) = @_;
  print STDERR "!implement delete_infodb_entry()\n";
}
87## delete_infodb_entry() ##
88
89
## @function get_infodb_file_path()
#
# Despite the name, no path is built. The first argument is searched for one
# of the words datestamp, doc, src or rss, and the first one found is
# returned as the infodb type.
#
# @param  $collection_name        string searched for a known type word
# @param  $infodb_directory_path  only echoed in the debug trace
# @return the matched type word, or 'error' when none is present
#
sub get_infodb_file_path
{
  my ($collection_name, $infodb_directory_path) = @_;
  print STDERR "stdoutxml::get_infodb_file_path(\"$collection_name\", \"$infodb_directory_path\")\n" if $debug;
  return ($collection_name =~ /(datestamp|doc|src|rss)/) ? $1 : 'error';
}
107## get_infodb_file_path() ##
108
109
## @function read_infodb_file()
#
# Not implemented for this backend. The reminder is written to STDERR, not
# STDOUT: STDOUT carries the <InfoDBEntry> XML stream, and a stray diagnostic
# line there would end up mixed into the data.
#
# @param  $infodb_file_path  ignored
# @param  $infodb_map        ignored (left untouched)
# @return the result of the print call (true on success)
#
sub read_infodb_file
{
  my ($infodb_file_path, $infodb_map) = @_;
  print STDERR "!implement read_infodb_file()\n";
}
118## read_infodb_file() ##
119
120
## @function read_infodb_keys()
#
# Not implemented for this backend. The reminder is written to STDERR, not
# STDOUT: STDOUT carries the <InfoDBEntry> XML stream, and a stray diagnostic
# line there would end up mixed into the data.
#
# @param  $infodb_file_path  ignored
# @param  $infodb_map        ignored (left untouched)
# @return the result of the print call (true on success)
#
sub read_infodb_keys
{
  my ($infodb_file_path, $infodb_map) = @_;
  print STDERR "!implement read_infodb_keys()\n";
}
129## read_infodb_keys() ##
130
131
## @function set_infodb_entry()
#
# Not implemented for this backend. The reminder is written to STDERR, not
# STDOUT: STDOUT carries the <InfoDBEntry> XML stream, and a stray diagnostic
# line there would end up mixed into the data.
#
# @param  $infodb_file_path  ignored
# @param  $infodb_key        ignored
# @param  $infodb_map        ignored
# @return undef, always - nothing is stored, so there is no status to report
#
sub set_infodb_entry
{
  my ($infodb_file_path, $infodb_key, $infodb_map) = @_;
  my $status = undef;
  print STDERR "!implement set_infodb_entry()\n";
  return $status;
}
141## set_infodb_entry() ##
142
143
## @function supportsDatestamp()
#
# Reports that this backend supports datestamps.
#
# @return 1, unconditionally
#
sub supportsDatestamp
{
  my $supported = 1;
  return $supported;
}
152## supportsDatestamp() ##
153
## @function supportsRSS()
#
# Reports that this backend supports RSS.
#
# @return 1, unconditionally
#
sub supportsRSS
{
  my $supported = 1;
  return $supported;
}
162## supportsRSS() ##
163
## @function write_infodb_entry()
#
# Print one infodb record to STDOUT as an <InfoDBEntry> element. Every value
# in the map becomes a "<field>value" line, the lines are joined with
# newlines, and the joined text is XML-escaped to form the element content.
# The element carries the current $type, the record key, mode="set" and a
# "seconds.microseconds" timestamp taken at the time of the call.
#
# @param  $infodb_handle  ignored
# @param  $infodb_key     record key, written to the key attribute
# @param  $infodb_map     hash ref mapping a field name to an array ref of
#                         values; it is read only, never modified
# @return the result of the print call (true on success)
#
sub write_infodb_entry
{
  my ($infodb_handle, $infodb_key, $infodb_map) = @_;
  if ($debug)
  {
    print STDERR "stdoutxml::write_infodb_entry(<infodb_handle>, \"$infodb_key\", <infodb_map>)\n";
  }

  # The five predefined XML entities, replaced in a single pass so that the
  # '&' introduced by one replacement is never escaped a second time
  my %entity_for = ('&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quot;', "'" => '&apos;');
  my $xml_escape = sub
  {
    my $text = defined($_[0]) ? $_[0] : '';
    $text =~ s/([&<>"'])/$entity_for{$1}/g;
    return $text;
  };

  my @values;
  foreach my $infodb_value_key (sort keys(%$infodb_map))
  {
    foreach my $infodb_value (@{$infodb_map->{$infodb_value_key}})
    {
      # The loop variable is an alias into the caller's array, so edit a copy
      my $value = $infodb_value;
      if ($value =~ /-{70,}/)
      {
        # if value contains 70 or more hyphens in a row we need to escape them
        # to prevent txt2db from treating them as a separator
        $value =~ s/-/&\#045;/g;
      }
      push(@values, '<' . $infodb_value_key . '>' . $value);
    }
  }
  my $values_str = $xml_escape->(join("\n", @values));

  # Microseconds must be zero-padded to six digits: without padding 5000us
  # would print as ".5000" and read as half a second
  my ($seconds, $microseconds) = gettimeofday();
  my $timestamp = sprintf('%d.%06d', $seconds, $microseconds);

  # Attribute values are escaped too, so a key containing '"', '&' or '<'
  # cannot break the element.
  # NOTE(review): confirm the Reduce-side parser decodes entities in attributes
  print '<InfoDBEntry type="' . $xml_escape->($type) . '" key="' . $xml_escape->($infodb_key) . '" mode="set" timestamp="' . $timestamp . '">' . $values_str . '</InfoDBEntry>' . "\n";
}
198## write_infodb_entry() ##
199
## @function write_infodb_rawentry()
#
# Print one pre-formatted infodb record to STDOUT as an <InfoDBEntry>
# element. The value is XML-escaped and used as the element content as is.
# The element carries the current $type, the record key, mode="set" and a
# "seconds.microseconds" timestamp taken at the time of the call.
#
# @param  $infodb_handle  ignored
# @param  $infodb_key     record key, written to the key attribute
# @param  $infodb_val     raw record text
# @return the result of the print call (true on success)
#
sub write_infodb_rawentry
{
  my ($infodb_handle, $infodb_key, $infodb_val) = @_;

  # The five predefined XML entities, replaced in a single pass so that the
  # '&' introduced by one replacement is never escaped a second time
  my %entity_for = ('&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quot;', "'" => '&apos;');
  my $xml_escape = sub
  {
    my $text = defined($_[0]) ? $_[0] : '';
    $text =~ s/([&<>"'])/$entity_for{$1}/g;
    return $text;
  };

  # Microseconds must be zero-padded to six digits: without padding 5000us
  # would print as ".5000" and read as half a second
  my ($seconds, $microseconds) = gettimeofday();
  my $timestamp = sprintf('%d.%06d', $seconds, $microseconds);

  # Attribute values are escaped too, so a key containing '"', '&' or '<'
  # cannot break the element.
  # NOTE(review): confirm the Reduce-side parser decodes entities in attributes
  print '<InfoDBEntry type="' . $xml_escape->($type) . '" key="' . $xml_escape->($infodb_key) . '" mode="set" timestamp="' . $timestamp . '">' . $xml_escape->($infodb_val) . '</InfoDBEntry>' . "\n";
}
215
2161;
Note: See TracBrowser for help on using the repository browser.