source: main/trunk/greenstone2/perllib/plugins/TextPlugin.pm@ 32215

Last change on this file since 32215 was 31492, checked in by kjdon, 7 years ago

renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.4 KB
Line 
1###########################################################################
2#
3# TextPlugin.pm -- simple text plugin
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# creates simple single-level document. Adds Title metadata
27# of first line of text (up to 100 characters long).
28
29package TextPlugin;
30
31use ReadTextFile;
32
33use strict;
34no strict 'refs'; # allow filehandles to be variables and viceversa
35no strict 'subs';
36
37sub BEGIN {
38 @TextPlugin::ISA = ('ReadTextFile');
39}
40
41my $arguments =
42 [ { 'name' => "process_exp",
43 'desc' => "{BaseImporter.process_exp}",
44 'type' => "regexp",
45 'deft' => &get_default_process_exp(),
46 'reqd' => "no" } ,
47 { 'name' => "title_sub",
48 'desc' => "{TextPlugin.title_sub}",
49 'type' => "regexp",
50 'deft' => "",
51 'reqd' => "no" } ];
52
53my $options = { 'name' => "TextPlugin",
54 'desc' => "{TextPlugin.desc}",
55 'abstract' => "no",
56 'inherits' => "yes",
57 'srcreplaceable' => "yes", # Source docs in regular txt format can be replaced with GS-generated html
58 'args' => $arguments };
59
60
61sub new {
62 my ($class) = shift (@_);
63 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
64 push(@$pluginlist, $class);
65
66 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
67 push(@{$hashArgOptLists->{"OptList"}},$options);
68
69 my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
70
71 return bless $self, $class;
72}
73
74sub get_default_process_exp {
75 my $self = shift (@_);
76
77 return q^(?i)\.te?xt$^;
78}
79
80# do plugin specific processing of doc_obj
81sub process {
82 my $self = shift (@_);
83 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
84 my $outhandle = $self->{'outhandle'};
85
86 my $cursection = $doc_obj->get_top_section();
87 # get title metadata
88 # (don't need to get title if it has been passed
89 # in from another plugin)
90 if (!defined $metadata->{'Title'}) {
91 my $title = $self->get_title_metadata($textref);
92 $doc_obj->add_utf8_metadata ($cursection, "Title", $title);
93 }
94 # Add FileFormat metadata
95 $doc_obj->add_metadata($cursection, "FileFormat", "Text");
96
97 # insert preformat tags and add text to document object
98 $self->text_to_html($textref); # modifies the text
99 $doc_obj->add_utf8_text($cursection, $$textref);
100
101 return 1;
102}
103
104sub get_title_metadata {
105 my $self = shift (@_);
106 my ($textref) = @_;
107
108 my ($title) = $$textref;
109 $title =~ /^\s+/s;
110 if (defined $self->{'title_sub'} && $self->{'title_sub'}) {
111 $title =~ s/$self->{'title_sub'}//;
112 }
113 $title =~ /^\s*([^\n]*)/s; $title=$1;
114 $title =~ s/\t/ /g;
115 $title =~ s/\r?\n?$//s; # remove any carriage returns and/or line feeds at line end,
116 # else the metadata won't appear in GLI even though it will appear in doc.xml
117 if (length($title) > 100) {
118 $title = substr ($title, 0, 100) . "...";
119 }
120 $title =~ s/\[/&\#91;/g;
121 $title =~ s/\[/&\#93;/g;
122 $title =~ s/\</&\#60;/g;
123 $title =~ s/\>/&\#62;/g;
124
125 return $title;
126}
127
128sub text_to_html {
129 my $self = shift (@_);
130 my ($textref) = @_;
131
132 # we need to escape the escape character, or else mg will convert into
133 # eg literal newlines, instead of leaving the text as '\n'
134 $$textref =~ s/\\/\\\\/g; # macro language
135 $$textref =~ s/_/\\_/g; # macro language
136 $$textref =~ s/</&lt;/g;
137 $$textref =~ s/>/&gt;/g;
138
139 # insert preformat tags and add text to document object
140 $$textref = "<pre>\n$$textref\n</pre>";
141}
142
143
144# replace_srcdoc_with_html.pl requires all subroutines that support src_replaceable
145# to contain a method called tmp_area_convert_file - this is indeed the case with all
146# Perl modules that are subclasses of ConvertToPlug.pm, but as we want TextPlugin to also
147# be srcreplaceable and because TextPlugin does not inherit from ConvertToPlug.pm, we have
148# a similar subroutine with the same name here.
149sub tmp_area_convert_file {
150 my $self = shift (@_);
151 my ($output_ext, $input_filename) = @_;
152
153 my $outhandle = $self->{'outhandle'};
154 #my $failhandle = $self->{'failhandle'};
155
156 # derive output filename from input filename
157 my ($tailname, $dirname, $suffix)
158 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
159
160 # Softlink to collection tmp dir
161 # First find a temporary directory to create the output file in
162 my $tmp_dirname = $dirname;
163 if(defined $ENV{'GSDLCOLLECTDIR'}) {
164 $tmp_dirname = $ENV{'GSDLCOLLECTDIR'};
165 } elsif(defined $ENV{'GSDLHOME'}) {
166 $tmp_dirname = $ENV{'GSDLHOME'};
167 }
168 $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "tmp");
169 &FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname);
170
171 # convert to utf-8 otherwise we have problems with the doc.xml file
172 # later on
173 $tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
174
175 $suffix = lc($suffix);
176 my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
177
178 # Make sure we have the absolute path to the input file
179 # (If gsdl is remote, we're given relative path to input file, of the form import/tailname.suffix
180 # But we can't softlink to relative paths. Therefore, we need to ensure that
181 # the input_filename is the absolute path.
182 my $ensure_path_absolute = 1; # true
183
184 # Now make the softlink, so we don't accidentally damage the input file
185 # softlinking creates a symbolic link to (or, if that's not possible, it makes a copy of) the original
186 &FileUtils::softLink($input_filename, $tmp_filename, $ensure_path_absolute);
187
188 my $verbosity = $self->{'verbosity'};
189 if ($verbosity > 0) {
190 # need this output statement, as GShell.java's runRemote() sets status to CANCELLED
191 # if there is no output! (Therefore, it only had this adverse affect when running GSDL remotely)
192 print $outhandle "Converting $tailname$suffix to html\n";
193 }
194
195 #my $output_filename = $tailname$output_ext; #output_ext has to be html!
196 my $output_filename = &FileUtils::filenameConcatenate($tmp_dirname, $tailname.".html");
197
198 # Read contents of text file line by line into an array
199 # create an HTML file from the text file
200 # Recreate the original file for writing the updated contents
201 unless(open(TEXT, "<$tmp_filename")) { # open it as a new file for writing
202 print STDERR "TextPlugin.pm: Unable to open and read from $tmp_filename for converting to html...ERROR: $!\n";
203 return ""; # no file name
204 }
205
206 # Read the entire file at once
207 my $text;
208 {
209 local $/ = undef; # Now can read the entire file at once. From http://perl.plover.com/local.html
210 $text = <TEXT>; # File is read in as one single 'line'
211 }
212 close(TEXT); # close the file
213
214 # Get the title before embedding the text in pre tags
215 my $title = $self->get_title_metadata(\$text);
216
217 # Now convert the text
218 $self->text_to_html(\$text);
219
220 # try creating this new file writing and try opening it for writing, else exit with error value
221 unless(open(HTML, ">$output_filename")) { # open the new html file for writing
222 print STDERR "TextPlugin.pm: Unable to create $output_filename for writing $tailname$suffix txt converted to html...ERROR: $!\n";
223 return ""; # no filename
224 }
225 # write the html contents of the text (which is embedded in <pre> tags) out to the file with proper enclosing tags
226 print HTML "<html>\n<head>\n<title>$title</title>\n</head>\n<body>\n";
227 print HTML $text;
228 print HTML "\n</body>\n</html>";
229 close HTML;
230
231 # remove the copy of the original file/remove the symbolic link to original file
232 &FileUtils::removeFiles($tmp_filename);
233
234 return $output_filename; # return the output file path
235}
236
237
2381;
Note: See TracBrowser for help on using the repository browser.