source: main/trunk/greenstone2/perllib/plugins/TextPlugin.pm@ 34221

Last change on this file since 34221 was 34221, checked in by ak19, 4 years ago

Undid the change of converting tabstops to their entities in docprint.pm (which has gone back to removing them now) and moved this conversion into TextPlugin.pm after all. In case this has an unforeseen effect, wanted to break as little as possible. Also, only want pre tags to preserve tabs and other html can be cleaned of this. TextPlugin definitely adds pre tags when converting txt to html, so it makes sense to always preserve tabstops there, whereas it doesn't make sense to assume the same need in all cases where html is produced as they may not contain pre tags.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.2 KB
Line 
1###########################################################################
2#
3# TextPlugin.pm -- simple text plugin
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# creates simple single-level document. Adds Title metadata
27# of first line of text (up to 100 characters long).
28
29package TextPlugin;
30
31use ReadTextFile;
32
33use strict;
34no strict 'refs'; # allow filehandles to be variables and viceversa
35no strict 'subs';
36
37sub BEGIN {
38 @TextPlugin::ISA = ('ReadTextFile');
39}
40
41my $arguments =
42 [ { 'name' => "process_exp",
43 'desc' => "{BaseImporter.process_exp}",
44 'type' => "regexp",
45 'deft' => &get_default_process_exp(),
46 'reqd' => "no" } ,
47 { 'name' => "title_sub",
48 'desc' => "{TextPlugin.title_sub}",
49 'type' => "regexp",
50 'deft' => "",
51 'reqd' => "no" } ];
52
53my $options = { 'name' => "TextPlugin",
54 'desc' => "{TextPlugin.desc}",
55 'abstract' => "no",
56 'inherits' => "yes",
57 'srcreplaceable' => "yes", # Source docs in regular txt format can be replaced with GS-generated html
58 'args' => $arguments };
59
60
61sub new {
62 my ($class) = shift (@_);
63 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
64 push(@$pluginlist, $class);
65
66 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
67 push(@{$hashArgOptLists->{"OptList"}},$options);
68
69 my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
70
71 return bless $self, $class;
72}
73
74sub get_default_process_exp {
75 my $self = shift (@_);
76
77 return q^(?i)\.te?xt$^;
78}
79
80# do plugin specific processing of doc_obj
81sub process {
82 my $self = shift (@_);
83 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
84 my $outhandle = $self->{'outhandle'};
85
86 my $cursection = $doc_obj->get_top_section();
87 # get title metadata
88 # (don't need to get title if it has been passed
89 # in from another plugin)
90 if (!defined $metadata->{'Title'}) {
91 my $title = $self->get_title_metadata($textref);
92 $doc_obj->add_utf8_metadata ($cursection, "Title", $title);
93 }
94 # Add FileFormat metadata
95 $doc_obj->add_metadata($cursection, "FileFormat", "Text");
96
97 # insert preformat tags and add text to document object
98 $self->text_to_html($textref); # modifies the text
99 $doc_obj->add_utf8_text($cursection, $$textref);
100
101 return 1;
102}
103
104sub get_title_metadata {
105 my $self = shift (@_);
106 my ($textref) = @_;
107
108 my ($title) = $$textref;
109 $title =~ /^\s+/s;
110 if (defined $self->{'title_sub'} && $self->{'title_sub'}) {
111 $title =~ s/$self->{'title_sub'}//;
112 }
113 # A series of spaces and/or punctuation too can be skipped to get at a meaningful title?
114 # https://www.geeksforgeeks.org/perl-special-character-classes-in-regular-expressions/
115 $title =~ /^[\s|[:punct:]]*([^\n]*)/s; $title=$1;
116 $title =~ s/\t/ /g;
117 $title =~ s/\r?\n?$//s; # remove any carriage returns and/or line feeds at line end,
118 # else the metadata won't appear in GLI even though it will appear in doc.xml
119 if (length($title) > 100) {
120 $title = substr ($title, 0, 100) . "...";
121 }
122 $title =~ s/\[/&\#91;/g;
123 $title =~ s/\[/&\#93;/g;
124 $title =~ s/\</&\#60;/g;
125 $title =~ s/\>/&\#62;/g;
126
127 return $title;
128}
129
130sub text_to_html {
131 my $self = shift (@_);
132 my ($textref) = @_;
133
134 # we need to escape the escape character, or else mg will convert into
135 # eg literal newlines, instead of leaving the text as '\n'
136 $$textref =~ s/\\/\\\\/g; # macro language
137 $$textref =~ s/_/\\_/g; # macro language
138 $$textref =~ s/</&lt;/g;
139 $$textref =~ s/>/&gt;/g;
140
141
142 # $all_text gets written out into an xml context and represents the html version of a doc,
143 # allowing the use of html entities for the tab character (&#09;)
144 # But docprint.pm, which writes the doc_obj into doc.xml, removes tabs for XMLParser reasons
145 # Tabs (ASCII \x09) may be meaningful spacing in text files to preserve whitespace formatting
146 # as we're trying to do by nesting tabs in <pre> tags.
147 # So before docprint.pm removes tabs stops, replacing them here with their entity reference
148 # to allow <pre> tags to continue preserving any tabs in the final html display.
149 $$textref =~ s/\x09/&#09;/g;
150
151
152 # insert preformat tags and add text to document object
153 $$textref = "<pre>\n$$textref\n</pre>";
154}
155
156
157# replace_srcdoc_with_html.pl requires all subroutines that support src_replaceable
158# to contain a method called tmp_area_convert_file - this is indeed the case with all
159# Perl modules that are subclasses of ConvertToPlug.pm, but as we want TextPlugin to also
160# be srcreplaceable and because TextPlugin does not inherit from ConvertToPlug.pm, we have
161# a similar subroutine with the same name here.
162sub tmp_area_convert_file {
163 my $self = shift (@_);
164 my ($output_ext, $input_filename) = @_;
165
166 my $outhandle = $self->{'outhandle'};
167 #my $failhandle = $self->{'failhandle'};
168
169 # derive output filename from input filename
170 my ($tailname, $dirname, $suffix)
171 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
172
173 # Softlink to collection tmp dir
174 # First find a temporary directory to create the output file in
175 my $tmp_dirname = $dirname;
176 if(defined $ENV{'GSDLCOLLECTDIR'}) {
177 $tmp_dirname = $ENV{'GSDLCOLLECTDIR'};
178 } elsif(defined $ENV{'GSDLHOME'}) {
179 $tmp_dirname = $ENV{'GSDLHOME'};
180 }
181 $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "tmp");
182 &FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname);
183
184 # convert to utf-8 otherwise we have problems with the doc.xml file
185 # later on
186 $tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
187
188 $suffix = lc($suffix);
189 my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
190
191 # Make sure we have the absolute path to the input file
192 # (If gsdl is remote, we're given relative path to input file, of the form import/tailname.suffix
193 # But we can't softlink to relative paths. Therefore, we need to ensure that
194 # the input_filename is the absolute path.
195 my $ensure_path_absolute = 1; # true
196
197 # Now make the softlink, so we don't accidentally damage the input file
198 # softlinking creates a symbolic link to (or, if that's not possible, it makes a copy of) the original
199 &FileUtils::softLink($input_filename, $tmp_filename, $ensure_path_absolute);
200
201 my $verbosity = $self->{'verbosity'};
202 if ($verbosity > 0) {
203 # need this output statement, as GShell.java's runRemote() sets status to CANCELLED
204 # if there is no output! (Therefore, it only had this adverse affect when running GSDL remotely)
205 print $outhandle "Converting $tailname$suffix to html\n";
206 }
207
208 #my $output_filename = $tailname$output_ext; #output_ext has to be html!
209 my $output_filename = &FileUtils::filenameConcatenate($tmp_dirname, $tailname.".html");
210
211 # Read contents of text file line by line into an array
212 # create an HTML file from the text file
213 # Recreate the original file for writing the updated contents
214 unless(open(TEXT, "<$tmp_filename")) { # open it as a new file for writing
215 print STDERR "TextPlugin.pm: Unable to open and read from $tmp_filename for converting to html...ERROR: $!\n";
216 return ""; # no file name
217 }
218
219 # Read the entire file at once
220 my $text;
221 {
222 local $/ = undef; # Now can read the entire file at once. From http://perl.plover.com/local.html
223 $text = <TEXT>; # File is read in as one single 'line'
224 }
225 close(TEXT); # close the file
226
227 # Get the title before embedding the text in pre tags
228 my $title = $self->get_title_metadata(\$text);
229
230 # Now convert the text
231 $self->text_to_html(\$text);
232
233 # try creating this new file writing and try opening it for writing, else exit with error value
234 unless(open(HTML, ">$output_filename")) { # open the new html file for writing
235 print STDERR "TextPlugin.pm: Unable to create $output_filename for writing $tailname$suffix txt converted to html...ERROR: $!\n";
236 return ""; # no filename
237 }
238 # write the html contents of the text (which is embedded in <pre> tags) out to the file with proper enclosing tags
239 print HTML "<html>\n<head>\n<title>$title</title>\n</head>\n<body>\n";
240 print HTML $text;
241 print HTML "\n</body>\n</html>";
242 close HTML;
243
244 # remove the copy of the original file/remove the symbolic link to original file
245 &FileUtils::removeFiles($tmp_filename);
246
247 return $output_filename; # return the output file path
248}
249
250
2511;
Note: See TracBrowser for help on using the repository browser.