Context Navigation

TextPlugin.pm@ 34221

Last change on this file since 34221 was 34221, checked in by ak19, 4 years ago
Undid the change of converting tabstops to their entities in docprint.pm (which has gone back to removing them now) and moved this conversion into TextPlugin.pm after all. In case this has an unforeseen effect, wanted to break as little as possible. Also, only want pre tags to preserve tabs and other html can be cleaned of this. TextPlugin definitely adds pre tags when converting txt to html, so it makes sense to always preserve tabstops there, whereas it doesn't make sense to assume the same need in all cases where html is produced as they may not contain pre tags.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 9.2 KB

Line
1	###########################################################################
2	#
3	# TextPlugin.pm -- simple text plugin
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# creates simple single-level document. Adds Title metadata
27	# of first line of text (up to 100 characters long).
28
29	package TextPlugin;
30
31	use ReadTextFile;
32
33	use strict;
34	no strict 'refs'; # allow filehandles to be variables and viceversa
35	no strict 'subs';
36
37	sub BEGIN {
38	@TextPlugin::ISA = ('ReadTextFile');
39	}
40
41	my $arguments =
42	[ { 'name' => "process_exp",
43	'desc' => "{BaseImporter.process_exp}",
44	'type' => "regexp",
45	'deft' => &get_default_process_exp(),
46	'reqd' => "no" } ,
47	{ 'name' => "title_sub",
48	'desc' => "{TextPlugin.title_sub}",
49	'type' => "regexp",
50	'deft' => "",
51	'reqd' => "no" } ];
52
53	my $options = { 'name' => "TextPlugin",
54	'desc' => "{TextPlugin.desc}",
55	'abstract' => "no",
56	'inherits' => "yes",
57	'srcreplaceable' => "yes", # Source docs in regular txt format can be replaced with GS-generated html
58	'args' => $arguments };
59
60
61	sub new {
62	my ($class) = shift (@_);
63	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
64	push(@$pluginlist, $class);
65
66	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
67	push(@{$hashArgOptLists->{"OptList"}},$options);
68
69	my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
70
71	return bless $self, $class;
72	}
73
74	sub get_default_process_exp {
75	my $self = shift (@_);
76
77	return q^(?i)\.te?xt$^;
78	}
79
80	# do plugin specific processing of doc_obj
81	sub process {
82	my $self = shift (@_);
83	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
84	my $outhandle = $self->{'outhandle'};
85
86	my $cursection = $doc_obj->get_top_section();
87	# get title metadata
88	# (don't need to get title if it has been passed
89	# in from another plugin)
90	if (!defined $metadata->{'Title'}) {
91	my $title = $self->get_title_metadata($textref);
92	$doc_obj->add_utf8_metadata ($cursection, "Title", $title);
93	}
94	# Add FileFormat metadata
95	$doc_obj->add_metadata($cursection, "FileFormat", "Text");
96
97	# insert preformat tags and add text to document object
98	$self->text_to_html($textref); # modifies the text
99	$doc_obj->add_utf8_text($cursection, $$textref);
100
101	return 1;
102	}
103
104	sub get_title_metadata {
105	my $self = shift (@_);
106	my ($textref) = @_;
107
108	my ($title) = $$textref;
109	$title =~ /^\s+/s;
110	if (defined $self->{'title_sub'} && $self->{'title_sub'}) {
111	$title =~ s/$self->{'title_sub'}//;
112	}
113	# A series of spaces and/or punctuation too can be skipped to get at a meaningful title?
114	# https://www.geeksforgeeks.org/perl-special-character-classes-in-regular-expressions/
115	$title =~ /^[\s\|[:punct:]]([^\n])/s; $title=$1;
116	$title =~ s/\t/ /g;
117	$title =~ s/\r?\n?$//s; # remove any carriage returns and/or line feeds at line end,
118	# else the metadata won't appear in GLI even though it will appear in doc.xml
119	if (length($title) > 100) {
120	$title = substr ($title, 0, 100) . "...";
121	}
122	$title =~ s/\[/&\#91;/g;
123	$title =~ s/\[/&\#93;/g;
124	$title =~ s/\</&\#60;/g;
125	$title =~ s/\>/&\#62;/g;
126
127	return $title;
128	}
129
130	sub text_to_html {
131	my $self = shift (@_);
132	my ($textref) = @_;
133
134	# we need to escape the escape character, or else mg will convert into
135	# eg literal newlines, instead of leaving the text as '\n'
136	$$textref =~ s/\\/\\\\/g; # macro language
137	$$textref =~ s/_/\\_/g; # macro language
138	$$textref =~ s/</</g;
139	$$textref =~ s/>/>/g;
140
141
142	# $all_text gets written out into an xml context and represents the html version of a doc,
143	# allowing the use of html entities for the tab character ( )
144	# But docprint.pm, which writes the doc_obj into doc.xml, removes tabs for XMLParser reasons
145	# Tabs (ASCII \x09) may be meaningful spacing in text files to preserve whitespace formatting
146	# as we're trying to do by nesting tabs in <pre> tags.
147	# So before docprint.pm removes tabs stops, replacing them here with their entity reference
148	# to allow <pre> tags to continue preserving any tabs in the final html display.
149	$$textref =~ s/\x09/ /g;
150
151
152	# insert preformat tags and add text to document object
153	$$textref = "<pre>\n$$textref\n</pre>";
154	}
155
156
157	# replace_srcdoc_with_html.pl requires all subroutines that support src_replaceable
158	# to contain a method called tmp_area_convert_file - this is indeed the case with all
159	# Perl modules that are subclasses of ConvertToPlug.pm, but as we want TextPlugin to also
160	# be srcreplaceable and because TextPlugin does not inherit from ConvertToPlug.pm, we have
161	# a similar subroutine with the same name here.
162	sub tmp_area_convert_file {
163	my $self = shift (@_);
164	my ($output_ext, $input_filename) = @_;
165
166	my $outhandle = $self->{'outhandle'};
167	#my $failhandle = $self->{'failhandle'};
168
169	# derive output filename from input filename
170	my ($tailname, $dirname, $suffix)
171	= &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
172
173	# Softlink to collection tmp dir
174	# First find a temporary directory to create the output file in
175	my $tmp_dirname = $dirname;
176	if(defined $ENV{'GSDLCOLLECTDIR'}) {
177	$tmp_dirname = $ENV{'GSDLCOLLECTDIR'};
178	} elsif(defined $ENV{'GSDLHOME'}) {
179	$tmp_dirname = $ENV{'GSDLHOME'};
180	}
181	$tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "tmp");
182	&FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname);
183
184	# convert to utf-8 otherwise we have problems with the doc.xml file
185	# later on
186	$tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
187
188	$suffix = lc($suffix);
189	my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
190
191	# Make sure we have the absolute path to the input file
192	# (If gsdl is remote, we're given relative path to input file, of the form import/tailname.suffix
193	# But we can't softlink to relative paths. Therefore, we need to ensure that
194	# the input_filename is the absolute path.
195	my $ensure_path_absolute = 1; # true
196
197	# Now make the softlink, so we don't accidentally damage the input file
198	# softlinking creates a symbolic link to (or, if that's not possible, it makes a copy of) the original
199	&FileUtils::softLink($input_filename, $tmp_filename, $ensure_path_absolute);
200
201	my $verbosity = $self->{'verbosity'};
202	if ($verbosity > 0) {
203	# need this output statement, as GShell.java's runRemote() sets status to CANCELLED
204	# if there is no output! (Therefore, it only had this adverse affect when running GSDL remotely)
205	print $outhandle "Converting $tailname$suffix to html\n";
206	}
207
208	#my $output_filename = $tailname$output_ext; #output_ext has to be html!
209	my $output_filename = &FileUtils::filenameConcatenate($tmp_dirname, $tailname.".html");
210
211	# Read contents of text file line by line into an array
212	# create an HTML file from the text file
213	# Recreate the original file for writing the updated contents
214	unless(open(TEXT, "<$tmp_filename")) { # open it as a new file for writing
215	print STDERR "TextPlugin.pm: Unable to open and read from $tmp_filename for converting to html...ERROR: $!\n";
216	return ""; # no file name
217	}
218
219	# Read the entire file at once
220	my $text;
221	{
222	local $/ = undef; # Now can read the entire file at once. From http://perl.plover.com/local.html
223	$text = <TEXT>; # File is read in as one single 'line'
224	}
225	close(TEXT); # close the file
226
227	# Get the title before embedding the text in pre tags
228	my $title = $self->get_title_metadata(\$text);
229
230	# Now convert the text
231	$self->text_to_html(\$text);
232
233	# try creating this new file writing and try opening it for writing, else exit with error value
234	unless(open(HTML, ">$output_filename")) { # open the new html file for writing
235	print STDERR "TextPlugin.pm: Unable to create $output_filename for writing $tailname$suffix txt converted to html...ERROR: $!\n";
236	return ""; # no filename
237	}
238	# write the html contents of the text (which is embedded in <pre> tags) out to the file with proper enclosing tags
239	print HTML "<html>\n<head>\n<title>$title</title>\n</head>\n<body>\n";
240	print HTML $text;
241	print HTML "\n</body>\n</html>";
242	close HTML;
243
244	# remove the copy of the original file/remove the symbolic link to original file
245	&FileUtils::removeFiles($tmp_filename);
246
247	return $output_filename; # return the output file path
248	}
249
250
251	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/plugins/TextPlugin.pm@ 34221

Download in other formats: