root/main/trunk/greenstone2/perllib/plugins/TextPlugin.pm @ 30857

Revision 30857, 8.4 KB (checked in by ak19, 3 years ago)

Unless new line endings (particularly carriage return characters missing line feeds) are removed from the end of the ex.Title metadata, the ex.Title is not displayed in GLI's enrich panel, even though it's stored (with the carriage return) in doc.xml.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# TextPlugin.pm -- simple text plugin
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# creates simple single-level document. Adds Title metadata
27# of first line of text (up to 100 characters long).
28
29package TextPlugin;
30
31use ReadTextFile;
32
33use strict;
34no strict 'refs'; # allow filehandles to be variables and viceversa
35no strict 'subs';
36
37sub BEGIN {
38    @TextPlugin::ISA = ('ReadTextFile');
39}
40
41my $arguments =
42    [ { 'name' => "process_exp",
43    'desc' => "{BasePlugin.process_exp}",
44    'type' => "regexp",
45    'deft' => &get_default_process_exp(),
46    'reqd' => "no" } ,
47      { 'name' => "title_sub",
48    'desc' => "{TextPlugin.title_sub}",
49    'type' => "regexp",
50    'deft' => "",
51    'reqd' => "no" } ];
52
53my $options = { 'name'     => "TextPlugin",
54        'desc'     => "{TextPlugin.desc}",
55        'abstract' => "no",
56        'inherits' => "yes",
57        'srcreplaceable' => "yes", # Source docs in regular txt format can be replaced with GS-generated html
58        'args'     => $arguments };
59
60
61sub new {
62    my ($class) = shift (@_);
63    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
64    push(@$pluginlist, $class);
65
66    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
67    push(@{$hashArgOptLists->{"OptList"}},$options);
68
69    my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
70
71    return bless $self, $class;
72}
73
74sub get_default_process_exp {
75    my $self = shift (@_);
76
77    return q^(?i)\.te?xt$^;
78}
79
80# do plugin specific processing of doc_obj
81sub process {
82    my $self = shift (@_);
83    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
84    my $outhandle = $self->{'outhandle'};
85   
86    my $cursection = $doc_obj->get_top_section();
87    # get title metadata
88    # (don't need to get title if it has been passed
89    # in from another plugin)
90    if (!defined $metadata->{'Title'}) {
91    my $title = $self->get_title_metadata($textref);
92    $doc_obj->add_utf8_metadata ($cursection, "Title", $title);
93    }
94    # Add FileFormat metadata
95    $doc_obj->add_metadata($cursection, "FileFormat", "Text");
96
97    # insert preformat tags and add text to document object   
98    $self->text_to_html($textref); # modifies the text
99    $doc_obj->add_utf8_text($cursection, $$textref);
100
101    return 1;
102}
103
104sub get_title_metadata {
105    my $self = shift (@_);
106    my ($textref) = @_;
107   
108    my ($title) = $$textref;
109    $title =~ /^\s+/s;
110    if (defined $self->{'title_sub'} && $self->{'title_sub'}) {
111    $title =~ s/$self->{'title_sub'}//;
112    }
113    $title =~ /^\s*([^\n]*)/s; $title=$1;
114    $title =~ s/\t/ /g;
115    $title =~ s/\r?\n?$//s; # remove any carriage returns and/or line feeds at line end,
116       # else the metadata won't appear in GLI even though it will appear in doc.xml
117    if (length($title) > 100) {
118    $title = substr ($title, 0, 100) . "...";
119    }
120    $title =~ s/\[/&\#91;/g;
121    $title =~ s/\[/&\#93;/g;
122    $title =~ s/\</&\#60;/g;
123    $title =~ s/\>/&\#62;/g;
124   
125    return $title;
126}
127
128sub text_to_html {
129    my $self = shift (@_);
130    my ($textref) = @_;
131   
132    # we need to escape the escape character, or else mg will convert into
133    # eg literal newlines, instead of leaving the text as '\n'
134    $$textref =~ s/\\/\\\\/g; # macro language
135    $$textref =~ s/_/\\_/g; # macro language
136    $$textref =~ s/</&lt;/g;
137    $$textref =~ s/>/&gt;/g;
138   
139    # insert preformat tags and add text to document object
140    $$textref = "<pre>\n$$textref\n</pre>";
141}
142
143
144# replace_srcdoc_with_html.pl requires all subroutines that support src_replaceable
145# to contain a method called tmp_area_convert_file - this is indeed the case with all
146# Perl modules that are subclasses of ConvertToPlug.pm, but as we want TextPlugin to also
147# be srcreplaceable and because TextPlugin does not inherit from ConvertToPlug.pm, we have
148# a similar subroutine with the same name here.
149sub tmp_area_convert_file {
150    my $self = shift (@_);
151    my ($output_ext, $input_filename) = @_;
152   
153    my $outhandle = $self->{'outhandle'};
154    #my $failhandle = $self->{'failhandle'};
155   
156    # derive output filename from input filename
157    my ($tailname, $dirname, $suffix)
158    = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
159
160    # Softlink to collection tmp dir
161    # First find a temporary directory to create the output file in
162    my $tmp_dirname = $dirname;
163    if(defined $ENV{'GSDLCOLLECTDIR'}) {
164    $tmp_dirname = $ENV{'GSDLCOLLECTDIR'};
165    } elsif(defined $ENV{'GSDLHOME'}) {
166    $tmp_dirname = $ENV{'GSDLHOME'};
167    }
168    $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "tmp");
169    &FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname);
170
171    # convert to utf-8 otherwise we have problems with the doc.xml file
172    # later on
173    $tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
174
175    $suffix = lc($suffix);
176    my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
177   
178    # Make sure we have the absolute path to the input file
179    # (If gsdl is remote, we're given relative path to input file, of the form import/tailname.suffix
180    # But we can't softlink to relative paths. Therefore, we need to ensure that
181    # the input_filename is the absolute path.
182    my $ensure_path_absolute = 1; # true
183
184    # Now make the softlink,  so we don't accidentally damage the input file
185    # softlinking creates a symbolic link to (or, if that's not possible, it makes a copy of) the original
186    &FileUtils::softLink($input_filename, $tmp_filename, $ensure_path_absolute);
187     
188    my $verbosity = $self->{'verbosity'};
189    if ($verbosity > 0) {
190    # need this output statement, as GShell.java's runRemote() sets status to CANCELLED
191    # if there is no output! (Therefore, it only had this adverse affect when running GSDL remotely)
192    print $outhandle "Converting $tailname$suffix to html\n";
193    }
194
195    #my $output_filename = $tailname$output_ext; #output_ext has to be html!
196    my $output_filename = &FileUtils::filenameConcatenate($tmp_dirname, $tailname.".html");
197   
198    # Read contents of text file line by line into an array
199    # create an HTML file from the text file
200    # Recreate the original file for writing the updated contents
201    unless(open(TEXT, "<$tmp_filename")) { # open it as a new file for writing
202    print STDERR "TextPlugin.pm: Unable to open and read from $tmp_filename for converting to html...ERROR: $!\n";
203    return ""; # no file name
204    }
205
206    # Read the entire file at once
207    my $text;
208    {
209    local $/ = undef; # Now can read the entire file at once. From http://perl.plover.com/local.html
210    $text = <TEXT>;   # File is read in as one single 'line'
211    }
212    close(TEXT); # close the file
213
214    # Get the title before embedding the text in pre tags
215    my $title = $self->get_title_metadata(\$text);   
216
217    # Now convert the text
218    $self->text_to_html(\$text);
219   
220    # try creating this new file writing and try opening it for writing, else exit with error value
221    unless(open(HTML, ">$output_filename")) {  # open the new html file for writing
222    print STDERR "TextPlugin.pm: Unable to create $output_filename for writing $tailname$suffix txt converted to html...ERROR: $!\n";
223    return ""; # no filename
224    }
225    # write the html contents of the text (which is embedded in <pre> tags) out to the file with proper enclosing tags
226    print HTML "<html>\n<head>\n<title>$title</title>\n</head>\n<body>\n";
227    print HTML $text;
228    print HTML "\n</body>\n</html>";
229    close HTML;
230
231    # remove the copy of the original file/remove the symbolic link to original file
232    &FileUtils::removeFiles($tmp_filename);
233
234    return $output_filename; # return the output file path
235}
236
237
2381;
Note: See TracBrowser for help on using the browser.