source: main/trunk/greenstone2/bin/script/replace_srcdoc_with_html.pl@ 34219

Last change on this file since 34219 was 34219, checked in by ak19, 4 years ago

Replace_srcdoc_with_html needs cpan added to the path to have access to JSON.pm, which is imported by doc.pm and used by ReadTextFile

  • Property svn:executable set to *
File size: 9.0 KB
Line 
1#!/usr/bin/perl -w
2
3
# Compile-time setup: refuse to run without GSDLHOME, then put the Greenstone
# perl library directories at the front of the module search path.
BEGIN {
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";

    # Each directory is prepended in turn, so the last one listed ends up
    # first in @INC: plugins, then cpan, then perllib.
    for my $lib_subdir ('perllib', 'perllib/cpan', 'perllib/plugins') {
        unshift(@INC, "$ENV{'GSDLHOME'}/$lib_subdir");
    }
}
10
11use strict;
12no strict 'subs'; # allow barewords (eg STDERR) as function arguments
13no strict 'refs'; # allow filehandles to be variables and vice versa
14
15use encodings;
16use printusage;
17use parse2;
18use FileHandle;
19
# Command-line option specifications. Each spec is a hash of string fields
# ('name', 'desc', 'type', 'reqd', ...); the list is handed to parse2::parse()
# and, via $options, to the PrintUsage routines.
# NOTE(review): the "{...}" desc values look like resource-bundle keys - confirm.
my $language_spec = {
    'name'      => "language",
    'desc'      => "{scripts.language}",
    'type'      => "string",
    'reqd'      => "no",
    'hiddengli' => "yes",
};

my $plugin_spec = {
    'name'      => "plugin",
    'desc'      => "{srcreplace.plugin}",
    'type'      => "string",
    'reqd'      => "yes",
    'hiddengli' => "yes",
};

my $verbosity_spec = {
    'name'    => "verbosity",
    'desc'    => "{import.verbosity}",
    'type'    => "int",
    'range'   => "0,",
    'deft'    => "1",
    'reqd'    => "no",
    'modegli' => "3",
};

# Do not remove the following option, it's a flag for generating the xml of the options
# It WILL be used!
# Run with -xml, the output generated should be valid XML. Used from GLI.
my $xml_spec = {
    'name'      => "xml",
    'desc'      => "",
    'type'      => "flag",
    'reqd'      => "no",
    'hiddengli' => "yes",
};

my $arguments = [ $language_spec, $plugin_spec, $verbosity_spec, $xml_spec ];

# Top-level description of this script: its name, description key and options.
my $options = {
    'name' => "replace_srcdoc_with_html.pl",
    'desc' => "{srcreplace.desc}",
    'args' => $arguments,
};
51
52
# Entry point. Converts the source document named on the command line to HTML
# with the given plugin, places the HTML (and any "<name>_files" folder of
# associated files) in the directory of the source document, and then deletes
# the source document.
#
# Input is taken from @ARGV (arguments passed to main() itself are not read):
#   -plugin <name>     required; <name>.pm is loaded and <name>->new() is called
#   -language <lang>   optional; loads the language-specific resource bundle
#   -verbosity <n>     optional; accepted but not used by this script
#   -xml               print the option description as XML and return
#   <filename>         the document to replace
#
# Dies on bad usage, a missing input file, a failed conversion, a name
# collision in the target directory, or any read/write error. Nothing is
# deleted from the import directory unless the new HTML file was written
# successfully. On success prints the new file's name (without extension)
# to STDOUT.
sub main
{
    my ($language, $plugin, $verbosity);

    my $xml = 0;

    my $hashParsingResult = {};

    # parse the options
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV, $arguments, $hashParsingResult, "allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{srcreplace.params}");
        die "\n";
    }

    # Copy the parsed option values into the matching local variables.
    # Only these four options have a variable here; any other parsed key
    # is ignored (no string eval on option names).
    my %option_slot = (
        'language'  => \$language,
        'plugin'    => \$plugin,
        'verbosity' => \$verbosity,
        'xml'       => \$xml,
    );
    foreach my $strVariable (keys %$hashParsingResult)
    {
        my $slot = $option_slot{$strVariable} or next;
        $$slot = $hashParsingResult->{$strVariable};
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    # There should be one arg left after parsing (the filename)
    # Or the user may have specified -h, in which case we output the usage
    if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{srcreplace.params}");
        die "\n";
    }

    # The filename of the document to be replaced is the first value
    # that remains after the options have been parsed out
    my $filename = $ARGV[0];
    if (!defined $filename || $filename !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{srcreplace.params}");
        print STDERR "You need to specify a filename\n";
        die "\n";
    }
    # check that file exists
    if (!-e $filename) {
        print STDERR "File $filename doesn't exist...\n";
        die "\n";
    }
    # check required options
    if (!defined $plugin || $plugin !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{srcreplace.params}");
        print STDERR "You need to specify a plugin";
        die "\n";
    }

    # The plugin name is used both as a module file name and as a class name,
    # so only a plain package name is accepted. This replaces evaluating the
    # user-supplied name as Perl source.
    if ($plugin !~ /\A\w+(?:::\w+)*\z/) {
        die "replace_srcdoc_with_html.pl: invalid plugin name '$plugin'\n";
    }

    # ConvertToPlug.pm's subclasses should be available here through GLI,
    # but in cmdline version, these should be supplied
    require "$plugin.pm";
    my $plugobj = eval { $plugin->new() };
    die "$@" if $@;

    # ...and initialize it
    # NOTE(review): the first argument is hard-coded to 1 while a -verbosity
    # option is parsed above - confirm whether it should be passed through.
    $plugobj->init(1, "STDERR", "STDERR");

    # find the import directory, where we want to create it in. This is where the file
    # passed as parameter by GLI is located.
    # (only the directory part of the split is needed here)
    my ($tailname, $import_dir, $suffix)
        = &File::Basename::fileparse($filename, "\\.[^\\.]+\$");

    # Use the plugin's tmp_area_convert_file function to avoid duplicating code.
    # This method returns the name of the output file. In the case of Word docs,
    # if converted with windows_scripting a "filename_files" folder might have been
    # created for associated files. Same situation when using wvware with gsConvert.pl.
    # (When old gsConvert.pl was used, wvware created no separate directory, instead files
    # associated with the html generated would be at the same level in the tmp folder
    # where the output file was created.) Now it's the same no matter whether wvware
    # or windows_scripting did the conversion of the Word doc to html.
    my $output_filename = $plugobj->tmp_area_convert_file("html", $filename);

    # if something went wrong, then tmp_area_convert_file returns "", but can also check
    # for whether the output file exists or not. The empty/undefined test comes first so
    # that the file test is never applied to a missing name.
    if (!defined $output_filename || $output_filename eq "" || !-e $output_filename) {
        # if no output html file was created, then die so that GLI displays error message
        print STDERR "***replace_srcdoc_with_html.pl: no output file created for $filename ***\n";
        # Program NEEDS to die here (not return), else the error that occurred is not
        # transmitted to GLI and it thinks instead that execution was fine
        die "No html file created for $filename. Replacement did not take place\n";
    }

    # now, find out what to move:
    # it may be a single file, or, if it is a word doc, it may also have an image folder
    # which has the name "filename-without-extension_files"
    my ($tmp_name, $tmp_dir, $ext) = &File::Basename::fileparse($output_filename, "\\.[^\\.]+\$");

    # the name of the folder of associated files (which may or may not exist) in the tmp dir
    my $assoc_folder = &util::filename_cat($tmp_dir, $tmp_name."_files");

    # Need to check for naming collisions: in case there is already a file or folder
    # in the import directory by the name of those we want to move there from the tmp folder
    # First need to work out the full paths to any assoc folder if it were copied into the
    # import directory, and the main html file if it were copied into the import folder:
    my $new_assoc_folder = &util::filename_cat($import_dir, $tmp_name."_files");
    my $new_file = &util::filename_cat($import_dir, $tmp_name.$ext);

    # If there is an image folder, any naming collisions now would mean that the links of
    # the html file to the image folder would break if we changed the assoc_folder's name.
    # Therefore, in such a case this process dies after deleting both the file and assoc_folder.
    if (-e $assoc_folder && -e $new_assoc_folder) {
        # so an associated folder was generated, AND a folder by that name already exists
        # in the import folder where we want to copy the generated folder to.
        &util::rm($output_filename);
        &util::rm_r($assoc_folder); # we know directory exists, so remove dir
        die "Image folder $new_assoc_folder already exists.\nDeleting generated file and folder, else links to images will break.\n";
    }
    # Finally, check that no file already exists with the same name as the generated stand-alone
    # file. Have to do this *after* checking for name collisions with any assoc_folder, because
    # that also tries to remove any output files.
    if (-e $new_file) { # a file by that name already exists, delete the generated file
        &util::rm($output_filename);
        die "File $new_file already exists. Deleting generated file.\n";
    }

    # The following file reading section is a candidate to use FileUtils::readUTF8File()
    # in place of calling sysread() directly. But only if we can reason we'd be working with UTF8
    # Now we know we have no file name collisions. We 'move' the html file by copying its
    # contents over and ensuring that these contents are utf8. If we don't do this, PDFs
    # replaced by html may fail, whereas those converted with PDFPlug will have succeeded.
    # Three-argument open: the file name is never interpreted as a mode or a pipe.
    open(my $fin, '<', $output_filename)
        or die "replace_srcdoc_with_html.pl: Unable to open $output_filename to ensure utf8...ERROR: $!\n";

    # Read in the entire contents of the file. sysread may deliver fewer bytes
    # than requested, so keep reading until the whole file has been consumed.
    my $html_contents = "";
    my $bytes_expected = (-s $fin) || 0;
    my $bytes_read = 0;
    while ($bytes_read < $bytes_expected) {
        my $chunk = sysread($fin, $html_contents, $bytes_expected - $bytes_read, $bytes_read);
        if (!defined $chunk) {
            my $read_error = $!;
            close($fin);
            die "replace_srcdoc_with_html.pl: Unable to read $output_filename...ERROR: $read_error\n";
        }
        last if $chunk == 0; # end of file reached early
        $bytes_read += $chunk;
    }
    &unicode::ensure_utf8(\$html_contents); # turn any high bytes that aren't valid utf-8 into utf-8.
    close($fin);

    # write the utf8 contents to the new file and delete the original.
    open(my $fout, '>', $new_file)
        or die "replace_srcdoc_with_html.pl: Unable to open $new_file for writing out utf8 html...ERROR: $!\n";

    # A failed write (e.g. disk full) may only show up at close, so check both.
    # If either fails, remove the partial new file and stop BEFORE the generated
    # html and the source document are deleted below.
    my $wrote_ok = print {$fout} $html_contents;
    my $write_error = $wrote_ok ? "" : "$!";
    my $closed_ok = close($fout);
    $write_error = "$!" if $wrote_ok && !$closed_ok;
    if (!$wrote_ok || !$closed_ok) {
        unlink($new_file);
        die "replace_srcdoc_with_html.pl: Unable to write utf8 html to $new_file...ERROR: $write_error\n";
    }
    &util::rm($output_filename);

    # move any associated folders containing associated files too
    if (-e $assoc_folder) {
        #print STDERR "****Folder for associated files is $assoc_folder\n";
        #&util::mv($assoc_folder, $import_dir); # doesn't work for me
        &util::cp_r($assoc_folder, $import_dir);
        &util::rm_r($assoc_folder);
    }

    # Now we can remove the source doc permanently (there are no assocdirs for source doc)
    &util::rm($filename);

    # need this output statement here, as GShell.java's runRemote() sets status to CANCELLED
    # if there is no output! (Therefore, it only had this adverse affect when running GSDL remotely)
    # Do something useful with it: return the new filename without extension, used by remote GS server
    print STDOUT "$tmp_name\n";
}
&main(@ARGV);
Note: See TracBrowser for help on using the repository browser.