source: main/trunk/greenstone2/bin/script/replace_srcdoc_with_html.pl@ 26441

Last change on this file since 26441 was 24475, checked in by ak19, 13 years ago

John Thompson's fix for efficient file-reading is useful here too, as he suggested. Tested replace_srcdoc_with_html.pl with ASCII and non-ASCII content in the input (txt) file.

  • Property svn:executable set to *
File size: 8.8 KB
Line 
1#!/usr/bin/perl -w
2
3
4BEGIN {
5 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
6 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
7 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
8}
9
10use strict;
11no strict 'subs'; # allow barewords (eg STDERR) as function arguments
12no strict 'refs'; # allow filehandles to be variables and vice versa
13
14use encodings;
15use printusage;
16use parse2;
17use FileHandle;
18
# Option specifications for this script. The list is handed to parse2::parse
# for command-line parsing and, via $options, to the PrintUsage routines.
my $arguments = [
    {   'name'      => 'language',
        'desc'      => '{scripts.language}',
        'type'      => 'string',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {   'name'      => 'plugin',
        'desc'      => '{srcreplace.plugin}',
        'type'      => 'string',
        'reqd'      => 'yes',
        'hiddengli' => 'yes',
    },
    {   'name'    => 'verbosity',
        'desc'    => '{import.verbosity}',
        'type'    => 'int',
        'range'   => '0,',
        'deft'    => '1',
        'reqd'    => 'no',
        'modegli' => '3',
    },

    # Keep the -xml flag: it is the switch that makes the script print its
    # option description as XML (which must be valid XML), and GLI relies
    # on it. It WILL be used, so do not remove it.
    {   'name'      => 'xml',
        'desc'      => '',
        'type'      => 'flag',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
];

# Top-level description of the script: its name, the key of its description
# string, and the option list declared above.
my $options = {
    'name' => 'replace_srcdoc_with_html.pl',
    'desc' => '{srcreplace.desc}',
    'args' => $arguments,
};
50
51
# main: replace a source document with an HTML version of itself.
#
# Reads its options and the single filename argument from @ARGV, loads the
# plugin named by -plugin, asks it to convert the file to HTML in a tmp area,
# then copies the HTML (and any "<name>_files" folder of associated files)
# next to the original document and finally deletes the original.
#
# Dies (non-zero exit) on any usage error, conversion failure, naming
# collision or I/O failure. On success prints the new file's name, without
# its extension, to STDOUT.
sub main
{
    my ($language, $plugin, $verbosity);

    my $xml = 0;

    my $hashParsingResult = {};

    # parse the options
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV, $arguments, $hashParsingResult, "allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{srcreplace.params}");
        die "\n";
    }

    # Copy the parsed values into the matching local variables. An explicit
    # name => reference map is used rather than a string eval, so that only
    # the options this script knows about are assigned and no option name is
    # ever compiled as Perl code.
    my %option_slot = (
        'language'  => \$language,
        'plugin'    => \$plugin,
        'verbosity' => \$verbosity,
        'xml'       => \$xml,
    );
    foreach my $strVariable (keys %$hashParsingResult)
    {
        next unless exists $option_slot{$strVariable};
        ${ $option_slot{$strVariable} } = $hashParsingResult->{$strVariable};
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    # There should be one arg left after parsing (the filename)
    # Or the user may have specified -h, in which case we output the usage
    if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{srcreplace.params}");
        die "\n";
    }

    # The filename of the document to be replaced is the first value
    # that remains after the options have been parsed out
    my $filename = $ARGV[0];
    if (!defined $filename || $filename !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{srcreplace.params}");
        print STDERR "You need to specify a filename\n";
        die "\n";
    }
    # check that file exists
    if (!-e $filename) {
        print STDERR "File $filename doesn't exist...\n";
        die "\n";
    }
    # check required options
    if (!defined $plugin || $plugin !~ /\w/) {
        &PrintUsage::print_txt_usage($options, "{srcreplace.params}");
        print STDERR "You need to specify a plugin";
        die "\n";
    }

    # ConvertToPlug.pm's subclasses should be available here through GLI,
    # but in cmdline version, these should be supplied
    require "$plugin.pm";

    # Construct the plugin with a block eval and an ordinary class-method
    # call: the plugin name is user input and must not be compiled as code.
    my $plugobj = eval { $plugin->new() };
    die "$@" if $@;

    # ...and initialize it
    $plugobj->init(1, "STDERR", "STDERR");

    # fileparse is called below by its fully-qualified name; make sure the
    # module is loaded rather than relying on some other module having
    # pulled it in.
    require File::Basename;

    # The import directory is where the file passed as parameter by GLI is
    # located; that is where the html file is to be created.
    my ($tailname, $import_dir, $suffix)
        = &File::Basename::fileparse($filename, "\\.[^\\.]+\$");

    # Use the plugin's tmp_area_convert_file function to avoid duplicating code.
    # This method returns the name of the output file. In the case of Word docs,
    # if converted with windows_scripting a "filename_files" folder might have been
    # created for associated files. Same situation when using wvware with gsConvert.pl.
    # (When old gsConvert.pl was used, wvware created no separate directory, instead files
    # associated with the html generated would be at the same level in the tmp folder
    # where the output file was created.) Now it's the same no matter whether wvware
    # or windows_scripting did the conversion of the Word doc to html.
    my $output_filename = $plugobj->tmp_area_convert_file("html", $filename);

    # if something went wrong, then tmp_area_convert_file returns "", but can also check
    # for whether the output file exists or not. The empty/undefined test comes first
    # so that -e is never applied to a value that is not a filename.
    if (!defined $output_filename || $output_filename eq "" || !-e $output_filename) {
        # if no output html file was created, then die so that GLI displays error message
        print STDERR "***replace_srcdoc_with_html.pl: no output file created for $filename ***\n";
        # Program NEEDS to die here (not return), else the error that occurred is not
        # transmitted to GLI and it thinks instead that execution was fine
        die "No html file created for $filename. Replacement did not take place\n";
    }

    # now, find out what to move:
    # it may be a single file, or, if it is a word doc, it may also have an image folder
    # which has the name "filename-without-extension_files"
    my ($tmp_name, $tmp_dir, $ext) = &File::Basename::fileparse($output_filename, "\\.[^\\.]+\$");

    # the name of the folder of associated files (which may or may not exist) in the tmp dir
    my $assoc_folder = &util::filename_cat($tmp_dir, $tmp_name."_files");

    # Need to check for naming collisions: in case there is already a file or folder
    # in the import directory by the name of those we want to move there from the tmp folder
    # First need to work out the full paths to any assoc folder if it were copied into the
    # import directory, and the main html file if it were copied into the import folder:
    my $new_assoc_folder = &util::filename_cat($import_dir, $tmp_name."_files");
    my $new_file = &util::filename_cat($import_dir, $tmp_name.$ext);

    # If there is an image folder, any naming collisions now would mean that the links of
    # the html file to the image folder would break if we changed the assoc_folder's name.
    # Therefore, in such a case this process dies after deleting both the file and assoc_folder.
    if (-e $assoc_folder && -e $new_assoc_folder) {
        # so an associated folder was generated, AND a folder by that name already exists
        # in the import folder where we want to copy the generated folder to.
        &util::rm($output_filename);
        &util::rm_r($assoc_folder); # we know directory exists, so remove dir
        die "Image folder $new_assoc_folder already exists.\nDeleting generated file and folder, else links to images will break.\n";
    }
    # Finally, check that no file already exists with the same name as the generated stand-alone
    # file. Have to do this *after* checking for name collisions with any assoc_folder, because
    # that also tries to remove any output files.
    if (-e $new_file) { # a file by that name already exists, delete the generated file
        &util::rm($output_filename);
        die "File $new_file already exists. Deleting generated file.\n";
    }

    # Now we know we have no file name collisions. We 'move' the html file by copying its
    # contents over and ensuring that these contents are utf8. If we don't do this, PDFs
    # replaced by html may fail, whereas those converted with PDFPlug will have succeeded.
    # Three-argument open with a lexical handle: the filename is never interpreted as a mode.
    open(my $fin, '<', $output_filename)
        or die "replace_srcdoc_with_html.pl: Unable to open $output_filename to ensure utf8...ERROR: $!\n";
    my $html_contents = "";
    # Read in the entire contents of the file in one hit.
    # -s yields a false value for an empty file, so default the length to 0.
    my $html_size = (-s $fin) || 0;
    my $bytes_read = sysread($fin, $html_contents, $html_size);
    if (!defined $bytes_read) {
        my $read_error = $!;
        close($fin);
        die "replace_srcdoc_with_html.pl: Unable to read $output_filename to ensure utf8...ERROR: $read_error\n";
    }
    close($fin);
    &unicode::ensure_utf8(\$html_contents); # turn any high bytes that aren't valid utf-8 into utf-8.

    # Write the utf8 contents to the new file. Both the print and the close are checked
    # (buffered write errors may only surface at close): nothing below may be deleted
    # unless the new file has been written out completely.
    open(my $fout, '>', $new_file)
        or die "replace_srcdoc_with_html.pl: Unable to open $new_file for writing out utf8 html...ERROR: $!\n";
    unless ((print {$fout} $html_contents) && close($fout)) {
        die "replace_srcdoc_with_html.pl: Unable to write utf8 html to $new_file...ERROR: $!\n";
    }

    # the new file is safely in place, so the generated original can go
    &util::rm($output_filename);

    # move any associated folders containing associated files too
    if (-e $assoc_folder) {
        #print STDERR "****Folder for associated files is $assoc_folder\n";
        #&util::mv($assoc_folder, $import_dir); # doesn't work for me
        &util::cp_r($assoc_folder, $import_dir);
        &util::rm_r($assoc_folder);
    }

    # Now we can remove the source doc permanently (there are no assocdirs for source doc)
    &util::rm($filename);

    # need this output statement here, as GShell.java's runRemote() sets status to CANCELLED
    # if there is no output! (Therefore, it only had this adverse effect when running GSDL remotely)
    # Do something useful with it: return the new filename without extension, used by remote GS server
    print STDOUT "$tmp_name\n";
}
&main(@ARGV);
Note: See TracBrowser for help on using the repository browser.