Changeset 32089
- Timestamp:
- 2017-12-08T19:16:12+13:00 (6 years ago)
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
gs2-extensions/pdf-box/trunk/java/perllib/plugins/PDFBoxConverter.pm
r27510 r32089 32 32 no strict 'subs'; # allow barewords (eg STDERR) as function arguments 33 33 34 #use HTML::Entities; # for encoding characters into their HTML entities when PDFBox converts to text 35 34 36 use gsprintf 'gsprintf'; 37 use FileUtils; 35 38 36 39 # these two variables mustn't be initialised here or they will get stuck … … 257 260 #print STDERR "**** item file: $target_file_path\n"; 258 261 } 259 262 elsif ($self->{'converted_to'} eq "text") { 263 # ensure html entities are doubly escaped for pdfbox to text conversion: & -> &amp; 264 # conversion to html does it automatically, but conversion to text doesn't 265 # and this results in illegal characters in doc.xml 266 267 my $fulltext = &FileUtils::readUTF8File($target_file_path); 268 #$fulltext = &HTML::Entities::encode($fulltext); # doesn't seem to help 269 $fulltext =~ s@&@&amp;@sg; # Kathy's fix to ensure doc contents don't break XML 270 &FileUtils::writeUTF8File($target_file_path, \$fulltext); 271 } 272 260 273 if ($had_error) { 261 274 return (0, $result,$target_file_path); -
main/trunk/greenstone2/perllib/FileUtils.pm
r31724 r32089 60 60 # util::soft_link() => FileUtils::softLink() 61 61 62 # Functions that have been added, but not by John Thompson, 63 # So the implementations don't support parallel processing yet, but they print a warning and the 64 # correct implementation can be put into here. So that if all calls for reading and writing UTF8 65 # file content go through here, then they will do the right thing when the functions are updated. 66 # 67 # => FileUtils::readUTF8File() 68 # => FileUtils::writeUTF8File() 69 # 70 62 71 # Other functions in this file (perhaps some of these may have counterparts in util.pm too): 63 72 … … 962 971 ## readDirectory() 963 972 973 ## @function readUTF8File() 974 # 975 # read contents from a file containing UTF8. 976 # 977 # Parameter filename, the filepath to read from 978 # 979 sub readUTF8File 980 { 981 my $filename = shift(@_); 982 983 print STDERR "@@@ Warning FileUtils::readFile() not yet implemented for parallel processing. Using regular version...\n"; 984 985 open(FIN,"<$filename") or die "FileUtils::readFile: Unable to open $filename for reading...ERROR: $!\n"; 986 987 # decode the bytes in the file with UTF8 enc, 988 # to get unicode aware strings that represent utf8 chars 989 binmode(FIN,":utf8"); 990 991 my $contents; 992 # Read in the entire contents of the file in one hit 993 sysread(FIN, $contents, -s FIN); 994 close(FIN); 995 return $contents; 996 } 997 ## readUTF8File() 998 999 ## @function writeUTF8File() 1000 # 1001 # write UTF8 contents to a file. 1002 # 1003 # Parameter filename, the filepath to write to 1004 # Parameter contentRef, a *reference* to the contents to write out 1005 # 1006 sub writeUTF8File 1007 { 1008 my ($filename, $contentRef) = @_; 1009 1010 print STDERR "@@@ Warning FileUtils::writeFile() not yet implemented for parallel processing. 
Using regular version...\n"; 1011 1012 open(FOUT, ">$filename") or die "FileUtils::writeFile: Unable to open $filename for writing out contents...ERROR: $!\n"; 1013 # encode the unicode aware characters in the string as utf8 1014 # before writing out the resulting bytes 1015 binmode(FOUT,":utf8"); 1016 1017 print FOUT $$contentRef; 1018 close(FOUT); 1019 } 1020 ## writeUTF8File() 1021 964 1022 ## @function removeFiles() 965 1023 #
Note:
See TracChangeset
for help on using the changeset viewer.