source: main/trunk/greenstone2/bin/script/gti-process-google-spreadsheet.pl@ 30499

Last change on this file since 30499 was 25289, checked in by ak19, 12 years ago

Adding a Perl file for processing translation spreadsheet files returned by translators working with the Google Translator Toolkit. Such Unicode (UTF-16) spreadsheet text files have the keys in one column, source chunks in a second column and translated target chunks in a third.

  • Property svn:executable set to *
File size: 3.9 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gti-process-google-spreadsheet.pl
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 2005 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29
30# This script takes a tab-delimited spreadsheet of 3 columns (the key, the English source,
31# and the <translated language> target) and prints the XML that Greenstone expects to STDOUT.
32# Use of this file and where it fits in the processing of translation files is
33# explained in gti-xml-to-spreadsheet.xsl (and gti-tmx-to-spreadsheet.xsl).
34
35
36use strict;
37
38
# main(<txt-file-path>)
#
# Entry point. Takes the path of a spreadsheet saved in UTF-16 Excel text
# format (tab-delimited columns: key, source, target; first row is a header),
# converts it to UTF-8 with iconv (the converted copy is written next to the
# input as "<txt-file-path>-utf8" and reused if it already exists), and prints
# one <SourceFileText> and one <TargetFileText> element per data row to STDOUT.
# The number of data rows is reported on STDERR.
#
# Dies if the path is missing, the file does not exist, the iconv conversion
# fails, or the converted file cannot be read.
sub main
{
    # Required parameter: the path of the spreadsheet file saved in UTF-16 Excel text format
    my $utf16_excel_txt_file_path = shift(@_);
    if (!defined($utf16_excel_txt_file_path)) {
        die "Usage: gti-process-google-spreadsheet.pl <txt-file-path>\n";
    }

    # Ensure the UTF-16 Excel text file exists
    if (!-f $utf16_excel_txt_file_path) {
        die "Error: UTF-16 Excel text file $utf16_excel_txt_file_path does not exist.\n";
    }

    # Convert the Excel text file from UTF-16 to UTF-8
    my $excel_txt_file_path = $utf16_excel_txt_file_path . "-utf8";
    if (!-f $excel_txt_file_path) {
        # Only bother if the file doesn't already exist.
        # List-form system() bypasses the shell, so paths containing spaces
        # or shell metacharacters are passed to iconv untouched.
        my @iconv_command = ('iconv', '-f', 'UTF-16', '-t', 'UTF-8',
                             $utf16_excel_txt_file_path, '-o', $excel_txt_file_path);
        if (system(@iconv_command) != 0) {
            # Don't leave a partial file behind: the next run would reuse it
            unlink($excel_txt_file_path);
            die "Error: could not convert $utf16_excel_txt_file_path to UTF-8 with iconv (status $?).\n";
        }
    }

    # Read the (UTF-8) Excel Unicode text file data. The data is treated as
    # raw bytes throughout, so it is printed back out exactly as it was read.
    open(my $excel_txt_fh, '<', $excel_txt_file_path)
        or die "Error: cannot read $excel_txt_file_path: $!\n";
    my @excel_txt_file_lines = <$excel_txt_fh>;
    close($excel_txt_fh);

    shift(@excel_txt_file_lines); # Ignore the header row (Key Source Target)

    # Strip the line endings and drop rows that carry no data at all
    # (e.g. a trailing blank line), which would otherwise produce elements
    # with an empty key.
    my @chunks = ();
    foreach my $line (@excel_txt_file_lines) {
        my $chunk = $line;
        # Remove every carriage return and newline, whichever the line ends with
        $chunk =~ tr/\r\n//d;
        push(@chunks, $chunk) if ($chunk =~ /\S/);
    }

    print STDERR "Number of chunks: " . scalar(@chunks) . "\n";

    # Process each submitted chunk, row by row
    foreach my $chunk (@chunks) {

        # each Excel row's 3 fields are delimited by tabs not commas
        my ($key, $source, $target) = split(/\t/, $chunk);
        $key = "" if (!defined($key));

        # split() drops trailing empty fields, so a row with an empty target
        # (or source) comes back short; the helper maps undef to "".
        $source = _clean_spreadsheet_field($source);
        $target = _clean_spreadsheet_field($target);

        print "<SourceFileText key=\"source::" . $key . "\">\n" . $source . "\n</SourceFileText>\n";
        print "<TargetFileText key=\"target::" . $key . "\">\n" . $target . "\n</TargetFileText>\n";
    }
}


# _clean_spreadsheet_field(<field>)
#
# Undoes the spreadsheet encoding of a single source/target cell and returns
# the cleaned text. An undefined field (column missing from the row) is
# returned as the empty string.
sub _clean_spreadsheet_field
{
    my ($text) = @_;
    $text = "" if (!defined($text));

    # Remove the quotes around multiline chunks
    if ($text =~ /^\"/ && $text =~ /\"$/) {
        $text =~ s/^\"//;
        $text =~ s/\"$//;
    }

    # Legacy: trim a single leading blank space
    $text =~ s/^ //;

    # Remove Excel's doubled-up quotes
    $text =~ s/\"\"/\"/g;

    # replace html entity for newline with actual newline
    # No longer need to replace commas (&#44;) and double-quotes (&#34;) here, because
    # gti-xml-to-spreadsheet.xslt doesn't insert entities for those anymore.
    # http://www.w3.org/MarkUp/html3/latin1.html
    $text =~ s/&#10;/\n/g;

    return $text;
}
114
115
# Run the conversion on the command-line arguments (the spreadsheet path).
main(@ARGV);
Note: See TracBrowser for help on using the repository browser.