source: main/trunk/greenstone2/bin/script/gti-process-spreadsheet.pl@ 28503

Last change on this file since 28503 was 28503, checked in by ak19, 11 years ago
  1. In borderline cases, submitting a translation spreadsheet produced errors saying that the source string in the spreadsheet didn't match the one in the original translation file. The mismatches were caused by differences in newline entities and in when entities were expanded during the check that the source strings still match. These cases are now handled. 2. Added debugging statements in key positions (commented out).
  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 4.7 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gti-process-spreadsheet.pl
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 2005 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29
30use strict;
31
32
# Converts a translation spreadsheet, saved from Excel as UTF-16 text, into
# <SourceFileText>/<TargetFileText> blocks printed on STDOUT.
#
# Parameters:
#   $_[0] - path of the spreadsheet file saved in UTF-16 Excel text format
#           (required).
#
# Side effects:
#   - Runs iconv to create "<path>-utf8", unless that file already exists.
#   - Prints the number of chunks found to STDERR.
#   - Prints one SourceFileText/TargetFileText pair per chunk to STDOUT.
#
# Dies when the argument is missing, the input file does not exist, the
# conversion or the read fails, the chunk count is inconsistent, or a chunk
# has no parsable source or target section. All chunks are parsed before
# anything is printed, so a malformed file produces no partial output.
sub main
{
    # Required parameter: the path of the spreadsheet file saved in UTF-16 Excel text format
    my $utf16_excel_txt_file_path = shift(@_);
    if (!defined($utf16_excel_txt_file_path)) {
        die "Usage: gti-process-spreadsheet.pl <txt-file-path>\n";
    }

    # Ensure the UTF-16 Excel text file exists
    if (!-f $utf16_excel_txt_file_path) {
        die "Error: UTF-16 Excel text file $utf16_excel_txt_file_path does not exist.\n";
    }

    # Convert the Excel text file from UTF-16 to UTF-8
    my $excel_txt_file_path = $utf16_excel_txt_file_path . "-utf8";
    if (!-f $excel_txt_file_path) {
        # Only bother if the file doesn't already exist. The list form of
        # system() bypasses the shell, so spaces and metacharacters in the
        # path are passed to iconv untouched.
        my @iconv_command = ("iconv", "-f", "UTF-16", "-t", "UTF-8",
                             $utf16_excel_txt_file_path, "-o", $excel_txt_file_path);
        system(@iconv_command);
        my $iconv_status = $?;
        if ($iconv_status != 0) {
            # Don't leave a partial file behind: its mere existence would
            # make the next run skip the conversion.
            unlink($excel_txt_file_path);
            die "Error: Could not convert $utf16_excel_txt_file_path to UTF-8 (iconv status $iconv_status).\n";
        }
    }

    # Read the (UTF-8) Excel Unicode text file data
    open(my $excel_txt_file, '<', $excel_txt_file_path)
        or die "Error: Could not open $excel_txt_file_path for reading: $!\n";
    my $excel_txt_file_data = join("", <$excel_txt_file>);
    close($excel_txt_file);

    # Remove any nasty carriage returns
    $excel_txt_file_data =~ s/\r//g;

    # Make sure the first line is where we want it, and remove all stray whitespace
    $excel_txt_file_data =~ s/^(\n)*/\n\n/;
    $excel_txt_file_data =~ s/\n(\s*)\n/\n\n/g;

    # Split into chunks
    my @chunks = split(/\n\nsource::/, $excel_txt_file_data);
    shift(@chunks);  # Ignore the first (empty) chunk
    print STDERR "Number of chunks: " . scalar(@chunks) . "\n";

    # Check we've split the chunks correctly: every "source::" marker in the
    # data must have started a chunk
    my $total_number_of_chunks = () = $excel_txt_file_data =~ /source::/g;
    if (scalar(@chunks) != $total_number_of_chunks) {
        die "Error: Expected $total_number_of_chunks chunks but only have " . scalar(@chunks) . " from splitting";
    }

    # Parse and clean every submitted chunk before printing anything
    my @parsed_chunks = ();
    my $chunk_number = 0;
    foreach my $chunk (@chunks) {
        $chunk_number++;

        # Element 0 is the source section, element 1 the target section
        # (undefined when the chunk has no "target::" line)
        my @chunk_sections = split(/\ntarget::/, $chunk);

        my ($source_file_chunk_key, $source_file_chunk_text) =
            _parse_chunk_section($chunk_sections[0], "source", $chunk_number);
        my ($target_file_chunk_key, $target_file_chunk_text) =
            _parse_chunk_section($chunk_sections[1], "target", $chunk_number);

        push(@parsed_chunks, [ $source_file_chunk_key, _clean_chunk_text($source_file_chunk_text),
                               $target_file_chunk_key, _clean_chunk_text($target_file_chunk_text) ]);
    }

    foreach my $parsed_chunk (@parsed_chunks) {
        my ($source_file_chunk_key, $source_file_chunk_text,
            $target_file_chunk_key, $target_file_chunk_text) = @{$parsed_chunk};

        print "<SourceFileText key=\"" . $source_file_chunk_key . "\">\n" . $source_file_chunk_text . "\n</SourceFileText>\n";
        print "<TargetFileText key=\"" . $target_file_chunk_key . "\">\n" . $target_file_chunk_text . "\n</TargetFileText>\n";
    }
}


# Splits one "<key><whitespace><text>" section of a chunk into (key, text).
# A section that holds only a key yields an empty text. Dies when the
# section is undefined or does not start with a key, instead of letting the
# caller reuse the captures of an earlier match.
sub _parse_chunk_section
{
    my ($section, $section_name, $chunk_number) = @_;

    my ($key, $text);
    if (defined($section)) {
        ($key, $text) = ($section =~ /^(\S+)(?:\s+(.*))?$/s);
    }
    if (!defined($key)) {
        die "Error: Could not parse the $section_name section of chunk $chunk_number.\n";
    }
    if (!defined($text)) {
        $text = "";
    }
    # print STDERR "******** $section_name key: |$key| text: |$text|\n";

    return ($key, $text);
}


# Undoes the Excel text-export decorations on one chunk text and returns the
# cleaned copy.
sub _clean_chunk_text
{
    my ($text) = @_;

    # Remove the quotes around multiline chunks
    if ($text =~ /^\"/ && $text =~ /\"$/) {
        $text =~ s/^\"//;
        $text =~ s/\"$//;
    }

    # Remove the blank space Excel adds at the start of each line
    $text =~ s/\n /\n/g;

    # Remove Excel's doubled-up quotes
    $text =~ s/\"\"/\"/g;

    # ensure newline html entities in the unicode txt file version of the spreadsheet are replaced with newlines
    $text =~ s/&#10; /\n/g;

    return $text;
}
123
124
# Script entry point: hand the command-line arguments to main().
main(@ARGV);
Note: See TracBrowser for help on using the repository browser.