source: main/trunk/greenstone2/build-src/src/java/org/nzdl/gsdl/ApplyXSLT.java@ 25285

Last change on this file since 25285 was 25285, checked in by ak19, 12 years ago

Updated to deal with paperspast.dm: added a new module to gti.pl for this. Because getting chunks from the paperspast.dm file results in attribute-like values in the source and target strings, there are now functions in ApplyXSLT.java to remove and retrieve these. Finally, the function that removes these attribute-like values from the source and target strings is called in the gti-generate-tmx-xml.xslt file.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.6 KB
Line 
1/**********************************************************************
2 *
3 * ApplyXSLT.java
4 *
5 * Copyright 2006-2010 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26
27
28package org.nzdl.gsdl;
29
30import java.io.*;
31
32import javax.xml.transform.Transformer;
33import javax.xml.transform.TransformerConfigurationException;
34import javax.xml.transform.TransformerException;
35import javax.xml.transform.TransformerFactory;
36import javax.xml.transform.stream.StreamResult;
37import javax.xml.transform.stream.StreamSource;
38
39import javax.xml.parsers.*;
40import javax.xml.transform.dom.*;
41import org.w3c.dom.*;
42
43
44
/**
 * Applies an XSLT stylesheet to XML using the JAXP transformation API.
 *
 * Two modes of operation are supported (see {@link #main(String[])}):
 * <ul>
 *   <li>File mode: an XML file (-x) is transformed with a stylesheet (-t) and
 *       written to an output file (-o) or to standard output.</li>
 *   <li>Pipe mode: when no XML file is given, documents are read from standard
 *       input. Each document is framed by a {@link #DOC_START} line, a line
 *       holding the output file name, the document lines, and a
 *       {@link #DOC_END} line. A {@link #INPUT_END} line terminates the input.</li>
 * </ul>
 *
 * The static helpers {@link #replaceAll}, {@link #getChunkString} and
 * {@link #getChunkAttr} are plain string utilities.
 */
public class ApplyXSLT
{

    /** Line marking the start of a piped document. */
    public static final String DOC_START = "<?DocStart?>";
    /** Line marking the end of a piped document. */
    public static final String DOC_END = "<?DocEnd?>";
    /** Line marking the end of all piped input. */
    public static final String INPUT_END = "<?Done?>";

    private static final String RECORD_ELEMENT = "record";
    private static final String CONTROLFIELD_ELEMENT = "controlfield";
    private static final String SUBFIELD_ELEMENT = "subfield";
    private static final String LEADER_ELEMENT = "leader";

    // States of the pipe reader in process()
    private final int BEFORE_READING = 0;
    private final int IS_READING = 1;

    private String xsl_file;
    private String mapping_file;

    private String sourcelang;
    private String targetlang;

    /**
     * Creates an instance with language settings only (the stylesheet is
     * supplied later to translate()).
     */
    public ApplyXSLT(String sourcelang, String targetlang)
    {
        initLanguages(sourcelang, targetlang);
    }

    /** Creates an instance for pipe mode without a mapping file. */
    public ApplyXSLT(String xsl_file, String sourcelang, String targetlang)
    {
        this.xsl_file = xsl_file;
        initLanguages(sourcelang, targetlang);
    }

    /** Creates an instance for pipe mode with a mapping file. */
    public ApplyXSLT(String xsl_file, String sourcelang, String targetlang, String mapping_file)
    {
        this.xsl_file = xsl_file;
        this.mapping_file = mapping_file;
        initLanguages(sourcelang, targetlang);
    }

    /**
     * Stores the language codes. If only the target language is provided,
     * the source language is assumed to be English ("en").
     */
    private void initLanguages(String sourcelang, String targetlang)
    {
        this.sourcelang = sourcelang;
        this.targetlang = targetlang;
        if (sourcelang.equals("") && !targetlang.equals("")) {
            this.sourcelang = "en";
        }
    }

    /**
     * Reads framed documents from standard input and transforms each one
     * into the output file named in its frame.
     *
     * @return true if the INPUT_END marker was reached; false if a
     *         transformation failed, an I/O error occurred, or the input ended
     *         without the INPUT_END marker.
     */
    private boolean process()
    {
        try {
            // Use System InputStream to receive piped data from the perl program
            BufferedReader br = new BufferedReader(new InputStreamReader(System.in, "UTF8"));

            int system_status = BEFORE_READING;
            StringBuffer a_doc = new StringBuffer();
            String output_file = "";

            // Loop until end of stream rather than polling ready(): ready() only
            // reports whether a read would block, so it can be false while the
            // producer on the other end of the pipe is still writing.
            String this_line;
            while ((this_line = br.readLine()) != null) {

                if (system_status == BEFORE_READING) {
                    if (this_line.equals(DOC_START)) {
                        // the line following the start marker is the output file name
                        output_file = br.readLine();
                        if (output_file == null) {
                            System.err.println("Unexpected end of input after " + DOC_START);
                            return false;
                        }
                        system_status = IS_READING;
                        a_doc = new StringBuffer();
                    }
                    else if (this_line.equals(INPUT_END)) {
                        return true;
                    }
                    else {
                        System.err.println("Undefined process status:" + this_line);
                        system_status = BEFORE_READING;
                    }
                }
                else if (system_status == IS_READING) {
                    if (this_line.equals(DOC_END)) {
                        boolean result;
                        if (mapping_file != null && !mapping_file.equals("")) {
                            result = translateXMLWithMapping(a_doc.toString(), output_file);
                        }
                        else {
                            result = translateXML(a_doc.toString(), output_file);
                        }

                        if (!result) {
                            System.err.println("Translation Failed!!");
                            return false;
                        }

                        system_status = BEFORE_READING;
                    }
                    else {
                        a_doc.append(this_line).append("\n");
                    }
                }
                else {
                    System.err.println ("Undefined system status in ApplyXSLT.java main().");
                    System.exit(-1);
                }
            }
        }
        catch (Exception e) {
            System.err.println("Receiving piped data error!" + e.toString());
        }

        return false;
    }

    /**
     * Transforms the given XML text with the configured stylesheet and
     * writes the result to output_file.
     *
     * @param full_doc    the XML document as text
     * @param output_file path of the file to write
     * @return always true; failures are reported through exceptions
     */
    private boolean translateXML(String full_doc, String output_file)
        throws IOException, TransformerException, TransformerConfigurationException, FileNotFoundException
    {
        StringReader str = new StringReader(full_doc);

        TransformerFactory tFactory = TransformerFactory.newInstance();
        Transformer transformer = tFactory.newTransformer(new StreamSource(xsl_file));

        setTransformerLanguageParams(transformer); // sourcelang and targetlang

        OutputStream out = new FileOutputStream(output_file);
        try {
            transformer.transform(new StreamSource(str), new StreamResult(out));
        }
        finally {
            out.close(); // release the file handle even if the transform fails
        }
        return true;
    }

    /**
     * Transforms the given XML text with the configured stylesheet, passing
     * the root element of the mapping file as the "mapping" stylesheet
     * parameter. The result is built as a DOM, the leader of every record
     * element is updated with the computed record length, and the DOM is then
     * serialised to output_file.
     *
     * @param full_doc    the XML document as text
     * @param output_file path of the file to write
     * @return true on success; false if any step failed (the stack trace is
     *         printed to standard error)
     */
    private boolean translateXMLWithMapping(String full_doc, String output_file)
        throws IOException, TransformerException, TransformerConfigurationException, FileNotFoundException
    {
        StringReader str = new StringReader(full_doc);

        try {
            TransformerFactory tFactory = TransformerFactory.newInstance();
            Transformer transformer = tFactory.newTransformer(new StreamSource(xsl_file));

            Document mapping_doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(mapping_file);
            Element mapping = mapping_doc.getDocumentElement();

            transformer.setParameter("mapping", mapping);
            setTransformerLanguageParams(transformer); // sourcelang and targetlang

            Document output_doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();

            transformer.transform(new StreamSource(str), new DOMResult(output_doc));

            calculateRecordsLength(output_doc);

            // a transformer created without a stylesheet copies the DOM to the stream
            transformer = tFactory.newTransformer();

            OutputStream out = new FileOutputStream(output_file);
            try {
                transformer.transform(new DOMSource(output_doc), new StreamResult(out));
            }
            finally {
                out.close();
            }
        }
        catch (Exception e) {
            e.printStackTrace();
            return false;
        }

        return true;
    }

    /** Updates the leader of every record element in the document. */
    private void calculateRecordsLength(Document output_doc)
    {
        NodeList records = output_doc.getDocumentElement().getElementsByTagName(RECORD_ELEMENT);

        for (int i = 0; i < records.getLength(); i++) {
            calculateRecordLength((Element) records.item(i));
        }
    }

    /**
     * Computes the record length as the total number of characters of text
     * directly inside all controlfield and subfield elements of the record,
     * and replaces the first text node of the record's first leader element
     * with that length, zero-padded to at least five characters.
     */
    private void calculateRecordLength(Element record)
    {
        int total_length = 0;

        NodeList controlfields = record.getElementsByTagName(CONTROLFIELD_ELEMENT);
        for (int i = 0; i < controlfields.getLength(); i++) {
            total_length += getElementTextValue((Element) controlfields.item(i)).length();
        }

        NodeList subfields = record.getElementsByTagName(SUBFIELD_ELEMENT);
        for (int i = 0; i < subfields.getLength(); i++) {
            total_length += getElementTextValue((Element) subfields.item(i)).length();
        }

        // the record length needs to be (at least) five characters long
        StringBuffer record_length = new StringBuffer(String.valueOf(total_length));
        while (record_length.length() < 5) {
            record_length.insert(0, '0');
        }

        NodeList leaders = record.getElementsByTagName(LEADER_ELEMENT);

        // only one leader element is expected; use the first
        if (leaders.getLength() > 0) {
            Element leader_element = (Element) leaders.item(0);
            removeFirstTextNode(leader_element);
            leader_element.insertBefore(
                leader_element.getOwnerDocument().createTextNode(record_length.toString()),
                leader_element.getFirstChild());
        }
    }

    /** Removes the first direct text-node child of the element, if any. */
    private void removeFirstTextNode(Element element)
    {
        NodeList children_nodelist = element.getChildNodes();
        for (int i = 0; i < children_nodelist.getLength(); i++) {
            Node child_node = children_nodelist.item(i);
            if (child_node.getNodeType() == Node.TEXT_NODE) {
                element.removeChild(child_node);
                return;
            }
        }
    }

    /** Returns the concatenated values of the element's direct text-node children. */
    private String getElementTextValue(Element element)
    {
        StringBuffer text = new StringBuffer();

        NodeList children_nodelist = element.getChildNodes();
        for (int i = 0; i < children_nodelist.getLength(); i++) {
            Node child_node = children_nodelist.item(i);
            if (child_node.getNodeType() == Node.TEXT_NODE) {
                text.append(child_node.getNodeValue());
            }
        }

        return text.toString();
    }

    /**
     * Points the top-level xsl:variable named "mapping" of a stylesheet
     * document at the Mapping element of the mapping file.
     * (Not called from within this class.)
     */
    private void setMappingVariable(Document style_doc)
    {
        Node child = style_doc.getDocumentElement().getFirstChild();
        while (child != null) {
            String name = child.getNodeName();
            if (name.equals("xsl:variable")) {
                Element variable_element = (Element) child;
                if (variable_element.getAttribute("name").trim().equals("mapping")) {
                    variable_element.setAttribute("select", "document('" + mapping_file + "')/Mapping");
                }
            }
            child = child.getNextSibling();
        }
    }

    /**
     * Passes the "sourcelang" and "targetlang" stylesheet parameters to the
     * transformer, but only when a target language has been set.
     */
    private void setTransformerLanguageParams(Transformer transformer)
    {
        // compare string content, not object identity
        if (targetlang != null && targetlang.length() > 0) {
            transformer.setParameter("sourcelang", sourcelang);
            transformer.setParameter("targetlang", targetlang);
        }
    }

    /**
     * Transforms an XML file with a stylesheet file.
     *
     * @param xml_file    path of the XML input file
     * @param xsl_file    path or URI of the stylesheet
     * @param output_file path of the file to write as UTF-8, or "" to write
     *                    to standard output
     */
    private void translate(String xml_file, String xsl_file, String output_file)
        throws IOException, TransformerException, TransformerConfigurationException, FileNotFoundException
    {
        TransformerFactory tFactory = TransformerFactory.newInstance();
        Transformer transformer = tFactory.newTransformer(new StreamSource(xsl_file));

        boolean to_stdout = output_file.equals("");
        OutputStreamWriter output;
        if (to_stdout) {
            output = new OutputStreamWriter(System.out, "UTF-8");
        }
        else {
            output = new OutputStreamWriter(new FileOutputStream(output_file), "UTF-8");
        }

        try {
            setTransformerLanguageParams(transformer); // sourcelang and targetlang
            transformer.transform(new StreamSource(new File(xml_file)), new StreamResult(output));
        }
        finally {
            if (to_stdout) {
                output.flush(); // push buffered characters out, but leave System.out open
            }
            else {
                output.close();
            }
        }
    }

    /**
     * Thin static wrapper around {@link String#replaceAll(String, String)}.
     *
     * @param source_string  the text to operate on
     * @param match_regexp   regular expression to match
     * @param replace_string replacement text
     * @return source_string with every match replaced
     */
    static public String replaceAll(String source_string, String match_regexp, String replace_string)
    {
        return source_string.replaceAll(match_regexp, replace_string);
    }

    // Necessary for paperspast.dm, but can be used generally.
    // The get-chunks cmd of gti.pl perl script when run over paperspast.dm returns XML with source and target lines
    // like: [c=paperspast] {All newspapers} for source and [c=paperspast,l=mi] {Niupepa katoa} for target
    // This function returns just the 'string' portion of the chunk of data: e.g 'All newspapers' and 'Niupepa katoa'
    /**
     * @param target_file_text the chunk text
     * @return the text between the first '{' and the last '}', or the input
     *         unchanged if it does not start with '[' or has no well-ordered
     *         brace pair
     */
    static public String getChunkString(String target_file_text)
    {
        // only input that starts with '[' requires processing
        if (target_file_text.indexOf("[") != 0) {
            return target_file_text;
        }

        int startindex = target_file_text.indexOf("{");
        int endindex = target_file_text.lastIndexOf("}");
        // require '{' to precede '}' so that substring() cannot be handed a reversed range
        if (startindex != -1 && endindex > startindex) {
            return target_file_text.substring(startindex + 1, endindex); // skips { and }
        }
        return target_file_text;
    }

    // Necessary for paperspast.dm, but can be used generally.
    // The get-chunks cmd of gti.pl perl script when run over paperspast.dm returns XML with source and target lines
    // like: [c=paperspast] {All newspapers} for source and [c=paperspast,l=mi] {Niupepa katoa} for target
    // This function returns just the 'attribute' portion of the chunk of data: e.g 'c=paperspast' and 'c=paperspast,l=mi'
    /**
     * @param target_file_text the chunk text
     * @return the trimmed text between the leading '[' and the last ']' that
     *         precedes the first '{', or the input unchanged if it does not
     *         match that shape or the brackets are empty
     */
    static public String getChunkAttr(String target_file_text)
    {
        // only input that starts with '[' requires processing
        if (target_file_text.indexOf("[") != 0) {
            return target_file_text;
        }

        int startindex = target_file_text.indexOf("{");
        int endindex = target_file_text.lastIndexOf("}");
        if (startindex != -1 && endindex != -1) {
            endindex = target_file_text.lastIndexOf("]", startindex); // look for ] preceding the {
            if (endindex > 1) {
                // so there's something to substring between [ and ]
                return target_file_text.substring(1, endindex).trim(); // skips [ and ]
            }
        }
        return target_file_text;
    }

    /**
     * Command-line entry point; see printUsage() for the options.
     * With -t but no -x, documents are read from standard input (pipe mode);
     * with both -x and -t, the XML file is transformed directly.
     */
    public static void main(String[] args)
    {
        String xml_file = "";
        String xsl_file = "";
        String mapping_file = "";
        String output_file = "";

        String sourcelang = "";
        String targetlang = "";

        // Checking Arguments
        if (args.length < 1) {
            printUsage();
        }

        // NOTE(review): stripping "file:///" leaves a relative path for URLs such as
        // file:///home/x; presumably only drive-letter style URLs are passed - confirm.
        for (int i = 0; i < args.length; i++) {
            boolean has_value = i + 1 < args.length && !args[i + 1].startsWith("-");

            if (args[i].equals("-m") && has_value) {
                mapping_file = args[++i];
                checkFile(mapping_file.replaceAll("file:///", ""));
            }
            else if (args[i].equals("-x") && has_value) {
                xml_file = args[++i];
                checkFile(xml_file.replaceAll("file:///", ""));
            }
            else if (args[i].equals("-t") && has_value) {
                xsl_file = args[++i];
                checkFile(xsl_file.replaceAll("file:///", ""));
            }
            else if (args[i].equals("-o") && has_value) {
                output_file = args[++i];
            }
            // The two language parameters (-s and -l) are for the gti-generate-tmx-xml file
            // which requires the target lang (code), and will accept the optional source lang (code)
            else if (args[i].equals("-s") && has_value) {
                sourcelang = args[++i];
            }
            else if (args[i].equals("-l") && has_value) {
                targetlang = args[++i];
            }
            else {
                // covers -h, unknown options and options missing their value
                printUsage();
            }
        }

        ApplyXSLT core = null;

        if (xml_file.equals("") && !xsl_file.equals("")) { // read from pipe line
            if (mapping_file.equals("")) {
                core = new ApplyXSLT(xsl_file, sourcelang, targetlang);
            }
            else {
                // argument order must match the constructor:
                // (xsl_file, sourcelang, targetlang, mapping_file)
                core = new ApplyXSLT(xsl_file, sourcelang, targetlang, mapping_file);
            }

            core.process();
        }
        else if (!xml_file.equals("") && !xsl_file.equals("")) {
            core = new ApplyXSLT(sourcelang, targetlang);
            try {
                core.translate(xml_file, xsl_file, output_file);
            }
            catch (Exception e) {
                e.printStackTrace();
            }
        }
        else {
            printUsage();
        }
    }

    /** Exits with status -1 and an error message if the file does not exist. */
    private static void checkFile(String filename)
    {
        File file = new File(filename);
        if (!file.exists()) {
            System.out.println("Error: " + filename + " doesn't exist!");
            System.exit(-1);
        }
    }

    /** Prints the command-line usage and exits with status -1. */
    private static void printUsage()
    {
        System.out.println("Usage: ApplyXSLT -x File -t File [-m File] [-o File] [-s sourcelang] [-l targetlang]");
        System.out.println("\t-x specifies the xml file (Note: optional for piped xml data)");
        System.out.println("\t-t specifies the xsl file");
        System.out.println("\t-m specifies the mapping file (for MARCXMLPlugout.pm only)");
        System.out.println("\t-o specifies the output file name (output to screen if this option is absent)");
        System.out.println("\t-s specifies the input language code for generating TMX file. Defaults to 'en' if none is provided");
        System.out.println("\t-l specifies the output language code. Required if generating a TMX file.");
        System.exit(-1);
    }
}
485
486
Note: See TracBrowser for help on using the repository browser.