source: main/trunk/greenstone2/build-src/src/java/org/nzdl/gsdl/ApplyXSLT.java@ 38850

Last change on this file since 38850 was 38850, checked in by davidb, 3 months ago

How this Java code was determining end-of-input was found to be incorrect; having read the docs more carefully, this is an updated version that works even if there is a delay in input arriving on standard-in

  • Property svn:keywords set to Author Date Id Revision
File size: 23.3 KB
Line 
1/**********************************************************************
2 *
3 * ApplyXSLT.java
4 *
5 * Copyright 2006-2010 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26
27
28package org.nzdl.gsdl;
29
30import java.io.*;
31import java.util.HashMap;
32import java.util.Iterator;
33import java.util.Map;
34import java.util.Map.Entry;
35
36import javax.xml.transform.Transformer;
37import javax.xml.transform.TransformerConfigurationException;
38import javax.xml.transform.TransformerException;
39import javax.xml.transform.TransformerFactory;
40import javax.xml.transform.stream.StreamResult;
41import javax.xml.transform.stream.StreamSource;
42
43import javax.xml.parsers.*;
44import javax.xml.transform.dom.*;
45import org.w3c.dom.*;
46
47
48
49/**
50 * Use the TraX interface to perform a transformation in the simplest manner possible
51 * (3 statements).
52 */
53public class ApplyXSLT
54{
55
56 public static final String DOC_START = new String ("<?DocStart?>");
57 public static final String DOC_END = new String ("<?DocEnd?>");
58 public static final String INPUT_END = new String ("<?Done?>");
59
60 private static final String RECORD_ELEMENT = "record";
61 private static final String CONTROLFIELD_ELEMENT = "controlfield";
62 private static final String SUBFIELD_ELEMENT = "subfield";
63 private static final String LEADER_ELEMENT = "leader";
64
65 private final int BEFORE_READING = 0;
66 private final int IS_READING = 1;
67 private String xsl_file;
68 private String mapping_file;
69
70 private String sourcelang;
71 private String targetlang;
72 private HashMap paramMap;
73
74 public ApplyXSLT(String sourcelang, String targetlang, HashMap param_map){
75 initParams(sourcelang, targetlang, param_map);
76 }
77
78 public ApplyXSLT(String xsl_file, String sourcelang, String targetlang, HashMap param_map)
79 {
80 this.xsl_file = xsl_file;
81 initParams(sourcelang, targetlang, param_map);
82 }
83
84 public ApplyXSLT(String xsl_file, String mapping_file, String sourcelang, String targetlang, HashMap param_map) {
85 this.xsl_file = xsl_file;
86 this.mapping_file = mapping_file;
87 initParams(sourcelang, targetlang, param_map);
88 }
89
90 private void initParams(String sourcelang, String targetlang, HashMap param_map)
91 {
92 this.sourcelang = sourcelang;
93 this.targetlang = targetlang;
94 // if only target language is provided, assume source language is English
95 if(sourcelang.equals("") && !targetlang.equals("")) {
96 this.sourcelang = "en";
97 }
98
99 // any custom parameters to be passed into the XSLT would be in the map by now
100 paramMap = param_map;
101 }
102
103 /*
104 private boolean isOpen(BufferedReader br) {
105 // The code used to rely on br.ready() to determine whether it should
106 // continue looping to read input (from standard-in). However on
107 // closer read of the JavaDoc, br.ready() can return false even
108 // when the input stream is still open -- it's just that there isn't
109 // any input lines to read in at the moment.
110
111 // If the input stream has been closed (from the external calling Perl code),
112 // then the only sure way to determine that this condition has arisen is
113 // to ask br.ready() and catch the Exception that occurs ... because the
114 // input stream is no longer open!
115
116 boolean is_open = true;
117
118 try {
119 boolean is_ready = br.ready();
120 }
121 catch (Exception e) { // Could be more selective and go for IOException ??
122 //System.err.println("ApplyXSLT::isOpen() encountered an exception => so input is closed");
123 is_open = false;
124 }
125
126 return is_open;
127 }
128 */
129
130 private boolean process()
131 {
132 try{
133
134 // Use System InputStream to receive piped data from the perl program
135 InputStreamReader ir = new InputStreamReader(System.in, "UTF8");
136 BufferedReader br = new BufferedReader(ir);
137
138 int system_status = BEFORE_READING;
139 StringBuffer a_doc = new StringBuffer();
140 String output_file = new String();
141
142
143 String this_line;
144 while ((this_line = br.readLine()) != null) {
145
146 //System.err.println("Read in from pipe, line: " + this_line);
147
148 if(system_status == BEFORE_READING){
149 if(this_line.compareTo(DOC_START) == 0){
150 // If this_line is DOC_START then we require the next line of input to be
151 // the filename
152 output_file = br.readLine(); // read the next line as the output file name
153 if (output_file == null) {
154 // A problem of some form occurred
155 return false;
156 }
157 //System.err.println("Read in from pipe, next line: " + output_file);
158 system_status = IS_READING;
159 a_doc = new StringBuffer();
160 }
161 else if(this_line.compareTo(INPUT_END) == 0){
162 return true;
163 }
164 else{
165 system_status = BEFORE_READING;
166 }
167
168 }
169 else if(system_status == IS_READING){
170 if(this_line.compareTo(DOC_END) == 0){
171 boolean result = false;
172 if (mapping_file !=null && !mapping_file.equals("")){
173 result = translateXMLWithMapping(a_doc.toString(), output_file);
174 }
175 else{
176 result = translateXML(a_doc.toString(), output_file);
177 }
178
179 if (!result){
180 return false;
181 }
182
183 system_status = BEFORE_READING;
184
185 }
186 else{
187 a_doc.append(this_line + "\n");
188 }
189 }
190 else{
191 System.err.println ("Undefined system status in ApplyXSLT.java main().");
192 System.exit(-1);
193 }
194
195 }
196
197 return true;
198
199 //if(br != null) {
200 // br.close();
201 // br = null;
202 //}
203 }
204 catch (Exception e) {
205 System.err.println("Receiving piped data error!" + e.toString());
206 e.printStackTrace();
207 }
208
209 return false;
210 }
211
212 // reads xml from stdin, but no <?DocStart?><?DocEnd?> markers, and sends output to STDOUT
213 private boolean processPipedFromStdIn()
214 {
215 try{
216 //System.err.println("Received nothing\n");
217
218 ReadStreamGobbler readInStreamGobbler = new ReadStreamGobbler(System.in, true);
219 readInStreamGobbler.start();
220 readInStreamGobbler.join();
221 String outputstr = readInStreamGobbler.getOutput();
222
223 // Using join() above, even though we use only one streamGobbler thread, and even
224 // though we're not dealing with the input/output/error streams of a Process object.
225 // But the join() call here didn't break things.
226 // http://www.javaworld.com/article/2071275/core-java/when-runtime-exec---won-t.html?page=2
227
228 boolean result = false;
229 if (mapping_file !=null && !mapping_file.equals("")){
230 result = translateXMLWithMapping(outputstr, null); // null: no outputfile, send to STDOUT
231 }
232 else{
233 result = translateXML(outputstr, null); // null: no outputfile, send to STDOUT
234 }
235
236 if (!result){
237 System.err.println("Translation Failed!!");
238 return false;
239 } else {
240 return true;
241 }
242 }catch (Exception e) {
243 System.err.println("Receiving piped data error!" + e.toString());
244 e.printStackTrace();
245 }
246 return false;
247 }
248
249
250 private boolean translateXML(String full_doc, String output_file)
251 throws IOException,TransformerException, TransformerConfigurationException, FileNotFoundException
252 {
253
254 StringReader str = new StringReader(full_doc) ;
255
256 TransformerFactory tFactory = TransformerFactory.newInstance();
257 Transformer transformer = tFactory.newTransformer(new StreamSource(xsl_file));
258
259 setTransformerParams(transformer); // sourcelang and targetlang and any further custom parameters to be passed into the XSLT
260
261 if(output_file != null) {
262 transformer.transform(new StreamSource(str), new StreamResult(new FileOutputStream(output_file)));
263 } else {
264 transformer.transform(new StreamSource(str), new StreamResult(System.out));
265 }
266 return true;
267 }
268
269 private boolean translateXMLWithMapping(String full_doc, String output_file)
270 throws IOException,TransformerException, TransformerConfigurationException, FileNotFoundException
271 {
272 StringReader str = new StringReader(full_doc) ;
273
274 try{
275 TransformerFactory tFactory = TransformerFactory.newInstance();
276 Transformer transformer = tFactory.newTransformer(new StreamSource(xsl_file));
277
278 Document mapping_doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(mapping_file);
279 Element mapping =mapping_doc.getDocumentElement();
280
281 transformer.setParameter("mapping",mapping);
282 setTransformerParams(transformer); // sourcelang and targetlang and any further custom parameters to be passed into the XSLT
283
284 Document output_doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
285
286 transformer.transform(new StreamSource(str), new DOMResult(output_doc));
287
288 calculateRecordsLength(output_doc);
289
290 transformer = tFactory.newTransformer();
291
292 transformer.transform(new DOMSource(output_doc), new StreamResult(new FileOutputStream(output_file)));
293
294 }
295 catch(Exception e){
296 e.printStackTrace();
297 return false;
298 }
299
300 return true;
301 }
302
303 private void calculateRecordsLength(Document output_doc){
304 NodeList records = output_doc.getDocumentElement().getElementsByTagName(RECORD_ELEMENT);
305
306 for(int i=0;i<records.getLength();i++){
307 Element record = (Element)records.item(i);
308 calculateRecordLength(record);
309 }
310 }
311
312 private void calculateRecordLength(Element record){
313 int total_length =0;
314 NodeList controlfileds = record.getElementsByTagName(CONTROLFIELD_ELEMENT);
315 for(int i=0;i<controlfileds.getLength();i++){
316 Element controlfiled = (Element)controlfileds.item(i);
317 total_length +=getElementTextValue(controlfiled).length();
318 }
319
320 NodeList subfileds = record.getElementsByTagName(SUBFIELD_ELEMENT);
321 for(int i=0;i<subfileds.getLength();i++){
322 Element subfiled = (Element)subfileds.item(i);
323 total_length +=getElementTextValue(subfiled).length();
324 }
325
326 String record_length = total_length+"";
327 //fill in a extra digit as record length needs to be five characters long
328 if (total_length < 10000){
329 record_length = "0"+record_length;
330 if (total_length < 1000){
331 record_length = "0"+record_length;
332 }
333 if (total_length < 100){
334 record_length = "0"+record_length;
335 }
336 if (total_length < 10){
337 record_length = "0"+record_length;
338 }
339
340 }
341
342 NodeList leaders = record.getElementsByTagName(LEADER_ELEMENT);
343
344 //only one leader element
345 if (leaders.getLength() >0){
346 Element leader_element = (Element)leaders.item(0);
347 removeFirstTextNode(leader_element);
348 leader_element.insertBefore(leader_element.getOwnerDocument().createTextNode(record_length),leader_element.getFirstChild());
349 }
350
351 }
352
353 private void removeFirstTextNode(Element element){
354 //remove the first text node
355 NodeList children_nodelist = element.getChildNodes();
356 for (int i = 0; i < children_nodelist.getLength(); i++) {
357 Node child_node = children_nodelist.item(i);
358 if (child_node.getNodeType() == Node.TEXT_NODE) {
359 element.removeChild(child_node);
360 return;
361 }
362 }
363
364 }
365
366 private String getElementTextValue(Element element)
367 {
368 String text ="";
369
370 // Find the node child
371 NodeList children_nodelist = element.getChildNodes();
372 for (int i = 0; i < children_nodelist.getLength(); i++) {
373 Node child_node = children_nodelist.item(i);
374 if (child_node.getNodeType() == Node.TEXT_NODE) {
375 text +=child_node.getNodeValue();
376 }
377 }
378
379 return text;
380 }
381
382
383 private void setMappingVariable(Document style_doc){
384 Node child = style_doc.getDocumentElement().getFirstChild();
385 while(child != null) {
386 String name = child.getNodeName();
387 if (name.equals("xsl:variable")) {
388 Element variable_element = (Element)child;
389 if ( variable_element.getAttribute("name").trim().equals("mapping")){
390 variable_element.setAttribute("select","document('"+mapping_file+"')/Mapping");
391 }
392 }
393 child = child.getNextSibling();
394 }
395
396 }
397
398 private void setTransformerParams(Transformer transformer)
399 {
400 if(targetlang != "") {
401 transformer.setParameter("sourcelang",sourcelang);
402 transformer.setParameter("targetlang",targetlang);
403 }
404
405 // handle any custom parameters that are also to be passed into the XSLT
406 Iterator i = paramMap.entrySet().iterator();
407 while(i.hasNext()) {
408 Map.Entry entry = (Map.Entry)i.next();
409 String paramName = (String)entry.getKey();
410 String paramValue = (String)entry.getValue();
411
412 transformer.setParameter(paramName, paramValue);
413 }
414
415 }
416
417 private void translate(String xml_file, String xsl_file, String output_file)throws IOException,TransformerException, TransformerConfigurationException, FileNotFoundException, IOException{
418
419 TransformerFactory tFactory = TransformerFactory.newInstance();
420 Transformer transformer = tFactory.newTransformer(new StreamSource(xsl_file));
421
422 OutputStreamWriter output = null;
423 if (output_file.equals("")) {
424 output = new OutputStreamWriter(System.out, "UTF-8");
425 }
426 else{
427 output = new OutputStreamWriter(new FileOutputStream(output_file), "UTF-8");
428 }
429
430 setTransformerParams(transformer); // sourcelang and targetlang and any further custom parameters to be passed into the XSLT
431 transformer.transform(new StreamSource(new File(xml_file)),new StreamResult(output));
432
433 }
434 private void translateWithMapping(String xml_file, String xsl_file, String mapping_file, String output_file)throws IOException,TransformerException, TransformerConfigurationException, FileNotFoundException {
435
436 TransformerFactory tFactory = TransformerFactory.newInstance();
437 Transformer transformer = tFactory.newTransformer(new StreamSource(xsl_file));
438
439 OutputStreamWriter output = null;
440 if (output_file.equals("")) {
441 output = new OutputStreamWriter(System.out, "UTF-8");
442 }
443 else{
444 output = new OutputStreamWriter(new FileOutputStream(output_file), "UTF-8");
445 }
446 try {
447 Document mapping_doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(mapping_file);
448 Element mapping =mapping_doc.getDocumentElement();
449
450 transformer.setParameter("mapping",mapping);
451 } catch (Exception e) {
452 System.err.println("Couldn't load in mapping file");
453 e.printStackTrace();
454 }
455 setTransformerParams(transformer); // sourcelang and targetlang and any further custom parameters to be passed into the XSLT
456 transformer.transform(new StreamSource(new File(xml_file)),new StreamResult(output));
457
458 }
459
460 static public String replaceAll(String source_string, String match_regexp, String replace_string)
461 {
462 return source_string.replaceAll(match_regexp, replace_string);
463 }
464
465 // Necessary for paperspast.dm, but can be used generally.
466 // The get-chunks cmd of gti.pl perl script when run over paperspast.dm returns XML with source and target lines
467 // like: [c=paperspast] {All newspapers} for source and [c=paperspast,l=mi] {Niupepa katoa} for target
468 // This function returns just the 'string' portion of the chunk of data: e.g 'All newspapers' and 'Niupepa katoa'
469 static public String getChunkString(String target_file_text)
470 {
471 int startindex = target_file_text.indexOf("[");
472 if(startindex != 0) {
473 return target_file_text;
474 } // to test that the input requires processing
475
476 // else
477 startindex = target_file_text.indexOf("{");
478 int endindex = target_file_text.lastIndexOf("}");
479 if(startindex != -1 && endindex != -1) {
480 return target_file_text.substring(startindex+1, endindex); // skips { and }
481 } else {
482 return target_file_text;
483 }
484
485 }
486
487 // Necessary for paperspast.dm, but can be used generally.
488 // The get-chunks cmd of gti.pl perl script when run over paperspast.dm returns XML with source and target lines
489 // like: [c=paperspast] {All newspapers} for source and [c=paperspast,l=mi] {Niupepa katoa} for target
490 // This function returns just the 'attribute' portion of the chunk of data: e.g 'c=paperspast' and 'c=paperspast,l=mi'
491 static public String getChunkAttr(String target_file_text)
492 {
493 int startindex = target_file_text.indexOf("[");
494 if(startindex != 0) {
495 return target_file_text;
496 } // to test that the input requires processing
497
498 // else
499 startindex = target_file_text.indexOf("{");
500 int endindex = target_file_text.lastIndexOf("}");
501 if(startindex != -1 && endindex != -1) {
502 endindex = target_file_text.lastIndexOf("]", startindex); // look for ] preceding the {
503 if(endindex > 1) { //if(endindex != -1) {
504 // so there's something to substring between [ and ]
505 return target_file_text.substring(1, endindex).trim(); // skips [ and ]
506 }
507 }
508 return target_file_text;
509 }
510
511 public static void main(String[] args)
512 {
513 String xml_file="";
514 String xsl_file="";
515 String mapping_file="";
516 String output_file="";
517
518 String sourcelang="";
519 String targetlang="";
520
521 boolean readFromStdInFlag = false;
522
523 HashMap paramMap = new HashMap();
524 int index = -1; // index of the '=' sign in cmdline argument specifying custom parameters to be passed into the XSLT
525
526 // Checking Arguments
527 if(args.length < 1)
528 {
529 printUsage();
530 }
531
532 for (int i=0;i<args.length;i++){
533 if (args[i].equals("-m") && i+1 < args.length && !args[i+1].startsWith("-")){
534 mapping_file = args[++i];
535 checkFile(mapping_file.replaceAll("file:///",""));
536 }
537 else if (args[i].equals("-x") && i+1 < args.length && !args[i+1].startsWith("-")){
538 xml_file = args[++i];
539 checkFile(xml_file.replaceAll("file:///",""));
540 }
541 else if(args[i].equals("-t") && i+1 < args.length && !args[i+1].startsWith("-")){
542 xsl_file = args[++i];
543 checkFile( xsl_file.replaceAll("file:///",""));
544 }
545 else if(args[i].equals("-o") && i+1 < args.length && !args[i+1].startsWith("-")){
546 output_file = args[++i];
547
548 }
549 // The two language parameters (-s and -l) are for the gti-generate-tmx-xml file
550 // which requires the target lang (code), and will accept the optional source lang (code)
551 else if(args[i].equals("-s") && i+1 < args.length && !args[i+1].startsWith("-")){
552 sourcelang = args[++i];
553 }
554 else if(args[i].equals("-l") && i+1 < args.length && !args[i+1].startsWith("-")){
555 targetlang = args[++i];
556 }
557 else if(args[i].equals("-c")){
558 readFromStdInFlag = true;
559 }
560 else if(args[i].equals("-h")){
561 printUsage();
562 }
563 else if ((index = args[i].indexOf("=")) != -1) { // custom parameters provided on the cmdline in the form paramName1=paramValue1 paramName2=paramValue2 etc
564 // that are to be passed into the XSLT
565 String paramName = args[i].substring(0, index);
566 String paramValue = args[i].substring(index+1); // skip the = sign
567 paramMap.put(paramName, paramValue);
568 index = -1;
569 }
570 else{
571 printUsage();
572 }
573
574 }
575
576
577 ApplyXSLT core = null;
578
579 if (xml_file.equals("") && !xsl_file.equals("")){//read from pipe line
580 if (mapping_file.equals("")){
581 core = new ApplyXSLT(xsl_file, sourcelang, targetlang, paramMap);
582 }
583 else{
584 core = new ApplyXSLT(xsl_file, mapping_file, sourcelang, targetlang, paramMap);
585 }
586
587 if (core != null){
588 if(readFromStdInFlag) { // ApplyXSLT was run with -c: read from pipe but no <?DocStart?><?DocEnd?> markers
589 core.processPipedFromStdIn();
590 }
591 core.process(); //read from pipe line, but expecting <?DocStart?><?DocEnd?> embedding markers
592 }
593 else{
594 printUsage();
595 }
596 }
597 else if(!xml_file.equals("") && !xsl_file.equals("")){
598 core = new ApplyXSLT(sourcelang, targetlang, paramMap);
599 try {
600 if (mapping_file.equals("")) {
601 core.translate(xml_file,xsl_file,output_file);
602 } else {
603 core.translateWithMapping(xml_file,xsl_file,mapping_file, output_file);
604 }
605 }
606 catch(Exception e){e.printStackTrace();}
607 }
608 else{
609 printUsage();
610 }
611
612 }
613
614 private static void checkFile(String filename){
615 File file = new File(filename);
616 if (!file.exists()){
617 System.out.println("Error: "+filename+" doesn't exist!");
618 System.exit(-1);
619 }
620 }
621
622 private static void printUsage(){
623 System.out.println("Usage: ApplyXSLT -x File -t File [-m File] [-o File] [-s sourcelang] [-l targetlang] [param-name=param-value]");
624 System.out.println("\t-x specifies the xml file (Note: optional for piped xml data)");
625 System.out.println("\t-c read xml file piped from stdin but without DocStart/DocEnd markers. Writes to stdout");
626 System.out.println("\t-t specifies the xsl file");
627 System.out.println("\t-m specifies the mapping file (for MARCXMLPlugout.pm only)");
628 System.out.println("\t-o specifies the output file name (output to screen if this option is absent)");
629 System.out.println("\t-s specifies the input language code for generating TMX file. Defaults to 'en' if none is provided");
630 System.out.println("\t-l specifies the output language code. Required if generating a TMX file.");
631 System.out.println("\tFor general transformations of an XML by an XSLT, you can pass in parameter name=value pairs if any need to passed on into the XSLT as xsl params.");
632 System.exit(-1);
633 }
634
635
636 // StreamGobblers used in reading/writing to Process' input and outputstreams can be re-used more generally.
637 // Here in ApplyXSTL.java we use it to read from a pipe line (stdin piped into this ApplyXSLT.java)
638 // Code based on http://www.javaworld.com/article/2071275/core-java/when-runtime-exec---won-t.html?page=2
639 class ReadStreamGobbler extends Thread
640 {
641 InputStream is = null;
642 StringBuffer outputstr = new StringBuffer();
643 boolean split_newlines = false;
644
645
646 public ReadStreamGobbler(InputStream is)
647 {
648 this.is = is;
649 split_newlines = false;
650 }
651
652 public ReadStreamGobbler(InputStream is, boolean split_newlines)
653 {
654 this.is = is;
655 this.split_newlines = split_newlines;
656 }
657
658 public void run()
659 {
660 BufferedReader br = null;
661 try {
662 br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
663 String line=null;
664 while ( (line = br.readLine()) != null) {
665 //System.out.println("@@@ GOT LINE: " + line);
666 outputstr.append(line);
667 if(split_newlines) {
668 outputstr.append("\n");
669 }
670 }
671 } catch (IOException ioe) {
672 ioe.printStackTrace();
673 } finally {
674 System.err.println("ReadStreamGobbler:run() finished. Closing resource");
675 closeResource(br);
676 }
677 }
678
679 public String getOutput() {
680 return outputstr.toString();
681 }
682
683 // http://docs.oracle.com/javase/tutorial/essential/exceptions/finally.html
684 // http://stackoverflow.com/questions/481446/throws-exception-in-finally-blocks
685 public void closeResource(Closeable resourceHandle) {
686 try {
687 if(resourceHandle != null) {
688 resourceHandle.close();
689 resourceHandle = null;
690 }
691 } catch(Exception e) {
692 System.err.println("Exception closing resource: " + e.getMessage());
693 e.printStackTrace();
694 }
695 }
696 }
697
698}
699
700
Note: See TracBrowser for help on using the repository browser.