source: main/trunk/greenstone2/build-src/src/java/org/nzdl/gsdl/ApplyXSLT.java@ 25285

Last change on this file since 25285 was 25285, checked in by ak19, 12 years ago

Updated to deal with paperspast.dm: added a new module to gti.pl for this. Because getting chunks from the paperspast.dm file results in attribute-like values in the source and target strings, there are now functions in ApplyXSLT.java to remove and retrieve these. Finally, the function that removes these attribute-like values from the source and target strings is called in the gti-generate-tmx-xml.xslt file.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.6 KB
RevLine 
[22736]1/**********************************************************************
2 *
3 * ApplyXSLT.java
4 *
5 * Copyright 2006-2010 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26
27
[12395]28package org.nzdl.gsdl;
29
30import java.io.*;
31
32import javax.xml.transform.Transformer;
33import javax.xml.transform.TransformerConfigurationException;
34import javax.xml.transform.TransformerException;
35import javax.xml.transform.TransformerFactory;
36import javax.xml.transform.stream.StreamResult;
37import javax.xml.transform.stream.StreamSource;
38
[12594]39import javax.xml.parsers.*;
40import javax.xml.transform.dom.*;
41import org.w3c.dom.*;
42
43
44
[12395]45/**
46 * Use the TraX interface to perform a transformation in the simplest manner possible
47 * (3 statements).
48 */
49public class ApplyXSLT
50{
51
[20875]52 public static final String DOC_START = new String ("<?DocStart?>");
53 public static final String DOC_END = new String ("<?DocEnd?>");
54 public static final String INPUT_END = new String ("<?Done?>");
[12395]55
[20875]56 private static final String RECORD_ELEMENT = "record";
57 private static final String CONTROLFIELD_ELEMENT = "controlfield";
58 private static final String SUBFIELD_ELEMENT = "subfield";
59 private static final String LEADER_ELEMENT = "leader";
[12594]60
[20875]61 private final int BEFORE_READING = 0;
62 private final int IS_READING = 1;
63 private String xsl_file;
64 private String mapping_file;
[12395]65
[25241]66 private String sourcelang;
67 private String targetlang;
68
/**
 * Creates an instance that has no stylesheet or mapping file of its own;
 * main() uses this form before calling translate(), which is given its files directly.
 *
 * @param sourcelang source language code ("" when not supplied)
 * @param targetlang target language code ("" when not supplied)
 */
public ApplyXSLT(String sourcelang, String targetlang)
{
    // Delegate to the stylesheet-taking constructor, leaving xsl_file null.
    this(null, sourcelang, targetlang);
}
[13470]72
/**
 * Creates an instance that transforms with the given stylesheet and no mapping file.
 *
 * @param xsl_file   stylesheet to apply
 * @param sourcelang source language code ("" when not supplied)
 * @param targetlang target language code ("" when not supplied)
 */
public ApplyXSLT(String xsl_file, String sourcelang, String targetlang)
{
    // Delegate to the full constructor, leaving mapping_file null.
    this(xsl_file, sourcelang, targetlang, null);
}
[12395]78
/**
 * Creates an instance that transforms with the given stylesheet and mapping file.
 *
 * @param xsl_file     stylesheet to apply
 * @param sourcelang   source language code ("" when not supplied)
 * @param targetlang   target language code ("" when not supplied)
 * @param mapping_file mapping document made available to the stylesheet
 */
public ApplyXSLT(String xsl_file, String sourcelang, String targetlang, String mapping_file)
{
    initLanguages(sourcelang, targetlang);
    this.mapping_file = mapping_file;
    this.xsl_file = xsl_file;
}
[12594]84
[25241]85 private void initLanguages(String sourcelang, String targetlang)
86 {
87 this.sourcelang = sourcelang;
88 this.targetlang = targetlang;
89 // if only target language is provided, assume source language is English
90 if(sourcelang.equals("") && !targetlang.equals("")) {
91 this.sourcelang = "en";
92 }
93 }
94
[20875]95 private boolean process()
96 {
97 try{
[13223]98
[20875]99 // Use System InputStream to receive piped data from the perl program
100 InputStreamReader ir = new InputStreamReader(System.in, "UTF8");
101 BufferedReader br = new BufferedReader(ir);
[12395]102
[20875]103 int system_status = BEFORE_READING;
[20944]104 StringBuffer a_doc = new StringBuffer();
[20875]105 String output_file = new String();
[12395]106
[13223]107
[20875]108 while (br.ready()) {
[12594]109
[20875]110 String this_line = br.readLine();
111 if(system_status == BEFORE_READING){
112 if(this_line.compareTo(DOC_START) == 0){
113 output_file = br.readLine(); // read the next line as the output file name
114 system_status = IS_READING;
[20944]115 a_doc = new StringBuffer();
[20875]116 }
117 else if(this_line.compareTo(INPUT_END) == 0){
118 return true;
119 }
120 else{
121 System.err.println("Undefined process status:" + this_line);
122 system_status = BEFORE_READING;
123 }
[12594]124
[20875]125 }
126 else if(system_status == IS_READING){
127 if(this_line.compareTo(DOC_END) == 0){
128 boolean result = false;
129 if (mapping_file !=null && !mapping_file.equals("")){
[20944]130 result = translateXMLWithMapping(a_doc.toString(), output_file);
[20875]131 }
132 else{
[20944]133 result = translateXML(a_doc.toString(), output_file);
[20875]134 }
[12594]135
[20875]136 if (!result){
137 System.err.println("Translation Failed!!");
138 return false;
139 }
[12594]140
[20875]141 system_status = BEFORE_READING;
[12594]142
[20875]143 }
144 else{
[20944]145 a_doc.append(this_line + "\n");
[20875]146 }
147 }
148 else{
149 System.err.println ("Undefined system status in ApplyXSLT.java main().");
150 System.exit(-1);
151 }
[12594]152
[20875]153 }
154 }catch (Exception e)
155 {
156 System.err.println("Receiving piped data error!" + e.toString());
157 }
[12395]158
[20875]159 return false;
160 }
[12594]161
162
[20875]163 private boolean translateXML(String full_doc, String output_file)
164 throws IOException,TransformerException, TransformerConfigurationException, FileNotFoundException
165 {
[12594]166
[20875]167 StringReader str = new StringReader(full_doc) ;
[12395]168
[20875]169 TransformerFactory tFactory = TransformerFactory.newInstance();
170 Transformer transformer = tFactory.newTransformer(new StreamSource(xsl_file));
[25241]171
172 setTransformerLanguageParams(transformer); // sourcelang and targetlang
173
[20875]174 transformer.transform(new StreamSource(str), new StreamResult(new FileOutputStream(output_file)));
175 return true;
176 }
[12395]177
[20875]178 private boolean translateXMLWithMapping(String full_doc, String output_file)
179 throws IOException,TransformerException, TransformerConfigurationException, FileNotFoundException
180 {
181 StringReader str = new StringReader(full_doc) ;
[12594]182
[20875]183 try{
184 TransformerFactory tFactory = TransformerFactory.newInstance();
185 Transformer transformer = tFactory.newTransformer(new StreamSource(xsl_file));
[12594]186
[20875]187 Document mapping_doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(mapping_file);
188 Element mapping =mapping_doc.getDocumentElement();
[12594]189
[20875]190 transformer.setParameter("mapping",mapping);
[25241]191 setTransformerLanguageParams(transformer); // sourcelang and targetlang
[12594]192
[20875]193 Document output_doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
[12594]194
[20875]195 transformer.transform(new StreamSource(str), new DOMResult(output_doc));
[12594]196
[20875]197 calculateRecordsLength(output_doc);
[12594]198
[20875]199 transformer = tFactory.newTransformer();
[12594]200
[20875]201 transformer.transform(new DOMSource(output_doc), new StreamResult(new FileOutputStream(output_file)));
[12594]202
203 }
[20875]204 catch(Exception e){
205 e.printStackTrace();
206 return false;
207 }
[12594]208
[20875]209 return true;
210 }
[12594]211
[20875]212 private void calculateRecordsLength(Document output_doc){
213 NodeList records = output_doc.getDocumentElement().getElementsByTagName(RECORD_ELEMENT);
214
215 for(int i=0;i<records.getLength();i++){
216 Element record = (Element)records.item(i);
217 calculateRecordLength(record);
[12594]218 }
[20875]219 }
[12594]220
[20875]221 private void calculateRecordLength(Element record){
222 int total_length =0;
223 NodeList controlfileds = record.getElementsByTagName(CONTROLFIELD_ELEMENT);
224 for(int i=0;i<controlfileds.getLength();i++){
225 Element controlfiled = (Element)controlfileds.item(i);
226 total_length +=getElementTextValue(controlfiled).length();
227 }
[12594]228
[20875]229 NodeList subfileds = record.getElementsByTagName(SUBFIELD_ELEMENT);
230 for(int i=0;i<subfileds.getLength();i++){
231 Element subfiled = (Element)subfileds.item(i);
232 total_length +=getElementTextValue(subfiled).length();
233 }
[12594]234
[20875]235 String record_length = total_length+"";
236 //fill in a extra digit as record length needs to be five characters long
237 if (total_length < 10000){
238 record_length = "0"+record_length;
239 if (total_length < 1000){
240 record_length = "0"+record_length;
241 }
242 if (total_length < 100){
243 record_length = "0"+record_length;
244 }
245 if (total_length < 10){
246 record_length = "0"+record_length;
247 }
[12594]248
[20875]249 }
[12594]250
[20875]251 NodeList leaders = record.getElementsByTagName(LEADER_ELEMENT);
[12594]252
[20875]253 //only one leader element
254 if (leaders.getLength() >0){
255 Element leader_element = (Element)leaders.item(0);
256 removeFirstTextNode(leader_element);
257 leader_element.insertBefore(leader_element.getOwnerDocument().createTextNode(record_length),leader_element.getFirstChild());
[12594]258 }
259
[20875]260 }
[12594]261
[20875]262 private void removeFirstTextNode(Element element){
263 //remove the first text node
264 NodeList children_nodelist = element.getChildNodes();
265 for (int i = 0; i < children_nodelist.getLength(); i++) {
266 Node child_node = children_nodelist.item(i);
267 if (child_node.getNodeType() == Node.TEXT_NODE) {
268 element.removeChild(child_node);
269 return;
270 }
[12594]271 }
272
[20875]273 }
[12594]274
[20875]275 private String getElementTextValue(Element element)
276 {
277 String text ="";
[12594]278
[20875]279 // Find the node child
280 NodeList children_nodelist = element.getChildNodes();
281 for (int i = 0; i < children_nodelist.getLength(); i++) {
282 Node child_node = children_nodelist.item(i);
283 if (child_node.getNodeType() == Node.TEXT_NODE) {
284 text +=child_node.getNodeValue();
285 }
[12594]286 }
[20875]287
288 return text;
289 }
[12594]290
291
[20875]292 private void setMappingVariable(Document style_doc){
293 Node child = style_doc.getDocumentElement().getFirstChild();
294 while(child != null) {
295 String name = child.getNodeName();
296 if (name.equals("xsl:variable")) {
297 Element variable_element = (Element)child;
298 if ( variable_element.getAttribute("name").trim().equals("mapping")){
299 variable_element.setAttribute("select","document('"+mapping_file+"')/Mapping");
[12594]300 }
[20875]301 }
302 child = child.getNextSibling();
303 }
[12594]304
[20875]305 }
[12594]306
[25241]307 private void setTransformerLanguageParams(Transformer transformer)
308 {
309 if(targetlang != "") {
310 transformer.setParameter("sourcelang",sourcelang);
311 transformer.setParameter("targetlang",targetlang);
312 }
313 }
[12594]314
[20875]315 private void translate(String xml_file, String xsl_file, String output_file)throws IOException,TransformerException, TransformerConfigurationException, FileNotFoundException, IOException{
[13473]316
[20875]317 TransformerFactory tFactory = TransformerFactory.newInstance();
318 Transformer transformer = tFactory.newTransformer(new StreamSource(xsl_file));
[13473]319
[20875]320 OutputStreamWriter output = null;
321 if (output_file.equals("")) {
322 output = new OutputStreamWriter(System.out, "UTF-8");
323 }
324 else{
325 output = new OutputStreamWriter(new FileOutputStream(output_file), "UTF-8");
326 }
[25241]327
328 setTransformerLanguageParams(transformer); // sourcelang and targetlang
[20875]329 transformer.transform(new StreamSource(new File(xml_file)),new StreamResult(output));
[13473]330
[20875]331 }
[13470]332
[20875]333 static public String replaceAll(String source_string, String match_regexp, String replace_string)
334 {
335 return source_string.replaceAll(match_regexp, replace_string);
336 }
[13473]337
[25285]338 // Necessary for paperspast.dm, but can be used generally.
339 // The get-chunks cmd of gti.pl perl script when run over paperspast.dm returns XML with source and target lines
340 // like: [c=paperspast] {All newspapers} for source and [c=paperspast,l=mi] {Niupepa katoa} for target
341 // This function returns just the 'string' portion of the chunk of data: e.g 'All newspapers' and 'Niupepa katoa'
342 static public String getChunkString(String target_file_text)
343 {
344 int startindex = target_file_text.indexOf("[");
345 if(startindex != 0) {
346 return target_file_text;
347 } // to test that the input requires processing
[13473]348
[25285]349 // else
350 startindex = target_file_text.indexOf("{");
351 int endindex = target_file_text.lastIndexOf("}");
352 if(startindex != -1 && endindex != -1) {
353 return target_file_text.substring(startindex+1, endindex); // skips { and }
354 } else {
355 return target_file_text;
356 }
357
358 }
359
360 // Necessary for paperspast.dm, but can be used generally.
361 // The get-chunks cmd of gti.pl perl script when run over paperspast.dm returns XML with source and target lines
362 // like: [c=paperspast] {All newspapers} for source and [c=paperspast,l=mi] {Niupepa katoa} for target
363 // This function returns just the 'attribute' portion of the chunk of data: e.g 'c=paperspast' and 'c=paperspast,l=mi'
364 static public String getChunkAttr(String target_file_text)
365 {
366 int startindex = target_file_text.indexOf("[");
367 if(startindex != 0) {
368 return target_file_text;
369 } // to test that the input requires processing
370
371 // else
372 startindex = target_file_text.indexOf("{");
373 int endindex = target_file_text.lastIndexOf("}");
374 if(startindex != -1 && endindex != -1) {
375 endindex = target_file_text.lastIndexOf("]", startindex); // look for ] preceding the {
376 if(endindex > 1) { //if(endindex != -1) {
377 // so there's something to substring between [ and ]
378 return target_file_text.substring(1, endindex).trim(); // skips [ and ]
379 }
380 }
381 return target_file_text;
382 }
383
[20875]384 public static void main(String[] args)
385 {
[13470]386
[20875]387 String xml_file="";
388 String xsl_file="";
389 String mapping_file="";
390 String output_file="";
[25241]391
392 String sourcelang="";
393 String targetlang="";
394
[20875]395 // Checking Arguments
396 if(args.length < 1)
397 {
398 printUsage();
399 }
[13470]400
[20875]401 for (int i=0;i<args.length;i++){
402 if (args[i].equals("-m") && i+1 < args.length && !args[i+1].startsWith("-")){
403 mapping_file = args[++i];
404 checkFile(mapping_file.replaceAll("file:///",""));
405 }
406 else if (args[i].equals("-x") && i+1 < args.length && !args[i+1].startsWith("-")){
407 xml_file = args[++i];
408 checkFile(xml_file.replaceAll("file:///",""));
409 }
410 else if(args[i].equals("-t") && i+1 < args.length && !args[i+1].startsWith("-")){
411 xsl_file = args[++i];
412 checkFile( xsl_file.replaceAll("file:///",""));
413 }
414 else if(args[i].equals("-o") && i+1 < args.length && !args[i+1].startsWith("-")){
415 output_file = args[++i];
[13470]416
[20875]417 }
[25241]418 // The two language parameters (-s and -l) are for the gti-generate-tmx-xml file
419 // which requires the target lang (code), and will accept the optional source lang (code)
420 else if(args[i].equals("-s") && i+1 < args.length && !args[i+1].startsWith("-")){
421 sourcelang = args[++i];
422 }
423 else if(args[i].equals("-l") && i+1 < args.length && !args[i+1].startsWith("-")){
424 targetlang = args[++i];
425 }
[20875]426 else if(args[i].equals("-h")){
427 printUsage();
428 }
429 else{
430 printUsage();
431 }
[13470]432
[20875]433 }
[12594]434
[13470]435
[20875]436 ApplyXSLT core = null;
[13470]437
[20875]438 if (xml_file.equals("") && !xsl_file.equals("")){//read from pipe line
439 if (mapping_file.equals("")){
[25241]440 core = new ApplyXSLT(xsl_file, sourcelang, targetlang);
[20875]441 }
442 else{
[25241]443 core = new ApplyXSLT(xsl_file,mapping_file, sourcelang, targetlang);
[20875]444 }
[13470]445
[20875]446 if (core != null){
447 core.process();
448 }
449 else{
450 printUsage();
451 }
452 }
453 else if(!xml_file.equals("") && !xsl_file.equals("")){
[25241]454 core = new ApplyXSLT(sourcelang, targetlang);
[20875]455 try {
456 core.translate(xml_file,xsl_file,output_file);
457 }
458 catch(Exception e){e.printStackTrace();}
459 }
460 else{
461 printUsage();
462 }
[13470]463
[20875]464 }
[13470]465
[20875]466 private static void checkFile(String filename){
467 File file = new File(filename);
468 if (!file.exists()){
469 System.out.println("Error: "+filename+" doesn't exist!");
470 System.exit(-1);
[13470]471 }
[20875]472 }
[13470]473
[20875]474 private static void printUsage(){
[25241]475 System.out.println("Usage: ApplyXSLT -x File -t File [-m File] [-o File] [-s sourcelang] [-l targetlang]");
[20882]476 System.out.println("\t-x specifies the xml file (Note: optional for piped xml data)");
477 System.out.println("\t-t specifies the xsl file");
[20875]478 System.out.println("\t-m specifies the mapping file (for MARCXMLPlugout.pm only)");
479 System.out.println("\t-o specifies the output file name (output to screen if this option is absent)");
[25241]480 System.out.println("\t-s specifies the input language code for generating TMX file. Defaults to 'en' if none is provided");
481 System.out.println("\t-l specifies the output language code. Required if generating a TMX file.");
[20875]482 System.exit(-1);
483 }
[12395]484}
485
486
Note: See TracBrowser for help on using the repository browser.