source: trunk/gsdl3/web/sites/localsite/collect/gberg/java/ImportXML.java@ 5956

Last change on this file since 5956 was 5956, checked in by kjdon, 20 years ago

new files for making a simple lucene collection from xml documents

  • Property svn:keywords set to Author Date Id Revision
File size: 3.2 KB
Line 
1/**
2 *
3 * @author [email protected]
4 * @version
5 */
6
7
8// gsdl3 classes
9import org.greenstone.gsdl3.util.XMLConverter;
10import org.greenstone.gsdl3.util.GSFile;
11// XML classes
12import org.w3c.dom.Document;
13import org.w3c.dom.Element;
14import org.w3c.dom.Node;
15import org.w3c.dom.traversal.NodeIterator;
16import org.w3c.dom.traversal.NodeFilter;
17import org.w3c.dom.traversal.DocumentTraversal;
18import org.apache.xml.serialize.XMLSerializer;
19import org.xml.sax.InputSource;
20import org.xml.sax.EntityResolver;
21// java classes
22import java.io.File;
23import java.io.FileOutputStream;
24
25public class ImportXML
26 implements EntityResolver {
27 File out_dir = null;
28 XMLConverter converter = null;
29
30 String base_path = null;
31 public ImportXML() {
32 converter = new XMLConverter();
33 converter.setEntityResolver(this);
34
35 }
36 public void setOutDir(File out_dir) {
37 this.out_dir = out_dir;
38 }
39 public void init() {
40
41 }
42 public void importFile(File file) throws Exception {
43 importFile(file, "");
44 }
45 protected void importFile(File file, String local_path) throws Exception {
46
47 if (file.isDirectory()) {
48 File files [] = file.listFiles();
49 for (int i=0; i<files.length; i++) {
50 //if (files[i].getName().endsWith(".xml")) {
51 importFile(files[i], local_path+File.separator+files[i].getName());
52 //}
53 }
54 return;
55 }
56
57 base_path = file.getPath();
58 base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
59 System.out.println("base path = "+base_path);
60 // now we have an actual file
61 System.out.println("processing file "+file.getPath());
62 File out_file = new File (out_dir, local_path);
63 String name = file.getName();
64 if (name.endsWith(".dtd")) {
65 if (!GSFile.copyFile(file, out_file)) {
66 System.err.println("couldn't copy dtd file "+file.getPath()+" to "+out_file.getPath()+"- please do the copy yourself");
67 }
68 //copy the file
69 return;
70 }
71 if (!name.endsWith(".xml")) {
72 // now we ignore any that don't end in .xml
73 return;
74 }
75 // now do the importing
76 Document doc = converter.getDOM(file);
77
78 String gs3NS = "http://www.greenstone.org/gs3";
79
80 Element rootNode = doc.getDocumentElement();
81
82 rootNode.setAttribute("xmlns:gs3", gs3NS);
83
84 DocumentTraversal traversal = (DocumentTraversal)doc;
85 NodeIterator i = traversal.createNodeIterator(doc, NodeFilter.SHOW_ELEMENT, null, true);
86
87 Element element = null;
88 Node node = null;
89 int id = 0;
90 while ((node = i.nextNode()) != null) {
91 element = (Element)node;
92 if (XMLTagInfo.isIndexable(element.getNodeName())) {
93 element.setAttribute("gs3:id", Integer.toString(id++));
94 }
95 }
96
97 XMLSerializer gs3Serializer = new XMLSerializer(new FileOutputStream(out_file), null);
98 gs3Serializer.asDOMSerializer().serialize(doc);
99
100 }
101
102 public void finish() {
103 }
104
105 public InputSource resolveEntity (String public_id, String system_id) {
106
107 if (system_id.startsWith("file://")) {
108 return new InputSource(system_id);
109 }
110 if (!system_id.startsWith(File.separator)) {
111 system_id = base_path+File.separatorChar+system_id;
112 }
113 return new InputSource("file://"+system_id);
114 }
115
116}
Note: See TracBrowser for help on using the repository browser.