source: trunk/gli/src/org/greenstone/gatherer/msm/GDMParser.java@ 5288

Last change on this file since 5288 was 4674, checked in by jmt12, 21 years ago

* empty log message *

  • Property svn:keywords set to Author Date Id Revision
File size: 7.2 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: John Thompson, Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 1999 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37
38
39
40
41
42
43/* GPL_HEADER */
44package org.greenstone.gatherer.msm;
45/**************************************************************************************
46 * Title: Gatherer
47 * Description: The Gatherer: a tool for gathering and enriching a digital collection.
48 * Company: The University of Waikato
49 * Written: / /02
50 * Revised: 20/08/02 Commented and Optimized.
51 * @author John Thompson, 9826509
52 * @version 2.3
53 **************************************************************************************/
54import java.io.BufferedReader;
55import java.io.File;
56import java.io.FileInputStream;
57import java.io.InputStream;
58import java.io.InputStreamReader;
59import java.io.Reader;
60import java.lang.IllegalArgumentException;
61import java.lang.ref.SoftReference;
62import java.util.ArrayList;
63import java.util.LinkedHashMap;
64import java.util.Map;
65import org.apache.xerces.parsers.DOMParser;
66import org.apache.xml.serialize.XMLSerializer;
67import org.apache.xml.serialize.OutputFormat;
68import org.w3c.dom.Document;
69import org.xml.sax.InputSource;
70/** Parses metadata.xml documents of the GreenstoneDirectoryMetadata variety, caching where possible. */
71// ####################################################################################
72// Optimization Saving
73// ####################################################################################
74// Vector -> ArrayList + Memory, + Processor (pos. - Processor)
75// Hashtable -> HashMap + Memory, + Processor
76// Removed extra global references + Memory (16k)
77// ####################################################################################
78public class GDMParser
79 extends LinkedHashMap {
80 /** A list of file names that we know do not actually belong to valid GDM xml files, so there not much point in ever trying to read them again. */
81 private ArrayList ignore = null;
82 /** The actual xerces parser used to read in xml documents. */
83 private DOMParser parser = null;
84 /** The default maximum cache size if max size not explicitly set. */
85 private int max_size = 25;
86 /** Default constructor. */
87 public GDMParser() {
88 super();
89 this.ignore = new ArrayList();
90 try {
91 parser = new DOMParser();
92 // Don't let it import external dtds. If it does it'll probably spit the dummy. If people try to use a poorly formated xml file more fool them.
93 parser.setFeature("http://xml.org/sax/features/validation", false);
94 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
95 // May or may not be ignored, the documentation for Xerces is contradictory. If it works then parsing -should- be faster.
96 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", true);
97 }
98 catch(Exception error) {
99 ///ystem.err.println("Fatal Error in GDMParser.init(): " + error);
100 error.printStackTrace();
101 System.exit(1);
102 }
103 }
104 /** Constructor with maximum size set.
105 * @param max_size The maximum size of the cache, as an <i>int</i>.
106 */
107 public GDMParser(int max_size) {
108 this();
109 this.max_size = max_size;
110 }
111 /** Destructor, clears cache and remove persistant global references. */
112 public void destroy() {
113 ignore.clear();
114 ignore = null;
115 parser = null;
116 clear();
117 }
118 /** Fetches the document for the given xml file. This may mean (re)parsing it or simply fetching it from cache.
119 * @param file The metadata.xml <strong>File</strong> you wish to get the document for.
120 * @return A <strong>Document</strong> which is sourced from file.
121 */
122 public Document parse(File file) {
123 ///ystem.err.println("Parse: " + file.getAbsolutePath());
124 Document result = null;
125 if(file.exists()) {
126 // Check if we've already parsed this file in an earlier attempt.
127 if(containsKey(file)) {
128 ///ystem.err.println("Already cached previously.");
129 //result = (Document) get(file);
130 SoftReference reference = (SoftReference) get(file);
131 if(reference != null) {
132 result = (Document) reference.get();
133 }
134 else {
135 ///ystem.err.println("Reference expired.");
136 }
137 }
138 // Check the ignore list and see if we've already detected this isn't a greenstone metadata file.
139 if(result == null && !ignore.contains(file)) {
140 ///ystem.err.println("Reparse file.");
141 // Of course we may not have, or it may have expired so...
142 try {
143 // Display progress dialog.
144 InputStream is = new FileInputStream(file);
145 InputStreamReader isr = new InputStreamReader(is);
146 Reader r = new BufferedReader(isr);
147 InputSource isc = new InputSource(r);
148 parser.parse(isc); // Slow.
149 Document document = parser.getDocument();
150 // First test. Check we have a GreenstoneDirectoryMetadata file, or for the older version DirectoryMetadata.
151 if(!document.getDoctype().getName().equals("GreenstoneDirectoryMetadata") && !document.getDoctype().getName().equals("DirectoryMetadata")) {
152 ///ystem.err.println("Adding to ignore list.");
153 // Add to ignore list. Not a gdm file.
154 ignore.add(file);
155 }
156 // Cache document.
157 else {
158 ///ystem.err.println("Adding to cache.");
159 put(file, new SoftReference(document));
160 result = document;
161 }
162 }
163 catch (Exception error) {
164 ///ystem.err.println("Error! " + error);
165 error.printStackTrace();
166 }
167 }
168 else {
169 ///ystem.err.println("File on ignore list.");
170 }
171 }
172 else {
173 ///ystem.err.println("File does not exist!");
174 }
175 return result;
176 }
177 /** Automatically called by the LinkedHashMap object whenever an object is added, to determine whether it should remove the oldest entry.
178 * @param eldest The eldest <strong>Map.Entry</strong> which may mean in terms of age, or in terms of usage.
179 * @return <i>true</i> if the given entry should be removed, <i>false</i> otherwise.
180 */
181 protected boolean removeEldestEntry(Map.Entry eldest) {
182 return size() > max_size;
183 }
184}
Note: See TracBrowser for help on using the repository browser.