source: gli/trunk/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 16671

Last change on this file since 16671 was 16671, checked in by ak19, 16 years ago

The gsdl_source_filename value is now URL-decoded instead of Base64-decoded. This matches the doc.pm Perl script, which URL-encodes gsdl_source_filename rather than applying Base64 encoding to it (the value is encoded in order to preserve the original character encoding).

  • Property svn:keywords set to Author Date Id Revision
File size: 10.9 KB
Line 
1/**
2 *############################################################################
3 * A component of the Greenstone Librarian Interface, part of the Greenstone
4 * digital library suite from the New Zealand Digital Library Project at the
5 * University of Waikato, New Zealand.
6 *
7 * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8 *
9 * Copyright (C) 2004 New Zealand Digital Library Project
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *############################################################################
25 */
26
27package org.greenstone.gatherer.metadata;
28
29
30import java.io.*;
31import java.util.*;
32import java.net.URLDecoder;
33import org.greenstone.gatherer.DebugStream;
34import org.greenstone.gatherer.util.Utility;
35
36
37/** This class represents one doc.xml file */
38public class DocXMLFile
39 extends File
40{
41 private HashMap source_file_name_to_description_elements_mapping = new HashMap();
42
43
44 public DocXMLFile(String doc_xml_file_path)
45 {
46 super(doc_xml_file_path);
47 }
48
49
50 public ArrayList getMetadataExtractedFromFile(File file)
51 {
52 // Build up a list of metadata extracted from this file
53 ArrayList metadata_values = new ArrayList();
54
55 String file_relative_path = file.getAbsolutePath();
56 int import_index = file_relative_path.indexOf("import");
57 if (import_index != -1) {
58 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
59 }
60
61 // Check whether this doc.xml file contains extracted metadata for the specified file
62 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
63 if (description_elements_list == null) {
64 // ...it doesn't
65 return metadata_values;
66 }
67
68 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
69
70 // Parse the doc.xml file
71 DebugStream.println("Applicable doc.xml file: " + this);
72 try {
73 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
74
75 int description_element_num = 0;
76 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
77 boolean in_relevant_description_element = false;
78
79 String line = null;
80 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
81 // Check if this line contains the start of a relevant Description element
82 if (line_num == next_description_element_start) {
83 in_relevant_description_element = true;
84 continue;
85 }
86
87 // If we're not in a relevant Description element we don't care about anything
88 if (in_relevant_description_element == false) {
89 continue;
90 }
91
92 // Check if this line contains the end of the relevant Description element
93 if (line.indexOf("</Description>") != -1) {
94 description_element_num++;
95 if (description_element_num == description_elements_list.size()) {
96 break;
97 }
98
99 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
100 in_relevant_description_element = false;
101 continue;
102 }
103
104 // If this line doesn't contain a complete Metadata element, we're not interested
105 if (line.indexOf("<Metadata ") == -1 || line.indexOf("</Metadata>") == -1) {
106 continue;
107 }
108
109 // Extract the metadata element name
110 int name_index = line.indexOf(" name=\"") + " name=\"".length();
111 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
112
113 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
114 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
115 if (!metadata_set_namespace.equals("")) {
116 continue;
117 }
118
119 // Extracted metadata!
120 String metadata_element_name = metadata_element_name_full;
121
122 // We completely ignore bibliographic data
123 if (metadata_element_name.equals("SourceSegment")) {
124 buffered_reader.close();
125 return new ArrayList();
126 }
127
128 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
129 if (metadata_element_name.startsWith("gsdl")) {
130 continue;
131 }
132
133 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
134
135 // Value trees are not stored for extracted metadata, so create a new value tree node now
136 int value_index = line.indexOf(">", name_index) + ">".length();
137 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
138
139 metadata_element.addMetadataValue(metadata_element_value);
140 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
141
142 // Add the new metadata value to the list
143 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
144 metadata_values.add(metadata_value);
145 }
146
147 buffered_reader.close();
148 }
149 catch (FileNotFoundException exception) {
150 DebugStream.printStackTrace(exception);
151 }
152 catch (IOException exception) {
153 DebugStream.printStackTrace(exception);
154 }
155
156 return metadata_values;
157 }
158
159
160 /**
161 * Every doc.xml file must be skimmed when a collection is opened, for two reasons:
162 * - To build a mapping from source file to its corresponding doc.xml file
163 * - To get a complete list of all extracted metadata elements
164 */
165 public void skimFile()
166 {
167 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
168
169 // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
170 DebugStream.println("Skimming doc.xml file " + this + "...");
171 try {
172 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
173 int description_element_start = -1;
174
175 String line = null;
176 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
177 // This line contains the start of a Description element
178 if (line.indexOf("<Description>") != -1) {
179 if (description_element_start != -1) {
180 System.err.println("Parse error: previous Description element unfinished!");
181 }
182 description_element_start = line_num;
183 continue;
184 }
185
186 // This line contains the end of a Description element
187 if (line.indexOf("</Description>") != -1) {
188 if (description_element_start == -1) {
189 System.err.println("Parse error: Description element unstarted!");
190 }
191 description_element_start = -1;
192 continue;
193 }
194
195 // If we're not in a Description element there shouldn't be any Metadata elements
196 if (description_element_start == -1) {
197 continue;
198 }
199
200 // This line doesn't contain a Metadata element, so we're not interested
201 if (line.indexOf("<Metadata ") == -1) {
202 DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
203 continue;
204 }
205
206 // Extract the metadata element name
207 int name_index = line.indexOf(" name=\"") + " name=\"".length();
208 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
209
210 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
211 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
212 if (!metadata_set_namespace.equals("")) {
213 continue;
214 }
215
216 // Extracted metadata!
217 String metadata_element_name = metadata_element_name_full;
218
219 // Note which file this doc.xml is for
220 if (metadata_element_name.equals("gsdlsourcefilename")) {
221 // Extract the gsdlsourcefilename element value
222 int value_index = line.indexOf(">", name_index) + ">".length();
223 String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
224
225 // We're only interested in the path relative to the import folder
226 int import_index = gsdlsourcefilename_value.indexOf("import");
227 if (import_index != -1) {
228 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
229
230 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
231 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
232
233 // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
234 // This is stored in the System's file.encoding property.
235 gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
236
237 // Make sure the path matches the OS that is running
238 if (is_unix_path && Utility.isWindows()) {
239 // Convert path from Unix to Windows
240 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
241 }
242 else if (!is_unix_path && !Utility.isWindows()) {
243 // Convert path from Windows to Unix
244 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
245 }
246
247 // Remember this for quick access later
248 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
249 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
250 }
251
252 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
253 }
254
255 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
256 // This is true when the source files come from a zip file processed by ZIPPlug, for example
257 else if (gsdlsourcefilename_value.indexOf("tmp") == -1) {
258 // We don't really know what is going on...
259 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
260 }
261 }
262
263 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
264 if (metadata_element_name.startsWith("gsdl")) {
265 continue;
266 }
267
268 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
269 if (metadata_element == null) {
270 // This element isn't defined in ex.mds, so create it for this session
271 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
272 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
273 }
274 }
275
276 buffered_reader.close();
277 }
278 catch (FileNotFoundException exception) {
279 DebugStream.printStackTrace(exception);
280 }
281 catch (IOException exception) {
282 DebugStream.printStackTrace(exception);
283 }
284 }
285}
Note: See TracBrowser for help on using the repository browser.