source: gli/trunk/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 16579

Last change on this file since 16579 was 16579, checked in by ak19, 16 years ago

Modified to work with changes in doc.pm: doc.pm Base64 encodes gsdlsourcefilename now, so this class must decode it.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.1 KB
Line 
/**
 *############################################################################
 * A component of the Greenstone Librarian Interface, part of the Greenstone
 * digital library suite from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
 *
 * Copyright (C) 2004 New Zealand Digital Library Project
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *############################################################################
 */
26
27package org.greenstone.gatherer.metadata;
28
29
30import java.io.*;
31import java.util.*;
32import org.greenstone.gatherer.DebugStream;
33import org.greenstone.gatherer.util.Utility;
34import org.greenstone.gatherer.feedback.Base64;
35
36/** This class represents one doc.xml file */
37public class DocXMLFile
38 extends File
39{
40 private HashMap source_file_name_to_description_elements_mapping = new HashMap();
41
42
43 public DocXMLFile(String doc_xml_file_path)
44 {
45 super(doc_xml_file_path);
46 }
47
48
49 public ArrayList getMetadataExtractedFromFile(File file)
50 {
51 // Build up a list of metadata extracted from this file
52 ArrayList metadata_values = new ArrayList();
53
54 String file_relative_path = file.getAbsolutePath();
55 int import_index = file_relative_path.indexOf("import");
56 if (import_index != -1) {
57 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
58 }
59
60 // Check whether this doc.xml file contains extracted metadata for the specified file
61 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
62 if (description_elements_list == null) {
63 // ...it doesn't
64 return metadata_values;
65 }
66
67 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
68
69 // Parse the doc.xml file
70 DebugStream.println("Applicable doc.xml file: " + this);
71 try {
72 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
73
74 int description_element_num = 0;
75 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
76 boolean in_relevant_description_element = false;
77
78 String line = null;
79 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
80 // Check if this line contains the start of a relevant Description element
81 if (line_num == next_description_element_start) {
82 in_relevant_description_element = true;
83 continue;
84 }
85
86 // If we're not in a relevant Description element we don't care about anything
87 if (in_relevant_description_element == false) {
88 continue;
89 }
90
91 // Check if this line contains the end of the relevant Description element
92 if (line.indexOf("</Description>") != -1) {
93 description_element_num++;
94 if (description_element_num == description_elements_list.size()) {
95 break;
96 }
97
98 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
99 in_relevant_description_element = false;
100 continue;
101 }
102
103 // If this line doesn't contain a complete Metadata element, we're not interested
104 if (line.indexOf("<Metadata ") == -1 || line.indexOf("</Metadata>") == -1) {
105 continue;
106 }
107
108 // Extract the metadata element name
109 int name_index = line.indexOf(" name=\"") + " name=\"".length();
110 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
111
112 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
113 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
114 if (!metadata_set_namespace.equals("")) {
115 continue;
116 }
117
118 // Extracted metadata!
119 String metadata_element_name = metadata_element_name_full;
120
121 // We completely ignore bibliographic data
122 if (metadata_element_name.equals("SourceSegment")) {
123 buffered_reader.close();
124 return new ArrayList();
125 }
126
127 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
128 if (metadata_element_name.startsWith("gsdl")) {
129 continue;
130 }
131
132 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
133
134 // Value trees are not stored for extracted metadata, so create a new value tree node now
135 int value_index = line.indexOf(">", name_index) + ">".length();
136 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
137
138 metadata_element.addMetadataValue(metadata_element_value);
139 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
140
141 // Add the new metadata value to the list
142 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
143 metadata_values.add(metadata_value);
144 }
145
146 buffered_reader.close();
147 }
148 catch (FileNotFoundException exception) {
149 DebugStream.printStackTrace(exception);
150 }
151 catch (IOException exception) {
152 DebugStream.printStackTrace(exception);
153 }
154
155 return metadata_values;
156 }
157
158
159 /**
160 * Every doc.xml file must be skimmed when a collection is opened, for two reasons:
161 * - To build a mapping from source file to its corresponding doc.xml file
162 * - To get a complete list of all extracted metadata elements
163 */
164 public void skimFile()
165 {
166 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
167
168 // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
169 DebugStream.println("Skimming doc.xml file " + this + "...");
170 try {
171 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
172 int description_element_start = -1;
173
174 String line = null;
175 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
176 // This line contains the start of a Description element
177 if (line.indexOf("<Description>") != -1) {
178 if (description_element_start != -1) {
179 System.err.println("Parse error: previous Description element unfinished!");
180 }
181 description_element_start = line_num;
182 continue;
183 }
184
185 // This line contains the end of a Description element
186 if (line.indexOf("</Description>") != -1) {
187 if (description_element_start == -1) {
188 System.err.println("Parse error: Description element unstarted!");
189 }
190 description_element_start = -1;
191 continue;
192 }
193
194 // If we're not in a Description element there shouldn't be any Metadata elements
195 if (description_element_start == -1) {
196 continue;
197 }
198
199 // This line doesn't contain a Metadata element, so we're not interested
200 if (line.indexOf("<Metadata ") == -1) {
201 DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
202 continue;
203 }
204
205 // Extract the metadata element name
206 int name_index = line.indexOf(" name=\"") + " name=\"".length();
207 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
208
209 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
210 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
211 if (!metadata_set_namespace.equals("")) {
212 continue;
213 }
214
215 // Extracted metadata!
216 String metadata_element_name = metadata_element_name_full;
217
218 // Note which file this doc.xml is for
219 if (metadata_element_name.equals("gsdlsourcefilename")) {
220 // Extract the gsdlsourcefilename element value
221 int value_index = line.indexOf(">", name_index) + ">".length();
222 String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
223
224 // We're only interested in the path relative to the import folder
225 int import_index = gsdlsourcefilename_value.indexOf("import");
226 if (import_index != -1) {
227 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
228
229 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
230 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
231
232 // Apply Base64 decoding - note that encoding into base64 may have
233 // encoded some characters as slashes. Since slash characters are
234 // removed below the decoding must be done before that to ensure
235 // only genuine slashes remain for processing:
236 gsdlsourcefilename_value = new String(Base64.decode(gsdlsourcefilename_value));
237 //System.err.println("****decodedSrcFilename is: " + gsdlsourcefilename_value);
238
239 // Make sure the path matches the OS that is running
240 if (is_unix_path && Utility.isWindows()) {
241 // Convert path from Unix to Windows
242 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
243 }
244 else if (!is_unix_path && !Utility.isWindows()) {
245 // Convert path from Windows to Unix
246 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
247 }
248
249 // Remember this for quick access later
250 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
251 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
252 }
253
254 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
255 }
256
257 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
258 // This is true when the source files come from a zip file processed by ZIPPlug, for example
259 else if (gsdlsourcefilename_value.indexOf("tmp") == -1) {
260 // We don't really know what is going on...
261 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
262 }
263 }
264
265 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
266 if (metadata_element_name.startsWith("gsdl")) {
267 continue;
268 }
269
270 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
271 if (metadata_element == null) {
272 // This element isn't defined in ex.mds, so create it for this session
273 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
274 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
275 }
276 }
277
278 buffered_reader.close();
279 }
280 catch (FileNotFoundException exception) {
281 DebugStream.printStackTrace(exception);
282 }
283 catch (IOException exception) {
284 DebugStream.printStackTrace(exception);
285 }
286 }
287}
Note: See TracBrowser for help on using the repository browser.