source: trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 9701

Last change on this file since 9701 was 9701, checked in by mdewsnip, 19 years ago

Prevented the "Could not understand gsdlsourcefilename" warning for odd gsdlsourcefilename values containing the Greenstone "tmp" directory. This is the case for source files extracted from zip files by ZIPPlug, for example. It is not possible to view the extracted metadata for these files (because they are not visible on the Enrich pane).

  • Property svn:keywords set to Author Date Id Revision
File size: 10.6 KB
Line 
1/**
2 *############################################################################
3 * A component of the Greenstone Librarian Interface, part of the Greenstone
4 * digital library suite from the New Zealand Digital Library Project at the
5 * University of Waikato, New Zealand.
6 *
7 * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8 *
9 * Copyright (C) 2004 New Zealand Digital Library Project
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *############################################################################
25 */
26
27package org.greenstone.gatherer.metadata;
28
29
30import java.io.*;
31import java.util.*;
32import org.greenstone.gatherer.DebugStream;
33import org.greenstone.gatherer.util.Utility;
34
35
36/** This class represents one doc.xml file */
37public class DocXMLFile
38 extends File
39{
40 private HashMap source_file_name_to_description_elements_mapping = new HashMap();
41
42
43 public DocXMLFile(String doc_xml_file_path)
44 {
45 super(doc_xml_file_path);
46 }
47
48
49 public ArrayList getMetadataExtractedFromFile(File file)
50 {
51 // Build up a list of metadata extracted from this file
52 ArrayList metadata_values = new ArrayList();
53
54 String file_relative_path = file.getAbsolutePath();
55 int import_index = file_relative_path.indexOf("import");
56 if (import_index != -1) {
57 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
58 }
59
60 // Check whether this doc.xml file contains extracted metadata for the specified file
61 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
62 if (description_elements_list == null) {
63 // ...it doesn't
64 return metadata_values;
65 }
66
67 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
68
69 // Parse the doc.xml file
70 DebugStream.println("Applicable doc.xml file: " + this);
71 try {
72 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
73
74 int description_element_num = 0;
75 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
76 boolean in_relevant_description_element = false;
77
78 String line = null;
79 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
80 // Check if this line contains the start of a relevant Description element
81 if (line_num == next_description_element_start) {
82 in_relevant_description_element = true;
83 continue;
84 }
85
86 // If we're not in a relevant Description element we don't care about anything
87 if (in_relevant_description_element == false) {
88 continue;
89 }
90
91 // Check if this line contains the end of the relevant Description element
92 if (line.indexOf("</Description>") != -1) {
93 description_element_num++;
94 if (description_element_num == description_elements_list.size()) {
95 break;
96 }
97
98 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
99 in_relevant_description_element = false;
100 continue;
101 }
102
103 // If this line doesn't contain a complete Metadata element, we're not interested
104 if (line.indexOf("<Metadata ") == -1 || line.indexOf("</Metadata>") == -1) {
105 continue;
106 }
107
108 // Extract the metadata element name
109 int name_index = line.indexOf(" name=\"") + " name=\"".length();
110 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
111
112 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
113 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
114 if (!metadata_set_namespace.equals("")) {
115 continue;
116 }
117
118 // Extracted metadata!
119 String metadata_element_name = metadata_element_name_full;
120
121 // We completely ignore bibliographic data
122 if (metadata_element_name.equals("SourceSegment")) {
123 return new ArrayList();
124 }
125
126 // Ignore lower-case metadata elements (gsdlsourcefilename, gsdlassocfile etc.)
127 // and those starting with '/' (/srclink)
128 char first_character = metadata_element_name.charAt(0);
129 if (Character.isLowerCase(first_character) || first_character == '/') {
130 continue;
131 }
132
133 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
134
135 // Value trees are not stored for extracted metadata, so create a new value tree node now
136 int value_index = line.indexOf(">", name_index) + ">".length();
137 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
138
139 metadata_element.addMetadataValue(metadata_element_value);
140 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
141
142 // Add the new metadata value to the list
143 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
144 metadata_values.add(metadata_value);
145 }
146 }
147 catch (Exception ex) {
148 System.err.println("Exception: " + ex);
149 ex.printStackTrace();
150 }
151
152 return metadata_values;
153 }
154
155
156 /**
157 * Every doc.xml file must be skimmed when a collection is opened, for two reasons:
158 * - To build a mapping from source file to its corresponding doc.xml file
159 * - To get a complete list of all extracted metadata elements
160 */
161 public void skimFile()
162 {
163 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
164
165 // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
166 DebugStream.println("Skimming doc.xml file " + this + "...");
167 try {
168 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
169 int description_element_start = -1;
170
171 String line = null;
172 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
173 // This line contains the start of a Description element
174 if (line.indexOf("<Description>") != -1) {
175 if (description_element_start != -1) {
176 System.err.println("Parse error: previous Description element unfinished!");
177 }
178 description_element_start = line_num;
179 continue;
180 }
181
182 // This line contains the end of a Description element
183 if (line.indexOf("</Description>") != -1) {
184 if (description_element_start == -1) {
185 System.err.println("Parse error: Description element unstarted!");
186 }
187 description_element_start = -1;
188 continue;
189 }
190
191 // If we're not in a Description element there shouldn't be any Metadata elements
192 if (description_element_start == -1) {
193 continue;
194 }
195
196 // This line doesn't contain a Metadata element, so we're not interested
197 if (line.indexOf("<Metadata ") == -1) {
198 DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
199 continue;
200 }
201
202 // Extract the metadata element name
203 int name_index = line.indexOf(" name=\"") + " name=\"".length();
204 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
205
206 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
207 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
208 if (!metadata_set_namespace.equals("")) {
209 continue;
210 }
211
212 // Extracted metadata!
213 String metadata_element_name = metadata_element_name_full;
214
215 // Note which file this doc.xml is for
216 if (metadata_element_name.equals("gsdlsourcefilename")) {
217 // Extract the gsdlsourcefilename element value
218 int value_index = line.indexOf(">", name_index) + ">".length();
219 String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
220
221 // We're only interested in the path relative to the import folder
222 int import_index = gsdlsourcefilename_value.indexOf("import");
223 if (import_index != -1) {
224 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
225
226 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
227 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
228
229 // Make sure the path matches the OS that is running
230 if (is_unix_path && Utility.isWindows()) {
231 // Convert path from Unix to Windows
232 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
233 }
234 else if (!is_unix_path && !Utility.isWindows()) {
235 // Convert path from Windows to Unix
236 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
237 }
238
239 // Remember this for quick access later
240 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
241 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
242 }
243
244 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
245 }
246
247 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
248 // This is true when the source files come from a zip file processed by ZIPPlug, for example
249 else if (gsdlsourcefilename_value.indexOf("tmp") == -1) {
250 // We don't really know what is going on...
251 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
252 }
253 }
254
255 // Ignore lower-case metadata elements (gsdlsourcefilename, gsdlassocfile etc.)
256 // and those starting with '/' (/srclink)
257 char first_character = metadata_element_name.charAt(0);
258 if (Character.isLowerCase(first_character) || first_character == '/') {
259 continue;
260 }
261
262 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
263 if (metadata_element == null) {
264 // This element isn't defined in ex.mds, so create it for this session
265 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
266 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
267 }
268 }
269 }
270 catch (Exception ex) {
271 System.err.println("Exception: " + ex);
272 ex.printStackTrace();
273 }
274 }
275}
Note: See TracBrowser for help on using the repository browser.