source: trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 8669

Last change on this file since 8669 was 8669, checked in by mdewsnip, 19 years ago

Prevented a null pointer exception where extracted metadata contains a newline character. Eventually this should be fixed so it is dealt with correctly.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.4 KB
Line 
1/**
2 *############################################################################
3 * A component of the Greenstone Librarian Interface, part of the Greenstone
4 * digital library suite from the New Zealand Digital Library Project at the
5 * University of Waikato, New Zealand.
6 *
7 * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8 *
9 * Copyright (C) 2004 New Zealand Digital Library Project
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *############################################################################
25 */
26
27package org.greenstone.gatherer.metadata;
28
29
30import java.io.*;
31import java.util.*;
32import org.greenstone.gatherer.DebugStream;
33import org.greenstone.gatherer.util.Utility;
34
35
36/** This class represents one doc.xml file */
37public class DocXMLFile
38 extends File
39{
40 private HashMap source_file_name_to_description_elements_mapping = new HashMap();
41
42
43 public DocXMLFile(String doc_xml_file_path)
44 {
45 super(doc_xml_file_path);
46 }
47
48
49 public ArrayList getMetadataExtractedFromFile(File file)
50 {
51 // Build up a list of metadata extracted from this file
52 ArrayList metadata_values = new ArrayList();
53
54 String file_relative_path = file.getAbsolutePath();
55 int import_index = file_relative_path.indexOf("import");
56 if (import_index != -1) {
57 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
58 }
59
60 // Check whether this doc.xml file contains extracted metadata for the specified file
61 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
62 if (description_elements_list == null) {
63 // ...it doesn't
64 return metadata_values;
65 }
66
67 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
68
69 // Parse the doc.xml file
70 DebugStream.println("Applicable doc.xml file: " + this);
71 try {
72 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
73
74 int description_element_num = 0;
75 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
76 boolean in_relevant_description_element = false;
77
78 String line = null;
79 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
80 // Check if this line contains the start of a relevant Description element
81 if (line_num == next_description_element_start) {
82 in_relevant_description_element = true;
83 continue;
84 }
85
86 // If we're not in a relevant Description element we don't care about anything
87 if (in_relevant_description_element == false) {
88 continue;
89 }
90
91 // Check if this line contains the end of the relevant Description element
92 if (line.indexOf("</Description>") != -1) {
93 description_element_num++;
94 if (description_element_num == description_elements_list.size()) {
95 break;
96 }
97
98 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
99 in_relevant_description_element = false;
100 continue;
101 }
102
103 // If this line doesn't contain a complete Metadata element, we're not interested
104 if (line.indexOf("<Metadata ") == -1 || line.indexOf("</Metadata>") == -1) {
105 continue;
106 }
107
108 // Extract the metadata element name
109 int name_index = line.indexOf(" name=\"") + " name=\"".length();
110 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
111
112 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
113 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
114 if (!metadata_set_namespace.equals("")) {
115 continue;
116 }
117
118 // Extracted metadata!
119 String metadata_element_name = metadata_element_name_full;
120
121 // We completely ignore bibliographic data
122 if (metadata_element_name.equals("SourceSegment")) {
123 return new ArrayList();
124 }
125
126 // Ignore lower-case metadata elements (gsdlsourcefilename, gsdlassocfile etc.)
127 // and those starting with '/' (/srclink)
128 char first_character = metadata_element_name.charAt(0);
129 if (Character.isLowerCase(first_character) || first_character == '/') {
130 continue;
131 }
132
133 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
134
135 // Value trees are not stored for extracted metadata, so create a new value tree node now
136 int value_index = line.indexOf(">", name_index) + ">".length();
137 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
138
139 metadata_element.addMetadataValue(metadata_element_value);
140 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
141
142 // Add the new metadata value to the list
143 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
144 metadata_values.add(metadata_value);
145 }
146 }
147 catch (Exception ex) {
148 System.err.println("Exception: " + ex);
149 ex.printStackTrace();
150 }
151
152 return metadata_values;
153 }
154
155
156 /**
157 * Every doc.xml file must be skimmed when a collection is opened, for two reasons:
158 * - To build a mapping from source file to its corresponding doc.xml file
159 * - To get a complete list of all extracted metadata elements
160 */
161 public void skimFile()
162 {
163 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
164
165 // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
166 DebugStream.println("Skimming doc.xml file " + this + "...");
167 try {
168 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
169 int description_element_start = -1;
170
171 String line = null;
172 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
173 // This line contains the start of a Description element
174 if (line.indexOf("<Description>") != -1) {
175 if (description_element_start != -1) {
176 System.err.println("Parse error: previous Description element unfinished!");
177 }
178 description_element_start = line_num;
179 continue;
180 }
181
182 // This line contains the end of a Description element
183 if (line.indexOf("</Description>") != -1) {
184 if (description_element_start == -1) {
185 System.err.println("Parse error: Description element unstarted!");
186 }
187 description_element_start = -1;
188 continue;
189 }
190
191 // If we're not in a Description element there shouldn't be any Metadata elements
192 if (description_element_start == -1) {
193 continue;
194 }
195
196 // This line doesn't contain a Metadata element, so we're not interested
197 if (line.indexOf("<Metadata ") == -1) {
198 DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
199 continue;
200 }
201
202 // Extract the metadata element name
203 int name_index = line.indexOf(" name=\"") + " name=\"".length();
204 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
205
206 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
207 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
208 if (!metadata_set_namespace.equals("")) {
209 continue;
210 }
211
212 // Extracted metadata!
213 String metadata_element_name = metadata_element_name_full;
214
215 // Note which file this doc.xml is for
216 if (metadata_element_name.equals("gsdlsourcefilename")) {
217 // Extract the gsdlsourcefilename element value
218 int value_index = line.indexOf(">", name_index) + ">".length();
219 String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
220
221 // We're only interested in the path relative to the import folder
222 int import_index = gsdlsourcefilename_value.indexOf("import");
223 if (import_index != -1) {
224 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
225
226 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
227 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
228
229 // Make sure the path matches the OS that is running
230 if (is_unix_path && Utility.isWindows()) {
231 // Convert path from Unix to Windows
232 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
233 }
234 else if (!is_unix_path && !Utility.isWindows()) {
235 // Convert path from Windows to Unix
236 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
237 }
238
239 // Remember this for quick access later
240 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
241 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
242 }
243
244 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
245 }
246 else {
247 // We don't really know what is going on...
248 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
249 }
250 }
251
252 // Ignore lower-case metadata elements (gsdlsourcefilename, gsdlassocfile etc.)
253 // and those starting with '/' (/srclink)
254 char first_character = metadata_element_name.charAt(0);
255 if (Character.isLowerCase(first_character) || first_character == '/') {
256 continue;
257 }
258
259 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
260 if (metadata_element == null) {
261 // This element isn't defined in ex.mds, so create it for this session
262 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
263 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
264 }
265 }
266 }
267 catch (Exception ex) {
268 System.err.println("Exception: " + ex);
269 ex.printStackTrace();
270 }
271 }
272}
Note: See TracBrowser for help on using the repository browser.