source: gs3-extensions/mat/trunk/src/org/greenstone/mat/MetadataStats.java@ 22171

Last change on this file since 22171 was 22171, checked in by sjm84, 14 years ago

Updated the MAT code so that it now uses the new util classes

File size: 18.3 KB
Line 
1package org.greenstone.mat;
2
3import org.w3c.dom.*;
4import javax.xml.parsers.*;
5import javax.xml.transform.*;
6import javax.xml.transform.dom.*;
7import javax.xml.transform.stream.*;
8
9import java.io.BufferedWriter;
10import java.io.File;
11import java.io.FileOutputStream;
12import java.io.OutputStreamWriter;
13import java.io.Writer;
14
15import java.util.ArrayList;
16import java.util.Collection;
17import java.util.HashMap;
18import java.util.Iterator;
19import java.util.Set;
20import java.util.StringTokenizer;
21
22import org.greenstone.gsdl3.core.MessageRouter;
23import org.greenstone.gsdl3.util.GDBMWrapper;
24import org.greenstone.gsdl3.util.GSFile;
25import org.greenstone.gsdl3.util.GSXML;
26import org.greenstone.gsdl3.util.XMLConverter;
27import org.greenstone.util.GlobalProperties;
28import org.greenstone.gsdl3.util.GSPath;
29
30import org.w3c.dom.Document;
31import org.w3c.dom.Element;
32import org.w3c.dom.NamedNodeMap;
33import org.w3c.dom.Node;
34import org.w3c.dom.NodeList;
35
36public class MetadataStats {
37
38 private Document doc=null;
39 private MessageRouter mr = null;
40 private XMLConverter converter=null;
41 private GDBMWrapper gdbm_src = null;
42 private GSPath gspath = null;
43 private GlobalProperties globalProperty = null;
44 private String site_name = "localsite";
45 private String DLS_SET = "dls";
46 private String DC_SET = "dublin";
47 private String OAI_URL = "";
48 private boolean status = false;
49 private String oai_Prefix ="";
50
51 String destination = "";
52
53 private int TotalDoc = 0;
54 private HashMap MDS_list = new HashMap();
55 public ArrayList metadataNameList = new ArrayList();
56 public String StatsDirectory;
57 public String HTMLDirectory;
58 private String collection_Name = null;
59 private String collection = null;
60 private final String DBType ="gdbm";
61 private final String fileSeparator = File.separator;
62 protected final String gsdl3Home = null;
63 MetadataElement me;
64
65 /*
66 The constructor (defined after deleteDir below) connects to the database and
67 retrieves information for the collection.
68 */
69
70 private static boolean deleteDir(File dir) {
71
72 if (dir.isDirectory()) {
73 String[] children = dir.list();
74 for (int i=0; i<children.length; i++) {
75 boolean success = deleteDir(new File(dir, children[i]));
76 if (!success) {
77 return false;
78 }
79 }
80 }
81 return dir.delete();
82 }
83
84 public MetadataStats(String site_home, String collection,String url,String oaiPrefix){
85
86 OAI_URL = url;
87 collection_Name = collection;
88 oai_Prefix = oaiPrefix;
89
90 try{
91 destination = globalProperty.getGSDL3Home()+fileSeparator+"mat"+fileSeparator+collection+fileSeparator;
92 }catch(Exception ex){
93 ex.printStackTrace();
94 }
95
96 HTMLDirectory = destination;
97 StatsDirectory = HTMLDirectory+"metadataStats"+fileSeparator;
98
99 if(new File(StatsDirectory).exists()){
100 deleteDir(new File(StatsDirectory));
101 }
102 new File(StatsDirectory).mkdirs();
103
104 String gdbm = GSFile.collectionDatabaseFile(site_home,collection,collection,DBType);
105
106 mr = new MessageRouter();
107 mr.setSiteName(this.site_name);
108 mr.configure();
109
110 this.collection = collection;
111 this.gdbm_src = new GDBMWrapper();
112 this.converter = new XMLConverter();
113 this.doc = this.converter.newDOM();
114 this.gspath = new GSPath();
115 this.globalProperty = new GlobalProperties();
116 Setup(gdbm);
117 }
118
119 public void getAvailableMetadataSets(String SetName,HashMap UsedMap, String SetAbbr){
120
121 String gsdl3Home = globalProperty.getGSDL3Home();
122 String os = "linux";
123
124 if(System.getProperty("os.name").toLowerCase().indexOf("windows")!=-1){
125 gsdl3Home = gsdl3Home.replaceAll("\\\\", "/");
126 os = "windows";
127 }
128 String metadataSetHome = gspath.removeLastLink(gsdl3Home)+fileSeparator+"gli"+fileSeparator+"metadata";
129 if(os.equals("windows")){
130 metadataSetHome = metadataSetHome.replaceAll("/", "\\\\");
131 }
132 File metadata_directory = new File(metadataSetHome);
133
134 if (metadata_directory.exists()) {
135
136 File[] directory_files = metadata_directory.listFiles();
137
138 for (int i = 0; i < directory_files.length; i++) {
139 File child_file = directory_files[i];
140
141 if (!child_file.isDirectory() && child_file.getName().endsWith("mds")) {
142 String fileName = child_file.getName();
143
144 if(!MDS_list.containsKey(SetName) && fileName.equals(SetName+".mds")){
145
146 MetadataSet ms = new MetadataSet();
147 ms.setName(SetName);
148 ms.setAbb(SetAbbr);
149 converter.newDOM();
150
151 Document d = converter.getDOM(child_file);
152 NodeList e = d.getElementsByTagName("Element");
153
154 int length = e.getLength();
155
156 for(int y = 0; y<length; y++){
157 Node temp = e.item(y);
158 NamedNodeMap mmp = temp.getAttributes();
159
160 if(!mmp.item(0).getNodeValue().equals("Description")){
161 MetadataElement me = new MetadataElement();
162 me.setMetadataName((SetAbbr+"."+mmp.item(0).getNodeValue()));
163
164 if(!metadataNameList.contains(me.getMetadataName())){
165 metadataNameList.add(me.getMetadataName());
166 ms.addIndex(me.getMetadataName());
167 }
168 UsedMap.put(me.getMetadataName(),me);
169 }
170 }
171 MDS_list.put(ms.getName(), ms);
172 }
173 }
174 }
175 }
176 }
177
178 private void parseMetadata(ArrayList doc_list, String MetadataElement){
179
180 int counter = 0;
181 int docSize = doc_list.size();
182
183 while(true){
184
185 Node message = this.doc.createElement(GSXML.MESSAGE_ELEM);
186 Node request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS,collection+"/DocumentMetadataRetrieve","en", "");
187 Node param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
188 param_list.appendChild(GSXML.createParameter(this.doc, "metadata", "all"));
189 Node documentNode_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
190
191 if(docSize>=300){
192 docSize = docSize - 300;
193 for(int i = 0; i<300; i++){
194 Element documentNode = this.doc.createElement(GSXML.DOC_NODE_ELEM);
195 documentNode.setAttribute("nodeID",(String)doc_list.get(counter));
196 documentNode_list.appendChild(documentNode);
197 counter++;
198 }
199
200 request.appendChild(param_list);
201 request.appendChild(documentNode_list);
202 message.appendChild(request);
203 Node response = mr.process(message);
204 getAttributes(response);
205 response = null;
206 request = null;
207 param_list = null;
208 documentNode_list = null;
209 }
210
211 else if(docSize<300){
212 for(int i = 0; i<docSize; i++){
213 Element documentNode = this.doc.createElement(GSXML.DOC_NODE_ELEM);
214 documentNode.setAttribute("nodeID",(String)doc_list.get(counter));
215 documentNode_list.appendChild(documentNode);
216 counter++;
217 }
218
219 request.appendChild(param_list);
220 request.appendChild(documentNode_list);
221 message.appendChild(request);
222 Node response = mr.process(message);
223 getAttributes(response);
224 response = null;
225 request = null;
226 param_list = null;
227 documentNode_list = null;
228 break;
229 }
230 }
231 }
232
233 public boolean getStatus(){
234 return this.status;
235 }
236
237 private void Setup(String gdbm){
238
239 if (!this.gdbm_src.openDatabase(gdbm,GDBMWrapper.READ)) {
240 System.out.println("Could not open GDBM database!");
241 }
242
243 else{
244 String info = this.gdbm_src.getValue("browselist");
245
246 if (info == null) {
247 System.out.println("cannot locate the list");
248 }
249 else{
250 if (info == null) {
251 System.out.println("the db does not contain any info");
252 }
253
254 ArrayList children = new ArrayList();
255 StringTokenizer st = new StringTokenizer(info, ";");
256
257 while (st.hasMoreTokens()) {
258 String part = st.nextToken(";");
259
260 if(part.indexOf("<contains>")!=-1){
261 part = part.replaceAll("<contains>", "");
262 }
263 else if(part.indexOf("<thistype>")!=-1){
264 int location = part.indexOf("<thistype>");
265 part = part.substring(0,location-1);
266 }
267 children.add(part);
268 }
269
270 gdbm_src.closeDatabase();
271 parseMetadata(children,"all");
272 TotalDoc = children.size();
273 System.out.println("Total Doc:"+TotalDoc);
274 status = true;
275 }
276 }
277 }
278
279 public void getAttributes(Node f){
280
281 HashMap UsedMap = new HashMap();
282 Element e = (Element) f;
283 NodeList metadataNode = e.getElementsByTagName("metadata");
284 int length = metadataNode.getLength();
285
286 for (int j = 0; j < length; j++){
287 Node aNode = metadataNode.item(j);
288 NamedNodeMap NodeMap = aNode.getAttributes();
289 Node AttributeNode = NodeMap.item(0);
290 String att_name = AttributeNode.getNodeValue();
291 if(att_name.indexOf("dls.")!=-1){getAvailableMetadataSets(DLS_SET,UsedMap,"dls");}
292 else if(att_name.indexOf("dc.")!=-1){getAvailableMetadataSets(DC_SET,UsedMap,"dc");}
293 }
294
295 adjust(UsedMap,e);
296 UsedMap = null;
297 e = null;
298 System.gc();
299 }
300
301
302 /*
303 * MATCH UP THE ELEMENTS FROM BOTH THE DOCUMENTS AND THE PRE-DEFINED METADATA SETS.
304 * MOVE AN ELEMENT FROM NOUSEDMAP TO USEDMAP IF IT HAS NOT BEEN DISCOVERED BEFORE.
305 * INCREASE THE COUNTER ONCE THE ELEMENT IS RECOGNIZED.
306 * CALCULATE HOW MANY TIMES EACH ELEMENT IS USED AND STORE IT IN THE LIST.
307 */
308
309 private void adjust(HashMap UsedMap, Element response){
310
311 String DocID;;
312 NodeList nList = response.getElementsByTagName("documentNode");
313 int length = nList.getLength();
314
315 for (int j = 0; j <length; j++){
316
317 Node aNode = nList.item(j);
318 NamedNodeMap NodeMap = aNode.getAttributes();
319 Node AttributeNode = NodeMap.item(0);
320 String att_name = AttributeNode.getNodeValue();
321 DocID = att_name;
322
323 NodeList childList = aNode.getChildNodes();
324 Node nNode = childList.item(0);
325 NodeList grandChildList = nNode.getChildNodes();
326
327 int length1 = grandChildList.getLength();
328
329 for(int e = 0; e<length1 ; e++){
330
331 Node xNode = grandChildList.item(e);
332 NamedNodeMap xNodeMap = xNode.getAttributes();
333 Node xAttributeNode = xNodeMap.item(0);
334 String xatt_name = xAttributeNode.getNodeValue();
335 NodeList cList = xNode.getChildNodes();
336 String ActValue = cList.item(0).getNodeValue();
337 char firstchart = xatt_name.charAt(0);
338
339 if(xatt_name.indexOf('^')!=-1){
340 xatt_name = xatt_name.replace('^', '.');
341 }
342
343 if(UsedMap.containsKey(xatt_name)){
344
345 MetadataElement met = (MetadataElement) UsedMap.get(xatt_name);
346 met.IncreaseFrequency();
347 HashMap metadataMap = met.getMetadataList();
348
349 if(metadataMap.containsKey(DocID)){
350 DocumentInfo dc = (DocumentInfo)metadataMap.get(DocID);
351 dc.IncreaseFrequence();
352 dc.addActualValue(ActValue);
353 metadataMap.put(DocID, dc);
354 }
355 else{
356 DocumentInfo dc = new DocumentInfo();
357 dc.IncreaseFrequence();
358 dc.setDocumentID(DocID);
359 dc.addActualValue(ActValue);
360 metadataMap.put(DocID, dc);
361 }
362
363 HashMap valueMap = met.getValueList();
364
365 if(valueMap.containsKey(ActValue)){
366 Integer f = (Integer)valueMap.get(ActValue);
367 int fx = f.intValue();
368 fx++;
369 valueMap.put(ActValue,new Integer(fx));
370 }
371 else{
372 valueMap.put(ActValue, new Integer(1));
373 }
374
375 }
376 else if( (!UsedMap.containsKey(xatt_name)) && xatt_name.equals("archivedir")){
377
378 String SetAbbr = "ex";
379 String SetName = "extracted";
380 MetadataSet ms = new MetadataSet();
381 ms.setName(SetName);
382 ms.setAbb(SetAbbr);
383
384 MetadataElement me = new MetadataElement();
385 me.setMetadataName(xatt_name);
386 me.IncreaseFrequency();
387 HashMap metadataMap = me.getMetadataList();
388
389 if(metadataMap.containsKey(DocID)){
390 DocumentInfo dc = (DocumentInfo)metadataMap.get(DocID);
391 dc.IncreaseFrequence();
392 dc.addActualValue(ActValue);
393 metadataMap.put(DocID, dc);
394 }
395 else{
396 DocumentInfo dc = new DocumentInfo();
397 dc.IncreaseFrequence();
398 dc.setDocumentID(DocID);
399 dc.addActualValue(ActValue);
400 metadataMap.put(DocID, dc);
401 }
402
403 HashMap valueMap = me.getValueList();
404 if(valueMap.containsKey(ActValue)){
405 Integer f = (Integer)valueMap.get(ActValue);
406 int fx = f.intValue();
407 fx++;
408 valueMap.put(ActValue,new Integer(fx));
409 }
410 else{
411
412 valueMap.put(ActValue, new Integer(1));
413 }
414 if(!metadataNameList.contains(me.getMetadataName())){
415 metadataNameList.add(me.getMetadataName());
416 ms.addIndex(me.getMetadataName());
417 }
418
419 UsedMap.put(me.getMetadataName(), me);
420 if(!MDS_list.containsKey(ms.getName())){
421 MDS_list.put(ms.getName(), ms);}
422 else{
423 MetadataSet msx = (MetadataSet)MDS_list.get(ms.getName());
424 msx.addIndex(me.getMetadataName());
425 MDS_list.put(ms.getName(), msx);
426 }
427 }
428 else if((!UsedMap.containsKey(xatt_name)) && xatt_name.indexOf(".")!=-1 && xatt_name.indexOf("dc.")==-1){
429 int dotLocation = xatt_name.indexOf(".");
430 String SetAbbr = xatt_name.substring(0,dotLocation);
431 String SetName = SetAbbr;
432
433 MetadataSet ms = new MetadataSet();
434 ms.setName(SetName);
435 ms.setAbb(SetAbbr);
436
437 MetadataElement me = new MetadataElement();
438 me.setMetadataName(xatt_name);
439 me.IncreaseFrequency();
440 HashMap metadataMap = me.getMetadataList();
441
442 if(metadataMap.containsKey(DocID)){
443 DocumentInfo dc = (DocumentInfo)metadataMap.get(DocID);
444 dc.IncreaseFrequence();
445 dc.addActualValue(ActValue);
446 metadataMap.put(DocID, dc);
447 }
448 else{
449 DocumentInfo dc = new DocumentInfo();
450 dc.IncreaseFrequence();
451 dc.setDocumentID(DocID);
452 dc.addActualValue(ActValue);
453 metadataMap.put(DocID, dc);
454 }
455
456 HashMap valueMap = me.getValueList();
457 if(valueMap.containsKey(ActValue)){
458 Integer f = (Integer)valueMap.get(ActValue);
459 int fx = f.intValue();
460 fx++;
461 valueMap.put(ActValue,new Integer(fx));
462 }
463 else{
464
465 valueMap.put(ActValue, new Integer(1));
466 }
467 if(!metadataNameList.contains(me.getMetadataName())){
468 metadataNameList.add(me.getMetadataName());
469 ms.addIndex(me.getMetadataName());
470 }
471 UsedMap.put(me.getMetadataName(), me);
472
473 if(!MDS_list.containsKey(ms.getName())){
474 MDS_list.put(ms.getName(), ms);
475 }
476 else{
477 MetadataSet msx = (MetadataSet)MDS_list.get(ms.getName());
478 msx.addIndex(me.getMetadataName());
479 MDS_list.put(ms.getName(), msx);
480 }
481 }
482
483 else if ((!UsedMap.containsKey(xatt_name)) && (xatt_name.indexOf("dc.")==0) && xatt_name.indexOf("dc.Description")!=0){
484
485 String SetName = "dublin";
486 xatt_name = xatt_name.replace('^', '.');
487
488 MetadataElement me = new MetadataElement();
489 me.setMetadataName(xatt_name);
490 me.IncreaseFrequency();
491 HashMap metadataMap = me.getMetadataList();
492
493 DocumentInfo dc = new DocumentInfo();
494 dc.IncreaseFrequence();
495 dc.setDocumentID(DocID);
496 dc.addActualValue(ActValue);
497 metadataMap.put(DocID, dc);
498
499 HashMap valueMap = me.getValueList();
500 valueMap.put(ActValue, new Integer(1));
501
502 if(!metadataNameList.contains(me.getMetadataName())){
503 metadataNameList.add(me.getMetadataName());
504 }
505
506 UsedMap.put(me.getMetadataName(), me);
507 MetadataSet msx = (MetadataSet)MDS_list.get(SetName);
508 msx.addIndex(me.getMetadataName());
509 MDS_list.put(SetName, msx);
510 }
511 }
512 }
513
514 int counter = 0;
515 Set s = UsedMap.keySet();
516 Iterator is = s.iterator();
517
518 while(is.hasNext()){
519 String fileName = (String)is.next();
520 MetadataElement me = (MetadataElement)UsedMap.get(fileName);
521 HashMap hp = me.getMetadataList();
522 Collection ks = hp.values();
523 Iterator iks = ks.iterator();
524
525 try{
526 DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
527 DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
528 Document doc = docBuilder.newDocument();
529 Element root = doc.createElement("root");
530 boolean fileExist = (new File(StatsDirectory+fileName+".xml")).exists();
531
532 if(!fileExist){
533 root = doc.createElement("root");
534 }
535 else{
536 doc = docBuilder.parse (new File(StatsDirectory+fileName+".xml"));
537 root = doc.getDocumentElement();
538 }
539
540 while(iks.hasNext()){
541 DocumentInfo dc = (DocumentInfo)iks.next();
542 Element docID = doc.createElement("Document");
543 docID.setAttribute("id", dc.getDocumentID());
544
545 Element freq = doc.createElement("Frequency");
546 Text text = doc.createTextNode(dc.getFrequence()+"");
547 freq.appendChild(text);
548 docID.appendChild(freq);
549
550 Element actValue = doc.createElement("ActualValue");
551 ArrayList alist = dc.getActualValue();
552
553 if(alist.size()==0){
554 text = doc.createTextNode(" ");
555 actValue = doc.createElement("ActualValue");
556 actValue.appendChild(text);
557 docID.appendChild(actValue);
558 }
559
560 for(int i = 0; i<alist.size(); i++){
561 actValue = doc.createElement("ActualValue");
562 String utf8String = new String(((String)alist.get(i)).getBytes(),"UTF-8");
563 Text text1 = doc.createTextNode(utf8String);
564 actValue.appendChild(text1);
565 docID.appendChild(actValue);
566 }
567
568 root.appendChild(docID);
569 docID = null;
570 counter++;
571 }
572
573 TransformerFactory tf= TransformerFactory.newInstance();
574 Transformer transformer= tf.newTransformer();
575 DOMSource source= new DOMSource(root);
576 transformer.setOutputProperty(OutputKeys.INDENT,"yes");
577
578 Writer pwx= new BufferedWriter(new OutputStreamWriter(new FileOutputStream(StatsDirectory+fileName+".xml"),"UTF-8"));
579 StreamResult result= new StreamResult(pwx);
580 transformer.transform(source,result);
581 pwx.close();
582
583 root = null;
584 docBuilderFactory = null;
585 docBuilder = null;
586 doc = null;
587
588 }catch (Exception e) {
589 System.out.println(e);
590 }
591 }
592 }
593
594 public int getDocNum(){
595 return TotalDoc;
596 }
597
598 public HashMap getMetadataSetMap(){
599 MDS_list.remove("extracted");
600 return (HashMap)MDS_list.clone();
601 }
602
603 public String getCollectionName(){
604 return collection_Name;
605 }
606
607 public void setOAIURL(String url){
608 OAI_URL = url;
609 }
610
611 public String getOAIURL(){
612 return OAI_URL;
613 }
614
615 public String getOaiPrefix(){
616 return oai_Prefix;
617 }
618
619 public ArrayList getMetadataNameList(){
620 return (ArrayList)metadataNameList.clone();
621 }
622 private String constructUpdateMessgae (){
623 String message = "<message><request type='system' to=''><system type='configure' subset=''/></request></message>";
624 return message;
625 }
626}
Note: See TracBrowser for help on using the repository browser.