source: other-projects/trunk/greenstone3-extension/mat/Greenstone3Project/src/org/greenstone3/ms/MetadataStats.java@ 17156

Last change on this file since 17156 was 17156, checked in by cc108, 16 years ago

Adding the project Metadata Quality for Digital Libraries into the repository

File size: 19.3 KB
Line 
1package org.greenstone3.ms;
2
3
4import org.w3c.dom.*;
5import javax.xml.parsers.*;
6import javax.xml.transform.*;
7import javax.xml.transform.dom.*;
8import javax.xml.transform.stream.*;
9
10
11import java.io.BufferedWriter;
12import java.io.File;
13import java.io.FileOutputStream;
14import java.io.OutputStreamWriter;
15import java.io.PrintWriter;
16import java.io.Writer;
17
18import java.util.ArrayList;
19import java.util.Collection;
20import java.util.HashMap;
21import java.util.Iterator;
22import java.util.Set;
23import java.util.StringTokenizer;
24
25import org.greenstone.gsdl3.core.MessageRouter;
26import org.greenstone.gsdl3.util.DBInfo;
27import org.greenstone.gsdl3.util.GDBMWrapper;
28import org.greenstone.gsdl3.util.GSFile;
29import org.greenstone.gsdl3.util.GSXML;
30import org.greenstone.gsdl3.util.XMLConverter;
31
32import org.w3c.dom.Document;
33import org.w3c.dom.Element;
34import org.w3c.dom.NamedNodeMap;
35import org.w3c.dom.Node;
36import org.w3c.dom.NodeList;
37
38
39public class MetadataStats {
40
41 private Document doc=null; // DOM document used as the factory for request message nodes
42 private MessageRouter mr = null; // router that processes the metadata-retrieve requests
43 private XMLConverter converter=null; // creates DOMs and parses the .mds metadata set files
44 private GDBMWrapper gdbm_src = null; // collection database, read in Setup for the "browselist" entry
45 private String site_name = "localsite"; // site name handed to the MessageRouter
46 private String node_id = "browselist"; // not referenced in the visible code
47 private String DLS_SET = "dls"; // metadata set name loaded when a "dls." element is seen
48 private String DC_SET = "dublin"; // metadata set name loaded when a "dc." element is seen
49 private String OAI_URL = ""; // set by the constructor / setOAIURL, returned by getOAIURL
50 private boolean status = false; // set to true at the end of a successful Setup
51 private String oai_Prefix =""; // set by the constructor, returned by getOaiPrefix
52 //private PrintWriter pw;
53 // servlet
54 final String destination = "/home/cc108/MRWks1/Greenstone3Project/"; // base directory under which the output directory is created
55 private int TotalDoc = 0; // number of document ids read from the browse list
56 //private int TotalElement = 0;
57
58 private HashMap MDS_list = new HashMap(); // metadata set name -> MetadataSet
59
60 private String collection = null; // collection name used as the request target prefix
61 public ArrayList metadataNameList = new ArrayList(); // names of all metadata elements registered so far
62 public String myNewDir; // output directory: destination + collection + "/metadataStats"
63 private String collection_Name = null; // collection name returned by getCollectionName
64
65
66 MetadataElement me; // not referenced in the visible code
67 /*
68 The constructor connects to the database and retrieves
69 the information for the collection.
70 */
71
72 private static boolean deleteDir(File dir) {
73 if (dir.isDirectory()) {
74 String[] children = dir.list();
75 for (int i=0; i<children.length; i++) {
76 boolean success = deleteDir(new File(dir, children[i]));
77 if (!success) {
78 return false;
79 }
80 }
81 }
82
83 return dir.delete();
84 }
85
86 public MetadataStats(String site_home, String collection,String url,String oaiPrefix){
87 OAI_URL = url;
88 collection_Name = collection;
89 oai_Prefix = oaiPrefix;
90 //pw = out;
91
92 //myNewDir = collection+"/metadataStats";
93 myNewDir =destination+collection+"/metadataStats";
94
95 if(new File(myNewDir).exists()){deleteDir(new File(myNewDir));}
96 new File(myNewDir).mkdirs();
97
98
99 String gdbm = GSFile.collectionDatabaseFile(site_home,collection,collection,"gdbm");
100
101 mr = new MessageRouter();
102 mr.setSiteName(this.site_name);
103 mr.configure();
104
105 this.collection = collection;
106 this.gdbm_src = new GDBMWrapper();
107 this.converter = new XMLConverter();
108 this.doc = this.converter.newDOM();
109 Setup(gdbm);
110 }
111
112 public void getAvailableMetadataSets(String SetName,HashMap UsedMap, String SetAbbr){
113
114 File metadata_directory = new File("/research/cc108/greenstone3/gli/metadata");
115 //File metadata_directory = new File("/greenstone3/gli/metadata");
116 //File metadata_directory = new File("C://Program Files//Greenstone3//gli//metadata");
117
118 if (metadata_directory.exists()) {
119 // Load just those .mds files in this directory, and return them
120 File[] directory_files = metadata_directory.listFiles();
121
122 for (int i = 0; i < directory_files.length; i++) {
123 File child_file = directory_files[i];
124
125 if (!child_file.isDirectory() && child_file.getName().endsWith("mds")) {
126 String fileName = child_file.getName();
127
128 /////////////////////////
129 //LOADING DLS CORE
130 //---if(fileName.equals(SetName)&& fileName.equals("dls.mds") && DLS_STATUS){
131 if(!MDS_list.containsKey(SetName)&& fileName.equals(SetName+".mds")){
132
133 MetadataSet ms = new MetadataSet();
134 ms.setName(SetName);
135 ms.setAbb(SetAbbr);
136
137 converter.newDOM();
138 Document d = converter.getDOM(child_file);
139
140 NodeList e = d.getElementsByTagName("Element");
141
142 int length = e.getLength();
143
144 for(int y = 0; y<length; y++){
145 Node temp = e.item(y);
146 NamedNodeMap mmp = temp.getAttributes();
147
148 if(!mmp.item(0).getNodeValue().equals("dc.Description")){
149 MetadataElement me = new MetadataElement();
150 //me.setMetadataName((SetAbbr+"."+mmp.item(0).getNodeValue().toLowerCase()));
151 me.setMetadataName((SetAbbr+"."+mmp.item(0).getNodeValue()));
152 //System.out.println(me.getMetadataName());
153 //.child_file............child_file
154 if(!metadataNameList.contains(me.getMetadataName())){
155 metadataNameList.add(me.getMetadataName());
156 ms.addIndex(me.getMetadataName());
157 }
158 UsedMap.put(me.getMetadataName(),me);
159 }
160 }
161
162 MDS_list.put(ms.getName(), ms);
163
164 }// DLS FINISH
165 }
166 }
167 }
168 }
169
170 // doc_list is the list of document ids
171 //MetadataElement represents which metadata element we want to explore
172 private void parseMetadata(ArrayList doc_list, String MetadataElement){
173 //MetadataElement = "dc.contributor";
174 //System.out.println("parse metadata starts");
175 int counter = 0;
176 //int x = 0;
177 int docSize = doc_list.size();
178
179 System.out.print("<!-- prepare request -->");
180 while(true){
181
182 Node message = this.doc.createElement(GSXML.MESSAGE_ELEM);
183
184 Node request = GSXML.createBasicRequest(
185 this.doc, GSXML.REQUEST_TYPE_PROCESS,collection+"/DocumentMetadataRetrieve","en", "");
186
187 Node param_list = this.doc.createElement(
188 GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
189
190 param_list.appendChild(GSXML.createParameter(
191 this.doc, "metadata", "all"));
192 //metadatalist-dc
193 Node documentNode_list = this.doc.createElement(
194 GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
195
196 if(docSize>=300){
197 docSize = docSize - 300;
198 for(int i = 0; i<300; i++){
199 Element documentNode = this.doc.createElement(GSXML.DOC_NODE_ELEM);
200 documentNode.setAttribute("nodeID",(String)doc_list.get(counter));
201 documentNode_list.appendChild(documentNode);
202 counter++;
203 }
204 request.appendChild(param_list);
205 request.appendChild(documentNode_list);
206 message.appendChild(request);
207 Node response = mr.process(message);
208 getAttributes(response);
209
210 response = null;
211 request = null;
212 param_list = null;
213 documentNode_list = null;
214 //System.gc();
215 }
216
217 else if(docSize<300){
218 for(int i = 0; i<docSize; i++){
219 Element documentNode = this.doc.createElement(GSXML.DOC_NODE_ELEM);
220 documentNode.setAttribute("nodeID",(String)doc_list.get(counter));
221 documentNode_list.appendChild(documentNode);
222 counter++;
223 }
224 request.appendChild(param_list);
225 request.appendChild(documentNode_list);
226 message.appendChild(request);
227
228 Node response = mr.process(message);
229 getAttributes(response);
230 response = null;
231 request = null;
232 param_list = null;
233 documentNode_list = null;
234
235 break;
236 }
237 }
238 }
239
240 public boolean getStatus(){
241 return this.status;
242 }
243
244
245 private void Setup(String gdbm){
246
247
248 if (!this.gdbm_src.openDatabase(gdbm,GDBMWrapper.READ)) {
249 System.out.println("Could not open GDBM database!");
250
251 }
252
253 else{
254
255 String info = this.gdbm_src.getValue("browselist");
256
257 if (info == null) {
258 System.out.println("cannot locate the list");
259 }
260 else{
261
262 if (info == null) {
263 System.out.println("the db does not contain any info");
264 }
265
266 ArrayList children = new ArrayList();
267 StringTokenizer st = new StringTokenizer(info, ";");
268
269 while (st.hasMoreTokens()) {
270 String part = st.nextToken(";");
271
272
273
274 if(part.contains("<contains>")){
275 part = part.replace("<contains>", "");
276 }
277 else if(part.contains("<thistype>")){
278
279 int location = part.indexOf("<thistype>");
280 part = part.substring(0,location-1);
281 }
282 children.add(part);
283 System.out.println("..."+part+".....2");
284
285 }
286
287 gdbm_src.closeDatabase();
288 parseMetadata(children,"all");
289 TotalDoc = children.size();
290 System.out.println("Total Doc22222:"+TotalDoc);
291 status = true;
292 }
293 }
294 }
295
296
297 public void getAttributes(Node f){
298
299 HashMap UsedMap = new HashMap();
300 Element e = (Element) f;
301 NodeList metadataNode = e.getElementsByTagName("metadata");
302 int length = metadataNode.getLength();
303
304 for (int j = 0; j < length; j++){
305 Node aNode = metadataNode.item(j);
306 NamedNodeMap NodeMap = aNode.getAttributes();
307 Node AttributeNode = NodeMap.item(0);
308 String att_name = AttributeNode.getNodeValue();
309
310 if(att_name.indexOf("dls.")!=-1){getAvailableMetadataSets(DLS_SET,UsedMap,"dls");}
311 else if(att_name.indexOf("dc.")!=-1){getAvailableMetadataSets(DC_SET,UsedMap,"dc");}
312
313 }
314
315 adjust(UsedMap,e);
316 UsedMap = null;
317 e = null;
318 System.gc();
319 }
320
321
322 /*
323 * MATCH UP THE ELEMENTS FROM BOTH DOCUMENTS AND PRE-DEFINED METADATA SET
324 * MOVE THE ELEMENT FROM NOUSEDMAP TO USEDMAP IF THE ELEMENT HAS NOT BEEN DISCOVERED BEFORE
325 * INCREASE THE COUNTER ONCE THE ELEMENT IS RECONGNIZED
326 * CALCULATE THE TIMES OF ELEMENTS USED AND STORE IT INTO THE LIST
327 */
328
329 private void adjust(HashMap UsedMap, Element response){
330
331
332 String DocID;;
333 NodeList nList = response.getElementsByTagName("documentNode");
334 int length = nList.getLength();
335 for (int j = 0; j <length; j++){
336
337 Node aNode = nList.item(j);
338 NamedNodeMap NodeMap = aNode.getAttributes();
339 Node AttributeNode = NodeMap.item(0);
340 String att_name = AttributeNode.getNodeValue();
341 DocID = att_name;
342
343 NodeList childList = aNode.getChildNodes();
344 Node nNode = childList.item(0);
345 NodeList grandChildList = nNode.getChildNodes();
346
347 int length1 = grandChildList.getLength();
348
349 for(int e = 0; e<length1 ; e++){
350
351 Node xNode = grandChildList.item(e);
352 NamedNodeMap xNodeMap = xNode.getAttributes();
353 Node xAttributeNode = xNodeMap.item(0);
354 String xatt_name = xAttributeNode.getNodeValue();
355
356 NodeList cList = xNode.getChildNodes();
357
358 String ActValue = cList.item(0).getNodeValue();
359
360
361 char firstchart = xatt_name.charAt(0);
362
363
364 if(xatt_name.indexOf('^')!=-1){
365 xatt_name = xatt_name.replace('^', '.');
366 }
367
368 if(UsedMap.containsKey(xatt_name)){
369
370 MetadataElement met = (MetadataElement) UsedMap.get(xatt_name);
371 met.IncreaseFrequency();
372 HashMap metadataMap = met.getMetadataList();
373
374 if(metadataMap.containsKey(DocID)){
375
376 DocumentInfo dc = (DocumentInfo)metadataMap.get(DocID);
377 dc.IncreaseFrequence();
378 dc.addActualValue(ActValue);
379 metadataMap.put(DocID, dc);
380 }
381 else{
382
383 DocumentInfo dc = new DocumentInfo();
384 dc.IncreaseFrequence();
385 dc.setDocumentID(DocID);
386 dc.addActualValue(ActValue);
387 metadataMap.put(DocID, dc);
388 }
389
390 HashMap valueMap = met.getValueList();
391 if(valueMap.containsKey(ActValue)){
392 Integer f = (Integer)valueMap.get(ActValue);
393 int fx = f.intValue();
394 fx++;
395 valueMap.put(ActValue,new Integer(fx));
396 }
397 else{
398 valueMap.put(ActValue, new Integer(1));
399 }
400
401 }
402 //else if ((!UsedMap.containsKey(xatt_name)) && (xatt_name.indexOf(".")==-1) && (!xatt_name.equals("prettymd")) && (!xatt_name.equals("Description")) && (Character.isLetter(firstchart)) && (xatt_name.indexOf("-")==-1)){
403 else if( (!UsedMap.containsKey(xatt_name)) && xatt_name.equals("archivedir")){
404 //pw.write("NEW EX element");
405 String SetAbbr = "ex";
406 String SetName = "extracted";
407
408 MetadataSet ms = new MetadataSet();
409 ms.setName(SetName);
410 ms.setAbb(SetAbbr);
411
412 MetadataElement me = new MetadataElement();
413 me.setMetadataName(xatt_name);
414 me.IncreaseFrequency();
415 HashMap metadataMap = me.getMetadataList();
416
417 if(metadataMap.containsKey(DocID)){
418 DocumentInfo dc = (DocumentInfo)metadataMap.get(DocID);
419 dc.IncreaseFrequence();
420 dc.addActualValue(ActValue);
421 metadataMap.put(DocID, dc);
422 }
423 else{
424 DocumentInfo dc = new DocumentInfo();
425 dc.IncreaseFrequence();
426 dc.setDocumentID(DocID);
427 dc.addActualValue(ActValue);
428 metadataMap.put(DocID, dc);
429 }
430
431 HashMap valueMap = me.getValueList();
432 if(valueMap.containsKey(ActValue)){
433 Integer f = (Integer)valueMap.get(ActValue);
434 int fx = f.intValue();
435 fx++;
436 valueMap.put(ActValue,new Integer(fx));
437 }
438 else{
439
440 valueMap.put(ActValue, new Integer(1));
441 }
442 if(!metadataNameList.contains(me.getMetadataName())){
443 metadataNameList.add(me.getMetadataName());
444 ms.addIndex(me.getMetadataName());
445 }
446
447 UsedMap.put(me.getMetadataName(), me);
448 if(!MDS_list.containsKey(ms.getName())){MDS_list.put(ms.getName(), ms);}
449 else{
450 MetadataSet msx = (MetadataSet)MDS_list.get(ms.getName());
451 msx.addIndex(me.getMetadataName());
452 MDS_list.put(ms.getName(), msx);
453 }
454 }
455
456
457 else if((!UsedMap.containsKey(xatt_name)) && xatt_name.indexOf(".")!=-1 && xatt_name.indexOf("dc.")==-1){
458 //pw.write("NEW EX element");
459 int dotLocation = xatt_name.indexOf(".");
460 String SetAbbr = xatt_name.substring(0,dotLocation);
461 String SetName = SetAbbr;
462
463 MetadataSet ms = new MetadataSet();
464 ms.setName(SetName);
465 ms.setAbb(SetAbbr);
466
467 MetadataElement me = new MetadataElement();
468 me.setMetadataName(xatt_name);
469 me.IncreaseFrequency();
470 HashMap metadataMap = me.getMetadataList();
471
472 if(metadataMap.containsKey(DocID)){
473 DocumentInfo dc = (DocumentInfo)metadataMap.get(DocID);
474 dc.IncreaseFrequence();
475 dc.addActualValue(ActValue);
476 metadataMap.put(DocID, dc);
477 }
478 else{
479 DocumentInfo dc = new DocumentInfo();
480 dc.IncreaseFrequence();
481 dc.setDocumentID(DocID);
482 dc.addActualValue(ActValue);
483 metadataMap.put(DocID, dc);
484 }
485
486 HashMap valueMap = me.getValueList();
487 if(valueMap.containsKey(ActValue)){
488 Integer f = (Integer)valueMap.get(ActValue);
489 int fx = f.intValue();
490 fx++;
491 valueMap.put(ActValue,new Integer(fx));
492 }
493 else{
494
495 valueMap.put(ActValue, new Integer(1));
496 }
497 if(!metadataNameList.contains(me.getMetadataName())){
498 metadataNameList.add(me.getMetadataName());
499 ms.addIndex(me.getMetadataName());
500 }
501 //pw.write("adding element");
502 UsedMap.put(me.getMetadataName(), me);
503 if(!MDS_list.containsKey(ms.getName())){MDS_list.put(ms.getName(), ms);}
504 else{
505 MetadataSet msx = (MetadataSet)MDS_list.get(ms.getName());
506 msx.addIndex(me.getMetadataName());
507 MDS_list.put(ms.getName(), msx);
508 }
509 }
510
511 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
512 else if ((!UsedMap.containsKey(xatt_name)) && (xatt_name.indexOf("dc.")==0) && xatt_name.indexOf("dc.Description")!=0){
513
514 //pw.write("QUALIFIED DC element");
515
516 String SetName = "dublin";
517 xatt_name = xatt_name.replace('^', '.');
518
519 MetadataElement me = new MetadataElement();
520 me.setMetadataName(xatt_name);
521 me.IncreaseFrequency();
522 HashMap metadataMap = me.getMetadataList();
523
524 DocumentInfo dc = new DocumentInfo();
525 dc.IncreaseFrequence();
526 dc.setDocumentID(DocID);
527 dc.addActualValue(ActValue);
528 metadataMap.put(DocID, dc);
529
530 HashMap valueMap = me.getValueList();
531 valueMap.put(ActValue, new Integer(1));
532
533 if(!metadataNameList.contains(me.getMetadataName())){
534 metadataNameList.add(me.getMetadataName());
535 }
536
537 //pw.write("adding element");
538 UsedMap.put(me.getMetadataName(), me);
539
540 MetadataSet msx = (MetadataSet)MDS_list.get(SetName);
541 msx.addIndex(me.getMetadataName());
542 MDS_list.put(SetName, msx);
543 }
544 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
545 }// element
546
547 }//doc id
548 ////////////////////
549 //write file
550 int counter = 0;
551
552
553 Set s = UsedMap.keySet();
554 Iterator is = s.iterator();
555
556 while(is.hasNext()){
557
558 String fileName = (String)is.next();
559 MetadataElement me = (MetadataElement)UsedMap.get(fileName);
560 HashMap hp = me.getMetadataList();
561 Collection ks = hp.values();
562 Iterator iks = ks.iterator();
563
564 try{
565
566 DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
567 DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
568 Document doc = docBuilder.newDocument();
569 Element root = doc.createElement("root");
570 boolean fileExist = (new File(myNewDir+"/"+fileName+".xml")).exists();
571
572 if(!fileExist){
573 root = doc.createElement("root");
574 }
575 else{
576 doc = docBuilder.parse (new File(myNewDir+"/"+fileName+".xml"));
577 root = doc.getDocumentElement();
578 }
579
580 while(iks.hasNext()){
581
582 DocumentInfo dc = (DocumentInfo)iks.next();
583 Element docID = doc.createElement("Document");
584 docID.setAttribute("id", dc.getDocumentID());
585 //pw.write("doc id"+dc.getDocumentID());
586
587 Element freq = doc.createElement("Frequency");
588 Text text = doc.createTextNode(dc.getFrequence()+"");
589 freq.appendChild(text);
590 docID.appendChild(freq);
591
592 Element actValue = doc.createElement("ActualValue");
593 ArrayList alist = dc.getActualValue();
594 if(alist.size()==0){
595 text = doc.createTextNode(" ");
596 actValue = doc.createElement("ActualValue");
597 actValue.appendChild(text);
598 docID.appendChild(actValue);
599 }
600
601 for(int i = 0; i<alist.size(); i++){
602 actValue = doc.createElement("ActualValue");
603 String utf8String = new String(((String)alist.get(i)).getBytes(),"UTF-8");
604 Text text1 = doc.createTextNode(utf8String);
605 actValue.appendChild(text1);
606 docID.appendChild(actValue);
607 }
608
609 root.appendChild(docID);
610 docID = null;
611 counter++;
612 }
613
614 TransformerFactory tf=TransformerFactory.newInstance();
615 Transformer transformer=tf.newTransformer();
616 DOMSource source=new DOMSource(root);
617 transformer.setOutputProperty(OutputKeys.INDENT,"yes");
618
619 Writer pwx= new BufferedWriter(new OutputStreamWriter(new FileOutputStream(myNewDir+"/"+fileName+".xml"),"UTF-8"));
620 StreamResult result= new StreamResult(pwx);
621 transformer.transform(source,result);
622 pwx.close();
623
624 root = null;
625 docBuilderFactory = null;
626 docBuilder = null;
627 doc = null;
628
629 }catch (Exception e) {
630 System.out.println(e);
631 //e.printStackTrace(pw);
632 }
633 }
634 }
635
636 public int getDocNum(){
637 return TotalDoc;
638 }
639
640 public HashMap getMetadataSetMap(){
641 MDS_list.remove("extracted");
642 return (HashMap)MDS_list.clone();
643 }
644
645 public String getCollectionName(){
646 return collection_Name;
647 }
648
649 public void setOAIURL(String url){
650 OAI_URL = url;
651 }
652
653 public String getOAIURL(){
654 return OAI_URL;
655 }
656
657 public String getOaiPrefix(){
658 return oai_Prefix;
659 }
660
661 public ArrayList getMetadataNameList(){
662 return (ArrayList)metadataNameList.clone();
663 }
664 private String constructUpdateMessgae (){
665 String message = "<message><request type='system' to=''><system type='configure' subset=''/></request></message>";
666 return message;
667 }
668}
Note: See TracBrowser for help on using the repository browser.