source: other-projects/trunk/greenstone3-extension/mat/org/greenstone/gsdl3_extension/mat/MetadataStats.java@ 17358

Last change on this file since 17358 was 17358, checked in by cc108, 16 years ago

Updating Mat Source Code

File size: 18.7 KB
Line 
1package org.greenstone.gsdl3_extension.mat;
2
3
4import org.w3c.dom.*;
5import javax.xml.parsers.*;
6import javax.xml.transform.*;
7import javax.xml.transform.dom.*;
8import javax.xml.transform.stream.*;
9
10
11import java.io.BufferedWriter;
12import java.io.File;
13import java.io.FileOutputStream;
14import java.io.OutputStreamWriter;
15import java.io.Writer;
16
17import java.util.ArrayList;
18import java.util.Collection;
19import java.util.HashMap;
20import java.util.Iterator;
21import java.util.Set;
22import java.util.StringTokenizer;
23
24import org.greenstone.gsdl3.core.MessageRouter;
25import org.greenstone.gsdl3.util.GDBMWrapper;
26import org.greenstone.gsdl3.util.GSFile;
27import org.greenstone.gsdl3.util.GSXML;
28import org.greenstone.gsdl3.util.XMLConverter;
29import org.greenstone.gsdl3.util.GlobalProperties;
30import org.greenstone.gsdl3.util.GSPath;
31
32import org.w3c.dom.Document;
33import org.w3c.dom.Element;
34import org.w3c.dom.NamedNodeMap;
35import org.w3c.dom.Node;
36import org.w3c.dom.NodeList;
37
38public class MetadataStats {
39
40 private Document doc=null;
41 private MessageRouter mr = null;
42 private XMLConverter converter=null;
43 private GDBMWrapper gdbm_src = null;
44 private GSPath gspath = null;
45 private GlobalProperties globalProperty = null;
46 private String site_name = "localsite";
47 private String node_id = "browselist";
48 private String DLS_SET = "dls";
49 private String DC_SET = "dublin";
50 private String OAI_URL = "";
51 private boolean status = false;
52 private String oai_Prefix ="";
53
54 // servlet
55 String destination = "";
56
57 private int TotalDoc = 0;
58 private HashMap MDS_list = new HashMap();
59 public ArrayList metadataNameList = new ArrayList();
60 public String StatsDirectory;
61 public String HTMLDirectory;
62 private String collection_Name = null;
63 private String collection = null;
64
65 MetadataElement me;
66
67 /*
68 The constructor connects to the database and retrieve
69 information for the collection
70 */
71
72 private static boolean deleteDir(File dir) {
73
74 if (dir.isDirectory()) {
75 String[] children = dir.list();
76 for (int i=0; i<children.length; i++) {
77 boolean success = deleteDir(new File(dir, children[i]));
78 if (!success) {
79 return false;
80 }
81 }
82 }
83 return dir.delete();
84 }
85
86 public MetadataStats(String site_home, String collection,String url,String oaiPrefix){
87
88 OAI_URL = url;
89 collection_Name = collection;
90 oai_Prefix = oaiPrefix;
91
92 File current_dir = new File (".");
93 try{
94 destination = globalProperty.getGSDL3Home()+"/Collection_Analysis/";
95 }catch(Exception ex){
96 ex.printStackTrace();
97 }
98
99 HTMLDirectory = destination+collection+"/";
100 System.out.println(HTMLDirectory);
101 StatsDirectory =HTMLDirectory+"metadataStats/";
102 System.out.println(StatsDirectory);
103
104 if(new File(StatsDirectory).exists()){
105 deleteDir(new File(StatsDirectory));
106 }
107 new File(StatsDirectory).mkdirs();
108
109 String gdbm = GSFile.collectionDatabaseFile(site_home,collection,collection,"gdbm");
110
111 mr = new MessageRouter();
112 mr.setSiteName(this.site_name);
113 mr.configure();
114
115 this.collection = collection;
116 this.gdbm_src = new GDBMWrapper();
117 this.converter = new XMLConverter();
118 this.doc = this.converter.newDOM();
119 this.gspath = new GSPath();
120 this.globalProperty = new GlobalProperties();
121 Setup(gdbm);
122 }
123
124 public void getAvailableMetadataSets(String SetName,HashMap UsedMap, String SetAbbr){
125
126 String gsdl3Home = globalProperty.getGSDL3Home();
127 //System.out.println(gsdl3Home);
128 String metadataSetHome = gspath.removeLastLink(gsdl3Home)+"/gli/metadata";
129 //System.out.println(metadataSetHome);
130 //System.out.println(File.separator);
131 File metadata_directory = new File(metadataSetHome);
132
133 if (metadata_directory.exists()) {
134
135 File[] directory_files = metadata_directory.listFiles();
136
137 for (int i = 0; i < directory_files.length; i++) {
138 File child_file = directory_files[i];
139
140 if (!child_file.isDirectory() && child_file.getName().endsWith("mds")) {
141 String fileName = child_file.getName();
142
143 /////////////////////////
144 //LOADING DLS CORE
145 //---if(fileName.equals(SetName)&& fileName.equals("dls.mds") && DLS_STATUS){
146
147 if(!MDS_list.containsKey(SetName) && fileName.equals(SetName+".mds")){
148
149 MetadataSet ms = new MetadataSet();
150 ms.setName(SetName);
151 ms.setAbb(SetAbbr);
152 converter.newDOM();
153
154 Document d = converter.getDOM(child_file);
155 NodeList e = d.getElementsByTagName("Element");
156
157 int length = e.getLength();
158
159 for(int y = 0; y<length; y++){
160 Node temp = e.item(y);
161 NamedNodeMap mmp = temp.getAttributes();
162
163 if(!mmp.item(0).getNodeValue().equals("Description")){
164 MetadataElement me = new MetadataElement();
165 me.setMetadataName((SetAbbr+"."+mmp.item(0).getNodeValue()));
166
167 if(!metadataNameList.contains(me.getMetadataName())){
168 metadataNameList.add(me.getMetadataName());
169 ms.addIndex(me.getMetadataName());
170 }
171 UsedMap.put(me.getMetadataName(),me);
172 }
173 }
174 MDS_list.put(ms.getName(), ms);
175 }// DLS FINISH
176 }
177 }
178 }
179 }
180
181 // doc_list is the list of document ids
182 //MetadataElement represents which metadata element we want to explore
183
184 private void parseMetadata(ArrayList doc_list, String MetadataElement){
185
186 int counter = 0;
187 int docSize = doc_list.size();
188 System.out.println("<!-- prepare request -->");
189
190 while(true){
191
192 Node message = this.doc.createElement(GSXML.MESSAGE_ELEM);
193 Node request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS,collection+"/DocumentMetadataRetrieve","en", "");
194 Node param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
195 param_list.appendChild(GSXML.createParameter(this.doc, "metadata", "all"));
196 Node documentNode_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
197
198 if(docSize>=300){
199
200 docSize = docSize - 300;
201
202 for(int i = 0; i<300; i++){
203 Element documentNode = this.doc.createElement(GSXML.DOC_NODE_ELEM);
204 documentNode.setAttribute("nodeID",(String)doc_list.get(counter));
205 documentNode_list.appendChild(documentNode);
206 counter++;
207 }
208
209 request.appendChild(param_list);
210 request.appendChild(documentNode_list);
211 message.appendChild(request);
212 Node response = mr.process(message);
213 getAttributes(response);
214 response = null;
215 request = null;
216 param_list = null;
217 documentNode_list = null;
218 }
219
220 else if(docSize<300){
221
222 for(int i = 0; i<docSize; i++){
223 Element documentNode = this.doc.createElement(GSXML.DOC_NODE_ELEM);
224 documentNode.setAttribute("nodeID",(String)doc_list.get(counter));
225 documentNode_list.appendChild(documentNode);
226 counter++;
227 }
228
229 request.appendChild(param_list);
230 request.appendChild(documentNode_list);
231 message.appendChild(request);
232 System.out.println(this.converter.getPrettyString(message)+" ends");
233 Node response = mr.process(message);
234 System.out.println(this.converter.getPrettyString(response)+" ends");
235 getAttributes(response);
236 response = null;
237 request = null;
238 param_list = null;
239 documentNode_list = null;
240 break;
241 }
242 }
243 }
244
245 public boolean getStatus(){
246 return this.status;
247 }
248
249 private void Setup(String gdbm){
250
251 if (!this.gdbm_src.openDatabase(gdbm,GDBMWrapper.READ)) {
252 System.out.println("Could not open GDBM database!");
253 }
254
255 else{
256 String info = this.gdbm_src.getValue("browselist");
257
258 if (info == null) {
259 System.out.println("cannot locate the list");
260 }
261 else{
262 if (info == null) {
263 System.out.println("the db does not contain any info");
264 }
265
266 ArrayList children = new ArrayList();
267 StringTokenizer st = new StringTokenizer(info, ";");
268
269 while (st.hasMoreTokens()) {
270 String part = st.nextToken(";");
271
272 if(part.contains("<contains>")){
273 part = part.replace("<contains>", "");
274 }
275 else if(part.contains("<thistype>")){
276 int location = part.indexOf("<thistype>");
277 part = part.substring(0,location-1);
278 }
279 children.add(part);
280 }
281
282 gdbm_src.closeDatabase();
283 parseMetadata(children,"all");
284 TotalDoc = children.size();
285 System.out.println("Total Doc:"+TotalDoc);
286 status = true;
287 }
288 }
289 }
290
291 public void getAttributes(Node f){
292
293 HashMap UsedMap = new HashMap();
294 Element e = (Element) f;
295 NodeList metadataNode = e.getElementsByTagName("metadata");
296 int length = metadataNode.getLength();
297
298 for (int j = 0; j < length; j++){
299 Node aNode = metadataNode.item(j);
300 NamedNodeMap NodeMap = aNode.getAttributes();
301 Node AttributeNode = NodeMap.item(0);
302 String att_name = AttributeNode.getNodeValue();
303 if(att_name.indexOf("dls.")!=-1){getAvailableMetadataSets(DLS_SET,UsedMap,"dls");}
304 else if(att_name.indexOf("dc.")!=-1){getAvailableMetadataSets(DC_SET,UsedMap,"dc");}
305 }
306
307 adjust(UsedMap,e);
308 UsedMap = null;
309 e = null;
310 System.gc();
311 }
312
313
314 /*
315 * MATCH UP THE ELEMENTS FROM BOTH DOCUMENTS AND PRE-DEFINED METADATA SET
316 * MOVE THE ELEMENT FROM NOUSEDMAP TO USEDMAP IF THE ELEMENT HAS NOT BEEN DISCOVERED BEFORE
317 * INCREASE THE COUNTER ONCE THE ELEMENT IS RECONGNIZED
318 * CALCULATE THE TIMES OF ELEMENTS USED AND STORE IT INTO THE LIST
319 */
320
321 private void adjust(HashMap UsedMap, Element response){
322
323 String DocID;;
324 NodeList nList = response.getElementsByTagName("documentNode");
325 int length = nList.getLength();
326
327 for (int j = 0; j <length; j++){
328
329 Node aNode = nList.item(j);
330 NamedNodeMap NodeMap = aNode.getAttributes();
331 Node AttributeNode = NodeMap.item(0);
332 String att_name = AttributeNode.getNodeValue();
333 DocID = att_name;
334
335 NodeList childList = aNode.getChildNodes();
336 Node nNode = childList.item(0);
337 NodeList grandChildList = nNode.getChildNodes();
338
339 int length1 = grandChildList.getLength();
340
341 for(int e = 0; e<length1 ; e++){
342
343 Node xNode = grandChildList.item(e);
344 NamedNodeMap xNodeMap = xNode.getAttributes();
345 Node xAttributeNode = xNodeMap.item(0);
346 String xatt_name = xAttributeNode.getNodeValue();
347 NodeList cList = xNode.getChildNodes();
348 String ActValue = cList.item(0).getNodeValue();
349 char firstchart = xatt_name.charAt(0);
350
351 if(xatt_name.indexOf('^')!=-1){
352 xatt_name = xatt_name.replace('^', '.');
353 }
354
355 if(UsedMap.containsKey(xatt_name)){
356
357 MetadataElement met = (MetadataElement) UsedMap.get(xatt_name);
358 met.IncreaseFrequency();
359 HashMap metadataMap = met.getMetadataList();
360
361 if(metadataMap.containsKey(DocID)){
362 DocumentInfo dc = (DocumentInfo)metadataMap.get(DocID);
363 dc.IncreaseFrequence();
364 dc.addActualValue(ActValue);
365 metadataMap.put(DocID, dc);
366 }
367 else{
368 DocumentInfo dc = new DocumentInfo();
369 dc.IncreaseFrequence();
370 dc.setDocumentID(DocID);
371 dc.addActualValue(ActValue);
372 metadataMap.put(DocID, dc);
373 }
374
375 HashMap valueMap = met.getValueList();
376
377 if(valueMap.containsKey(ActValue)){
378 Integer f = (Integer)valueMap.get(ActValue);
379 int fx = f.intValue();
380 fx++;
381 valueMap.put(ActValue,new Integer(fx));
382 }
383 else{
384 valueMap.put(ActValue, new Integer(1));
385 }
386
387 }
388 else if( (!UsedMap.containsKey(xatt_name)) && xatt_name.equals("archivedir")){
389
390 String SetAbbr = "ex";
391 String SetName = "extracted";
392 MetadataSet ms = new MetadataSet();
393 ms.setName(SetName);
394 ms.setAbb(SetAbbr);
395
396 MetadataElement me = new MetadataElement();
397 me.setMetadataName(xatt_name);
398 me.IncreaseFrequency();
399 HashMap metadataMap = me.getMetadataList();
400
401 if(metadataMap.containsKey(DocID)){
402 DocumentInfo dc = (DocumentInfo)metadataMap.get(DocID);
403 dc.IncreaseFrequence();
404 dc.addActualValue(ActValue);
405 metadataMap.put(DocID, dc);
406 }
407 else{
408 DocumentInfo dc = new DocumentInfo();
409 dc.IncreaseFrequence();
410 dc.setDocumentID(DocID);
411 dc.addActualValue(ActValue);
412 metadataMap.put(DocID, dc);
413 }
414
415 HashMap valueMap = me.getValueList();
416 if(valueMap.containsKey(ActValue)){
417 Integer f = (Integer)valueMap.get(ActValue);
418 int fx = f.intValue();
419 fx++;
420 valueMap.put(ActValue,new Integer(fx));
421 }
422 else{
423
424 valueMap.put(ActValue, new Integer(1));
425 }
426 if(!metadataNameList.contains(me.getMetadataName())){
427 metadataNameList.add(me.getMetadataName());
428 ms.addIndex(me.getMetadataName());
429 }
430
431 UsedMap.put(me.getMetadataName(), me);
432 if(!MDS_list.containsKey(ms.getName())){
433 MDS_list.put(ms.getName(), ms);}
434 else{
435 MetadataSet msx = (MetadataSet)MDS_list.get(ms.getName());
436 msx.addIndex(me.getMetadataName());
437 MDS_list.put(ms.getName(), msx);
438 }
439 }
440 else if((!UsedMap.containsKey(xatt_name)) && xatt_name.indexOf(".")!=-1 && xatt_name.indexOf("dc.")==-1){
441 int dotLocation = xatt_name.indexOf(".");
442 String SetAbbr = xatt_name.substring(0,dotLocation);
443 String SetName = SetAbbr;
444
445 MetadataSet ms = new MetadataSet();
446 ms.setName(SetName);
447 ms.setAbb(SetAbbr);
448
449 MetadataElement me = new MetadataElement();
450 me.setMetadataName(xatt_name);
451 me.IncreaseFrequency();
452 HashMap metadataMap = me.getMetadataList();
453
454 if(metadataMap.containsKey(DocID)){
455 DocumentInfo dc = (DocumentInfo)metadataMap.get(DocID);
456 dc.IncreaseFrequence();
457 dc.addActualValue(ActValue);
458 metadataMap.put(DocID, dc);
459 }
460 else{
461 DocumentInfo dc = new DocumentInfo();
462 dc.IncreaseFrequence();
463 dc.setDocumentID(DocID);
464 dc.addActualValue(ActValue);
465 metadataMap.put(DocID, dc);
466 }
467
468 HashMap valueMap = me.getValueList();
469 if(valueMap.containsKey(ActValue)){
470 Integer f = (Integer)valueMap.get(ActValue);
471 int fx = f.intValue();
472 fx++;
473 valueMap.put(ActValue,new Integer(fx));
474 }
475 else{
476
477 valueMap.put(ActValue, new Integer(1));
478 }
479 if(!metadataNameList.contains(me.getMetadataName())){
480 metadataNameList.add(me.getMetadataName());
481 ms.addIndex(me.getMetadataName());
482 }
483 UsedMap.put(me.getMetadataName(), me);
484
485 if(!MDS_list.containsKey(ms.getName())){
486 MDS_list.put(ms.getName(), ms);
487 }
488 else{
489 MetadataSet msx = (MetadataSet)MDS_list.get(ms.getName());
490 msx.addIndex(me.getMetadataName());
491 MDS_list.put(ms.getName(), msx);
492 }
493 }
494
495 else if ((!UsedMap.containsKey(xatt_name)) && (xatt_name.indexOf("dc.")==0) && xatt_name.indexOf("dc.Description")!=0){
496
497 String SetName = "dublin";
498 xatt_name = xatt_name.replace('^', '.');
499
500 MetadataElement me = new MetadataElement();
501 me.setMetadataName(xatt_name);
502 me.IncreaseFrequency();
503 HashMap metadataMap = me.getMetadataList();
504
505 DocumentInfo dc = new DocumentInfo();
506 dc.IncreaseFrequence();
507 dc.setDocumentID(DocID);
508 dc.addActualValue(ActValue);
509 metadataMap.put(DocID, dc);
510
511 HashMap valueMap = me.getValueList();
512 valueMap.put(ActValue, new Integer(1));
513
514 if(!metadataNameList.contains(me.getMetadataName())){
515 metadataNameList.add(me.getMetadataName());
516 }
517
518 UsedMap.put(me.getMetadataName(), me);
519 MetadataSet msx = (MetadataSet)MDS_list.get(SetName);
520 msx.addIndex(me.getMetadataName());
521 MDS_list.put(SetName, msx);
522 }
523 }// element
524
525 }//doc id
526 ////////////////////
527
528 //write file
529 int counter = 0;
530 Set s = UsedMap.keySet();
531 Iterator is = s.iterator();
532
533 while(is.hasNext()){
534
535 String fileName = (String)is.next();
536 MetadataElement me = (MetadataElement)UsedMap.get(fileName);
537 HashMap hp = me.getMetadataList();
538 Collection ks = hp.values();
539 Iterator iks = ks.iterator();
540
541 try{
542 DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
543 DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
544 Document doc = docBuilder.newDocument();
545 Element root = doc.createElement("root");
546 boolean fileExist = (new File(StatsDirectory+"/"+fileName+".xml")).exists();
547
548 if(!fileExist){
549 root = doc.createElement("root");
550 }
551 else{
552 doc = docBuilder.parse (new File(StatsDirectory+"/"+fileName+".xml"));
553 root = doc.getDocumentElement();
554 }
555
556 while(iks.hasNext()){
557 DocumentInfo dc = (DocumentInfo)iks.next();
558 Element docID = doc.createElement("Document");
559 docID.setAttribute("id", dc.getDocumentID());
560
561 Element freq = doc.createElement("Frequency");
562 Text text = doc.createTextNode(dc.getFrequence()+"");
563 freq.appendChild(text);
564 docID.appendChild(freq);
565
566 Element actValue = doc.createElement("ActualValue");
567 ArrayList alist = dc.getActualValue();
568
569 if(alist.size()==0){
570 text = doc.createTextNode(" ");
571 actValue = doc.createElement("ActualValue");
572 actValue.appendChild(text);
573 docID.appendChild(actValue);
574 }
575
576 for(int i = 0; i<alist.size(); i++){
577 actValue = doc.createElement("ActualValue");
578 String utf8String = new String(((String)alist.get(i)).getBytes(),"UTF-8");
579 Text text1 = doc.createTextNode(utf8String);
580 actValue.appendChild(text1);
581 docID.appendChild(actValue);
582 }
583
584 root.appendChild(docID);
585 docID = null;
586 counter++;
587 }
588
589 TransformerFactory tf= TransformerFactory.newInstance();
590 Transformer transformer= tf.newTransformer();
591 DOMSource source= new DOMSource(root);
592 transformer.setOutputProperty(OutputKeys.INDENT,"yes");
593
594 Writer pwx= new BufferedWriter(new OutputStreamWriter(new FileOutputStream(StatsDirectory+"/"+fileName+".xml"),"UTF-8"));
595 StreamResult result= new StreamResult(pwx);
596 transformer.transform(source,result);
597 pwx.close();
598
599 root = null;
600 docBuilderFactory = null;
601 docBuilder = null;
602 doc = null;
603
604 }catch (Exception e) {
605 System.out.println(e);
606 }
607 }
608 }
609
610 public int getDocNum(){
611 return TotalDoc;
612 }
613
614 public HashMap getMetadataSetMap(){
615 MDS_list.remove("extracted");
616 return (HashMap)MDS_list.clone();
617 }
618
619 public String getCollectionName(){
620 return collection_Name;
621 }
622
623 public void setOAIURL(String url){
624 OAI_URL = url;
625 }
626
627 public String getOAIURL(){
628 return OAI_URL;
629 }
630
631 public String getOaiPrefix(){
632 return oai_Prefix;
633 }
634
635 public ArrayList getMetadataNameList(){
636 return (ArrayList)metadataNameList.clone();
637 }
638 private String constructUpdateMessgae (){
639 String message = "<message><request type='system' to=''><system type='configure' subset=''/></request></message>";
640 return message;
641 }
642}
Note: See TracBrowser for help on using the repository browser.