source: gs3-extensions/mat/trunk/src/org/greenstone/mat/MetadataStats.java@ 21927

Last change on this file since 21927 was 21927, checked in by sjm84, 14 years ago

Renamed package to org.greenstone.mat from org.greenstone.gsdl3_extension.mat

File size: 18.3 KB
Line 
1package org.greenstone.mat;
2
3import org.w3c.dom.*;
4import javax.xml.parsers.*;
5import javax.xml.transform.*;
6import javax.xml.transform.dom.*;
7import javax.xml.transform.stream.*;
8
9import java.io.BufferedWriter;
10import java.io.File;
11import java.io.FileOutputStream;
12import java.io.OutputStreamWriter;
13import java.io.Writer;
14
15import java.util.ArrayList;
16import java.util.Collection;
17import java.util.HashMap;
18import java.util.Iterator;
19import java.util.Set;
20import java.util.StringTokenizer;
21
22import org.greenstone.gsdl3.core.MessageRouter;
23import org.greenstone.gsdl3.util.GDBMWrapper;
24import org.greenstone.gsdl3.util.GSFile;
25import org.greenstone.gsdl3.util.GSXML;
26import org.greenstone.gsdl3.util.XMLConverter;
27import org.greenstone.gsdl3.util.GlobalProperties;
28import org.greenstone.gsdl3.util.GSPath;
29
30import org.w3c.dom.Document;
31import org.w3c.dom.Element;
32import org.w3c.dom.NamedNodeMap;
33import org.w3c.dom.Node;
34import org.w3c.dom.NodeList;
35
36public class MetadataStats {
37
// DOM document used as the factory for the request messages built in parseMetadata.
38 private Document doc=null;
// Router that processes the DocumentMetadataRetrieve requests.
39 private MessageRouter mr = null;
// Creates new DOM documents and parses the .mds metadata set files.
40 private XMLConverter converter=null;
// Wrapper around the collection database opened in Setup.
41 private GDBMWrapper gdbm_src = null;
// Path helper; used to strip the last link from the GSDL3 home path.
42 private GSPath gspath = null;
// Source of the GSDL3 home path. NOTE(review): assigned near the end of the
// constructor, after the constructor's first use of it.
43 private GlobalProperties globalProperty = null;
// Site name handed to the message router.
44 private String site_name = "localsite";
// Metadata set names passed to getAvailableMetadataSets for "dls." and "dc." elements.
45 private String DLS_SET = "dls";
46 private String DC_SET = "dublin";
// OAI URL as given to the constructor or setOAIURL.
47 private String OAI_URL = "";
// Set to true at the end of a successful Setup.
48 private boolean status = false;
// OAI prefix as given to the constructor.
49 private String oai_Prefix ="";
50
// Output root: <GSDL3 home>/mat/<collection>/ (built in the constructor).
51 String destination = "";
52
// Number of document ids read from the "browselist" database entry.
53 private int TotalDoc = 0;
// Metadata set name -> MetadataSet.
54 private HashMap MDS_list = new HashMap();
// Names of all metadata elements registered so far (no duplicates are added).
55 public ArrayList metadataNameList = new ArrayList();
// Directory that receives one <element name>.xml statistics file per metadata element.
56 public String StatsDirectory;
// Same value as destination.
57 public String HTMLDirectory;
// Both fields hold the collection argument given to the constructor.
58 private String collection_Name = null;
59 private String collection = null;
// Database type passed to GSFile.collectionDatabaseFile.
60 private final String DBType ="gdbm";
61 private final String fileSeparator = File.separator;
// NOTE(review): final and null, and not read anywhere in this file; the local
// of the same name in getAvailableMetadataSets shadows it.
62 protected final String gsdl3Home = null;
// NOTE(review): not referenced in this file; methods declare their own local "me".
63 MetadataElement me;
64
65 /*
66 The constructor connects to the database and retrieves
67 information for the collection
68 */
69
70 private static boolean deleteDir(File dir) {
71
72 if (dir.isDirectory()) {
73 String[] children = dir.list();
74 for (int i=0; i<children.length; i++) {
75 boolean success = deleteDir(new File(dir, children[i]));
76 if (!success) {
77 return false;
78 }
79 }
80 }
81 return dir.delete();
82 }
83
84 public MetadataStats(String site_home, String collection,String url,String oaiPrefix){
85
86 OAI_URL = url;
87 collection_Name = collection;
88 oai_Prefix = oaiPrefix;
89
90 try{
91 destination = globalProperty.getGSDL3Home()+fileSeparator+"mat"+fileSeparator+collection+fileSeparator;
92 }catch(Exception ex){
93 ex.printStackTrace();
94 }
95
96 HTMLDirectory = destination;
97 StatsDirectory = HTMLDirectory+"metadataStats"+fileSeparator;
98
99 if(new File(StatsDirectory).exists()){
100 deleteDir(new File(StatsDirectory));
101 }
102 new File(StatsDirectory).mkdirs();
103
104 String gdbm = GSFile.collectionDatabaseFile(site_home,collection,collection,DBType);
105
106 mr = new MessageRouter();
107 mr.setSiteName(this.site_name);
108 mr.configure();
109
110 this.collection = collection;
111 this.gdbm_src = new GDBMWrapper();
112 this.converter = new XMLConverter();
113 this.doc = this.converter.newDOM();
114 this.gspath = new GSPath();
115 this.globalProperty = new GlobalProperties();
116 Setup(gdbm);
117 }
118
119 public void getAvailableMetadataSets(String SetName,HashMap UsedMap, String SetAbbr){
120
121 String gsdl3Home = globalProperty.getGSDL3Home();
122 String os = "linux";
123
124 if(System.getProperty("os.name").toLowerCase().indexOf("windows")!=-1){
125 gsdl3Home = gsdl3Home.replaceAll("\\\\", "/");
126 os = "windows";
127 }
128 String metadataSetHome = gspath.removeLastLink(gsdl3Home)+fileSeparator+"gli"+fileSeparator+"metadata";
129 if(os.equals("windows")){
130 metadataSetHome = metadataSetHome.replaceAll("/", "\\\\");
131 }
132 File metadata_directory = new File(metadataSetHome);
133
134 if (metadata_directory.exists()) {
135
136 File[] directory_files = metadata_directory.listFiles();
137
138 for (int i = 0; i < directory_files.length; i++) {
139 File child_file = directory_files[i];
140
141 if (!child_file.isDirectory() && child_file.getName().endsWith("mds")) {
142 String fileName = child_file.getName();
143
144 if(!MDS_list.containsKey(SetName) && fileName.equals(SetName+".mds")){
145
146 MetadataSet ms = new MetadataSet();
147 ms.setName(SetName);
148 ms.setAbb(SetAbbr);
149 converter.newDOM();
150
151 Document d = converter.getDOM(child_file);
152 NodeList e = d.getElementsByTagName("Element");
153
154 int length = e.getLength();
155
156 for(int y = 0; y<length; y++){
157 Node temp = e.item(y);
158 NamedNodeMap mmp = temp.getAttributes();
159
160 if(!mmp.item(0).getNodeValue().equals("Description")){
161 MetadataElement me = new MetadataElement();
162 me.setMetadataName((SetAbbr+"."+mmp.item(0).getNodeValue()));
163
164 if(!metadataNameList.contains(me.getMetadataName())){
165 metadataNameList.add(me.getMetadataName());
166 ms.addIndex(me.getMetadataName());
167 }
168 UsedMap.put(me.getMetadataName(),me);
169 }
170 }
171 MDS_list.put(ms.getName(), ms);
172 }
173 }
174 }
175 }
176 }
177
178 private void parseMetadata(ArrayList doc_list, String MetadataElement){
179
180 int counter = 0;
181 int docSize = doc_list.size();
182
183 while(true){
184
185 Node message = this.doc.createElement(GSXML.MESSAGE_ELEM);
186 Node request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS,collection+"/DocumentMetadataRetrieve","en", "");
187 Node param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
188 param_list.appendChild(GSXML.createParameter(this.doc, "metadata", "all"));
189 Node documentNode_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
190
191 if(docSize>=300){
192 docSize = docSize - 300;
193 for(int i = 0; i<300; i++){
194 Element documentNode = this.doc.createElement(GSXML.DOC_NODE_ELEM);
195 documentNode.setAttribute("nodeID",(String)doc_list.get(counter));
196 documentNode_list.appendChild(documentNode);
197 counter++;
198 }
199
200 request.appendChild(param_list);
201 request.appendChild(documentNode_list);
202 message.appendChild(request);
203 Node response = mr.process(message);
204 getAttributes(response);
205 response = null;
206 request = null;
207 param_list = null;
208 documentNode_list = null;
209 }
210
211 else if(docSize<300){
212 for(int i = 0; i<docSize; i++){
213 Element documentNode = this.doc.createElement(GSXML.DOC_NODE_ELEM);
214 documentNode.setAttribute("nodeID",(String)doc_list.get(counter));
215 documentNode_list.appendChild(documentNode);
216 counter++;
217 }
218
219 request.appendChild(param_list);
220 request.appendChild(documentNode_list);
221 message.appendChild(request);
222 Node response = mr.process(message);
223 getAttributes(response);
224 response = null;
225 request = null;
226 param_list = null;
227 documentNode_list = null;
228 break;
229 }
230 }
231 }
232
233 public boolean getStatus(){
234 return this.status;
235 }
236
237 private void Setup(String gdbm){
238
239 if (!this.gdbm_src.openDatabase(gdbm,GDBMWrapper.READ)) {
240 System.out.println("Could not open GDBM database!");
241 }
242
243 else{
244 String info = this.gdbm_src.getValue("browselist");
245
246 if (info == null) {
247 System.out.println("cannot locate the list");
248 }
249 else{
250 if (info == null) {
251 System.out.println("the db does not contain any info");
252 }
253
254 ArrayList children = new ArrayList();
255 StringTokenizer st = new StringTokenizer(info, ";");
256
257 while (st.hasMoreTokens()) {
258 String part = st.nextToken(";");
259
260 if(part.indexOf("<contains>")!=-1){
261 part = part.replaceAll("<contains>", "");
262 }
263 else if(part.indexOf("<thistype>")!=-1){
264 int location = part.indexOf("<thistype>");
265 part = part.substring(0,location-1);
266 }
267 children.add(part);
268 }
269
270 gdbm_src.closeDatabase();
271 parseMetadata(children,"all");
272 TotalDoc = children.size();
273 System.out.println("Total Doc:"+TotalDoc);
274 status = true;
275 }
276 }
277 }
278
279 public void getAttributes(Node f){
280
281 HashMap UsedMap = new HashMap();
282 Element e = (Element) f;
283 NodeList metadataNode = e.getElementsByTagName("metadata");
284 int length = metadataNode.getLength();
285
286 for (int j = 0; j < length; j++){
287 Node aNode = metadataNode.item(j);
288 NamedNodeMap NodeMap = aNode.getAttributes();
289 Node AttributeNode = NodeMap.item(0);
290 String att_name = AttributeNode.getNodeValue();
291 if(att_name.indexOf("dls.")!=-1){getAvailableMetadataSets(DLS_SET,UsedMap,"dls");}
292 else if(att_name.indexOf("dc.")!=-1){getAvailableMetadataSets(DC_SET,UsedMap,"dc");}
293 }
294
295 adjust(UsedMap,e);
296 UsedMap = null;
297 e = null;
298 System.gc();
299 }
300
301
302 /*
303 * MATCH UP THE ELEMENTS FROM BOTH DOCUMENTS AND PRE-DEFINED METADATA SET
304 * MOVE THE ELEMENT FROM NOUSEDMAP TO USEDMAP IF THE ELEMENT HAS NOT BEEN DISCOVERED BEFORE
305 * INCREASE THE COUNTER ONCE THE ELEMENT IS RECOGNIZED
306 * CALCULATE THE TIMES OF ELEMENTS USED AND STORE IT INTO THE LIST
307 */
308
309 private void adjust(HashMap UsedMap, Element response){
310
311 String DocID;;
312 NodeList nList = response.getElementsByTagName("documentNode");
313 int length = nList.getLength();
314
315 for (int j = 0; j <length; j++){
316
317 Node aNode = nList.item(j);
318 NamedNodeMap NodeMap = aNode.getAttributes();
319 Node AttributeNode = NodeMap.item(0);
320 String att_name = AttributeNode.getNodeValue();
321 DocID = att_name;
322
323 NodeList childList = aNode.getChildNodes();
324 Node nNode = childList.item(0);
325 NodeList grandChildList = nNode.getChildNodes();
326
327 int length1 = grandChildList.getLength();
328
329 for(int e = 0; e<length1 ; e++){
330
331 Node xNode = grandChildList.item(e);
332 NamedNodeMap xNodeMap = xNode.getAttributes();
333 Node xAttributeNode = xNodeMap.item(0);
334 String xatt_name = xAttributeNode.getNodeValue();
335 NodeList cList = xNode.getChildNodes();
336 String ActValue = cList.item(0).getNodeValue();
337 char firstchart = xatt_name.charAt(0);
338
339 if(xatt_name.indexOf('^')!=-1){
340 xatt_name = xatt_name.replace('^', '.');
341 }
342
343 if(UsedMap.containsKey(xatt_name)){
344
345 MetadataElement met = (MetadataElement) UsedMap.get(xatt_name);
346 met.IncreaseFrequency();
347 HashMap metadataMap = met.getMetadataList();
348
349 if(metadataMap.containsKey(DocID)){
350 DocumentInfo dc = (DocumentInfo)metadataMap.get(DocID);
351 dc.IncreaseFrequence();
352 dc.addActualValue(ActValue);
353 metadataMap.put(DocID, dc);
354 }
355 else{
356 DocumentInfo dc = new DocumentInfo();
357 dc.IncreaseFrequence();
358 dc.setDocumentID(DocID);
359 dc.addActualValue(ActValue);
360 metadataMap.put(DocID, dc);
361 }
362
363 HashMap valueMap = met.getValueList();
364
365 if(valueMap.containsKey(ActValue)){
366 Integer f = (Integer)valueMap.get(ActValue);
367 int fx = f.intValue();
368 fx++;
369 valueMap.put(ActValue,new Integer(fx));
370 }
371 else{
372 valueMap.put(ActValue, new Integer(1));
373 }
374
375 }
376 else if( (!UsedMap.containsKey(xatt_name)) && xatt_name.equals("archivedir")){
377
378 String SetAbbr = "ex";
379 String SetName = "extracted";
380 MetadataSet ms = new MetadataSet();
381 ms.setName(SetName);
382 ms.setAbb(SetAbbr);
383
384 MetadataElement me = new MetadataElement();
385 me.setMetadataName(xatt_name);
386 me.IncreaseFrequency();
387 HashMap metadataMap = me.getMetadataList();
388
389 if(metadataMap.containsKey(DocID)){
390 DocumentInfo dc = (DocumentInfo)metadataMap.get(DocID);
391 dc.IncreaseFrequence();
392 dc.addActualValue(ActValue);
393 metadataMap.put(DocID, dc);
394 }
395 else{
396 DocumentInfo dc = new DocumentInfo();
397 dc.IncreaseFrequence();
398 dc.setDocumentID(DocID);
399 dc.addActualValue(ActValue);
400 metadataMap.put(DocID, dc);
401 }
402
403 HashMap valueMap = me.getValueList();
404 if(valueMap.containsKey(ActValue)){
405 Integer f = (Integer)valueMap.get(ActValue);
406 int fx = f.intValue();
407 fx++;
408 valueMap.put(ActValue,new Integer(fx));
409 }
410 else{
411
412 valueMap.put(ActValue, new Integer(1));
413 }
414 if(!metadataNameList.contains(me.getMetadataName())){
415 metadataNameList.add(me.getMetadataName());
416 ms.addIndex(me.getMetadataName());
417 }
418
419 UsedMap.put(me.getMetadataName(), me);
420 if(!MDS_list.containsKey(ms.getName())){
421 MDS_list.put(ms.getName(), ms);}
422 else{
423 MetadataSet msx = (MetadataSet)MDS_list.get(ms.getName());
424 msx.addIndex(me.getMetadataName());
425 MDS_list.put(ms.getName(), msx);
426 }
427 }
428 else if((!UsedMap.containsKey(xatt_name)) && xatt_name.indexOf(".")!=-1 && xatt_name.indexOf("dc.")==-1){
429 int dotLocation = xatt_name.indexOf(".");
430 String SetAbbr = xatt_name.substring(0,dotLocation);
431 String SetName = SetAbbr;
432
433 MetadataSet ms = new MetadataSet();
434 ms.setName(SetName);
435 ms.setAbb(SetAbbr);
436
437 MetadataElement me = new MetadataElement();
438 me.setMetadataName(xatt_name);
439 me.IncreaseFrequency();
440 HashMap metadataMap = me.getMetadataList();
441
442 if(metadataMap.containsKey(DocID)){
443 DocumentInfo dc = (DocumentInfo)metadataMap.get(DocID);
444 dc.IncreaseFrequence();
445 dc.addActualValue(ActValue);
446 metadataMap.put(DocID, dc);
447 }
448 else{
449 DocumentInfo dc = new DocumentInfo();
450 dc.IncreaseFrequence();
451 dc.setDocumentID(DocID);
452 dc.addActualValue(ActValue);
453 metadataMap.put(DocID, dc);
454 }
455
456 HashMap valueMap = me.getValueList();
457 if(valueMap.containsKey(ActValue)){
458 Integer f = (Integer)valueMap.get(ActValue);
459 int fx = f.intValue();
460 fx++;
461 valueMap.put(ActValue,new Integer(fx));
462 }
463 else{
464
465 valueMap.put(ActValue, new Integer(1));
466 }
467 if(!metadataNameList.contains(me.getMetadataName())){
468 metadataNameList.add(me.getMetadataName());
469 ms.addIndex(me.getMetadataName());
470 }
471 UsedMap.put(me.getMetadataName(), me);
472
473 if(!MDS_list.containsKey(ms.getName())){
474 MDS_list.put(ms.getName(), ms);
475 }
476 else{
477 MetadataSet msx = (MetadataSet)MDS_list.get(ms.getName());
478 msx.addIndex(me.getMetadataName());
479 MDS_list.put(ms.getName(), msx);
480 }
481 }
482
483 else if ((!UsedMap.containsKey(xatt_name)) && (xatt_name.indexOf("dc.")==0) && xatt_name.indexOf("dc.Description")!=0){
484
485 String SetName = "dublin";
486 xatt_name = xatt_name.replace('^', '.');
487
488 MetadataElement me = new MetadataElement();
489 me.setMetadataName(xatt_name);
490 me.IncreaseFrequency();
491 HashMap metadataMap = me.getMetadataList();
492
493 DocumentInfo dc = new DocumentInfo();
494 dc.IncreaseFrequence();
495 dc.setDocumentID(DocID);
496 dc.addActualValue(ActValue);
497 metadataMap.put(DocID, dc);
498
499 HashMap valueMap = me.getValueList();
500 valueMap.put(ActValue, new Integer(1));
501
502 if(!metadataNameList.contains(me.getMetadataName())){
503 metadataNameList.add(me.getMetadataName());
504 }
505
506 UsedMap.put(me.getMetadataName(), me);
507 MetadataSet msx = (MetadataSet)MDS_list.get(SetName);
508 msx.addIndex(me.getMetadataName());
509 MDS_list.put(SetName, msx);
510 }
511 }
512 }
513
514 int counter = 0;
515 Set s = UsedMap.keySet();
516 Iterator is = s.iterator();
517
518 while(is.hasNext()){
519 String fileName = (String)is.next();
520 MetadataElement me = (MetadataElement)UsedMap.get(fileName);
521 HashMap hp = me.getMetadataList();
522 Collection ks = hp.values();
523 Iterator iks = ks.iterator();
524
525 try{
526 DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
527 DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
528 Document doc = docBuilder.newDocument();
529 Element root = doc.createElement("root");
530 boolean fileExist = (new File(StatsDirectory+fileName+".xml")).exists();
531
532 if(!fileExist){
533 root = doc.createElement("root");
534 }
535 else{
536 doc = docBuilder.parse (new File(StatsDirectory+fileName+".xml"));
537 root = doc.getDocumentElement();
538 }
539
540 while(iks.hasNext()){
541 DocumentInfo dc = (DocumentInfo)iks.next();
542 Element docID = doc.createElement("Document");
543 docID.setAttribute("id", dc.getDocumentID());
544
545 Element freq = doc.createElement("Frequency");
546 Text text = doc.createTextNode(dc.getFrequence()+"");
547 freq.appendChild(text);
548 docID.appendChild(freq);
549
550 Element actValue = doc.createElement("ActualValue");
551 ArrayList alist = dc.getActualValue();
552
553 if(alist.size()==0){
554 text = doc.createTextNode(" ");
555 actValue = doc.createElement("ActualValue");
556 actValue.appendChild(text);
557 docID.appendChild(actValue);
558 }
559
560 for(int i = 0; i<alist.size(); i++){
561 actValue = doc.createElement("ActualValue");
562 String utf8String = new String(((String)alist.get(i)).getBytes(),"UTF-8");
563 Text text1 = doc.createTextNode(utf8String);
564 actValue.appendChild(text1);
565 docID.appendChild(actValue);
566 }
567
568 root.appendChild(docID);
569 docID = null;
570 counter++;
571 }
572
573 TransformerFactory tf= TransformerFactory.newInstance();
574 Transformer transformer= tf.newTransformer();
575 DOMSource source= new DOMSource(root);
576 transformer.setOutputProperty(OutputKeys.INDENT,"yes");
577
578 Writer pwx= new BufferedWriter(new OutputStreamWriter(new FileOutputStream(StatsDirectory+fileName+".xml"),"UTF-8"));
579 StreamResult result= new StreamResult(pwx);
580 transformer.transform(source,result);
581 pwx.close();
582
583 root = null;
584 docBuilderFactory = null;
585 docBuilder = null;
586 doc = null;
587
588 }catch (Exception e) {
589 System.out.println(e);
590 }
591 }
592 }
593
594 public int getDocNum(){
595 return TotalDoc;
596 }
597
598 public HashMap getMetadataSetMap(){
599 MDS_list.remove("extracted");
600 return (HashMap)MDS_list.clone();
601 }
602
603 public String getCollectionName(){
604 return collection_Name;
605 }
606
607 public void setOAIURL(String url){
608 OAI_URL = url;
609 }
610
611 public String getOAIURL(){
612 return OAI_URL;
613 }
614
615 public String getOaiPrefix(){
616 return oai_Prefix;
617 }
618
619 public ArrayList getMetadataNameList(){
620 return (ArrayList)metadataNameList.clone();
621 }
622 private String constructUpdateMessgae (){
623 String message = "<message><request type='system' to=''><system type='configure' subset=''/></request></message>";
624 return message;
625 }
626}
Note: See TracBrowser for help on using the repository browser.