source: gs3-extensions/mat/trunk/src/src/org/greenstone/mat/MetadataStats.java@ 25025

Last change on this file since 25025 was 25025, checked in by sjm84, 12 years ago

Fixing Mat to work with the new UserContext construct

File size: 18.4 KB
Line 
1package org.greenstone.mat;
2
3import org.w3c.dom.*;
4import javax.xml.parsers.*;
5import javax.xml.transform.*;
6import javax.xml.transform.dom.*;
7import javax.xml.transform.stream.*;
8
9import java.io.BufferedWriter;
10import java.io.File;
11import java.io.FileOutputStream;
12import java.io.OutputStreamWriter;
13import java.io.Writer;
14
15import java.util.ArrayList;
16import java.util.Collection;
17import java.util.HashMap;
18import java.util.Iterator;
19import java.util.Set;
20import java.util.StringTokenizer;
21
22import org.greenstone.gsdl3.core.MessageRouter;
23import org.greenstone.gsdl3.util.GDBMWrapper;
24import org.greenstone.gsdl3.util.GSFile;
25import org.greenstone.gsdl3.util.GSXML;
26import org.greenstone.gsdl3.util.XMLConverter;
27import org.greenstone.util.GlobalProperties;
28import org.greenstone.gsdl3.util.GSPath;
29import org.greenstone.gsdl3.util.UserContext;
30
31import org.w3c.dom.Document;
32import org.w3c.dom.Element;
33import org.w3c.dom.NamedNodeMap;
34import org.w3c.dom.Node;
35import org.w3c.dom.NodeList;
36
37public class MetadataStats {
38
39 private Document doc=null;
40 private MessageRouter mr = null;
41 private XMLConverter converter=null;
42 private GDBMWrapper gdbm_src = null;
43 private GSPath gspath = null;
44 private GlobalProperties globalProperty = null;
45 private String site_name = "localsite";
46 private String DLS_SET = "dls";
47 private String DC_SET = "dublin";
48 private String OAI_URL = "";
49 private boolean status = false;
50 private String oai_Prefix ="";
51
52 String destination = "";
53
54 private int TotalDoc = 0;
55 private HashMap MDS_list = new HashMap();
56 public ArrayList metadataNameList = new ArrayList();
57 public String StatsDirectory;
58 public String HTMLDirectory;
59 private String collection_Name = null;
60 private String collection = null;
61 private final String DBType ="gdbm";
62 private final String fileSeparator = File.separator;
63 protected final String gsdl3Home = null;
64 MetadataElement me;
65
/*
 The constructor connects to the database and retrieves
 information for the collection.
 */
70
71 private static boolean deleteDir(File dir) {
72
73 if (dir.isDirectory()) {
74 String[] children = dir.list();
75 for (int i=0; i<children.length; i++) {
76 boolean success = deleteDir(new File(dir, children[i]));
77 if (!success) {
78 return false;
79 }
80 }
81 }
82 return dir.delete();
83 }
84
85 public MetadataStats(String site_home, String collection,String url,String oaiPrefix){
86
87 OAI_URL = url;
88 collection_Name = collection;
89 oai_Prefix = oaiPrefix;
90
91 try{
92 destination = globalProperty.getGSDL3Home()+fileSeparator+"mat"+fileSeparator+collection+fileSeparator;
93 }catch(Exception ex){
94 ex.printStackTrace();
95 }
96
97 HTMLDirectory = destination;
98 StatsDirectory = HTMLDirectory+"metadataStats"+fileSeparator;
99
100 if(new File(StatsDirectory).exists()){
101 deleteDir(new File(StatsDirectory));
102 }
103 new File(StatsDirectory).mkdirs();
104
105 String gdbm = GSFile.collectionDatabaseFile(site_home,collection,collection,DBType);
106
107 mr = new MessageRouter();
108 mr.setSiteName(this.site_name);
109 mr.configure();
110
111 this.collection = collection;
112 this.gdbm_src = new GDBMWrapper();
113 this.converter = new XMLConverter();
114 this.doc = this.converter.newDOM();
115 this.gspath = new GSPath();
116 this.globalProperty = new GlobalProperties();
117 Setup(gdbm);
118 }
119
120 public void getAvailableMetadataSets(String SetName,HashMap UsedMap, String SetAbbr){
121
122 String gsdl3Home = globalProperty.getGSDL3Home();
123 String os = "linux";
124
125 if(System.getProperty("os.name").toLowerCase().indexOf("windows")!=-1){
126 gsdl3Home = gsdl3Home.replaceAll("\\\\", "/");
127 os = "windows";
128 }
129 String metadataSetHome = gspath.removeLastLink(gsdl3Home)+fileSeparator+"gli"+fileSeparator+"metadata";
130 if(os.equals("windows")){
131 metadataSetHome = metadataSetHome.replaceAll("/", "\\\\");
132 }
133 File metadata_directory = new File(metadataSetHome);
134
135 if (metadata_directory.exists()) {
136
137 File[] directory_files = metadata_directory.listFiles();
138
139 for (int i = 0; i < directory_files.length; i++) {
140 File child_file = directory_files[i];
141
142 if (!child_file.isDirectory() && child_file.getName().endsWith("mds")) {
143 String fileName = child_file.getName();
144
145 if(!MDS_list.containsKey(SetName) && fileName.equals(SetName+".mds")){
146
147 MetadataSet ms = new MetadataSet();
148 ms.setName(SetName);
149 ms.setAbb(SetAbbr);
150 converter.newDOM();
151
152 Document d = converter.getDOM(child_file);
153 NodeList e = d.getElementsByTagName("Element");
154
155 int length = e.getLength();
156
157 for(int y = 0; y<length; y++){
158 Node temp = e.item(y);
159 NamedNodeMap mmp = temp.getAttributes();
160
161 if(!mmp.item(0).getNodeValue().equals("Description")){
162 MetadataElement me = new MetadataElement();
163 me.setMetadataName((SetAbbr+"."+mmp.item(0).getNodeValue()));
164
165 if(!metadataNameList.contains(me.getMetadataName())){
166 metadataNameList.add(me.getMetadataName());
167 ms.addIndex(me.getMetadataName());
168 }
169 UsedMap.put(me.getMetadataName(),me);
170 }
171 }
172 MDS_list.put(ms.getName(), ms);
173 }
174 }
175 }
176 }
177 }
178
179 private void parseMetadata(ArrayList doc_list, String MetadataElement){
180
181 int counter = 0;
182 int docSize = doc_list.size();
183
184 while(true){
185
186 Node message = this.doc.createElement(GSXML.MESSAGE_ELEM);
187 Node request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS,collection+"/DocumentMetadataRetrieve", new UserContext());
188 Node param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
189 param_list.appendChild(GSXML.createParameter(this.doc, "metadata", "all"));
190 Node documentNode_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
191
192 if(docSize>=300){
193 docSize = docSize - 300;
194 for(int i = 0; i<300; i++){
195 Element documentNode = this.doc.createElement(GSXML.DOC_NODE_ELEM);
196 documentNode.setAttribute("nodeID",(String)doc_list.get(counter));
197 documentNode_list.appendChild(documentNode);
198 counter++;
199 }
200
201 request.appendChild(param_list);
202 request.appendChild(documentNode_list);
203 message.appendChild(request);
204 Node response = mr.process(message);
205 getAttributes(response);
206 response = null;
207 request = null;
208 param_list = null;
209 documentNode_list = null;
210 }
211
212 else if(docSize<300){
213 for(int i = 0; i<docSize; i++){
214 Element documentNode = this.doc.createElement(GSXML.DOC_NODE_ELEM);
215 documentNode.setAttribute("nodeID",(String)doc_list.get(counter));
216 documentNode_list.appendChild(documentNode);
217 counter++;
218 }
219
220 request.appendChild(param_list);
221 request.appendChild(documentNode_list);
222 message.appendChild(request);
223 Node response = mr.process(message);
224 getAttributes(response);
225 response = null;
226 request = null;
227 param_list = null;
228 documentNode_list = null;
229 break;
230 }
231 }
232 }
233
234 public boolean getStatus(){
235 return this.status;
236 }
237
238 private void Setup(String gdbm){
239
240 if (!this.gdbm_src.openDatabase(gdbm,GDBMWrapper.READ)) {
241 System.out.println("Could not open GDBM database!");
242 }
243
244 else{
245 String info = this.gdbm_src.getValue("browselist");
246
247 if (info == null) {
248 System.out.println("cannot locate the list");
249 }
250 else{
251 if (info == null) {
252 System.out.println("the db does not contain any info");
253 }
254
255 ArrayList children = new ArrayList();
256 StringTokenizer st = new StringTokenizer(info, ";");
257
258 while (st.hasMoreTokens()) {
259 String part = st.nextToken(";");
260
261 if(part.indexOf("<contains>")!=-1){
262 part = part.replaceAll("<contains>", "");
263 }
264 else if(part.indexOf("<thistype>")!=-1){
265 int location = part.indexOf("<thistype>");
266 part = part.substring(0,location-1);
267 }
268 children.add(part);
269 }
270
271 gdbm_src.closeDatabase();
272 parseMetadata(children,"all");
273 TotalDoc = children.size();
274 System.out.println("Total Doc:"+TotalDoc);
275 status = true;
276 }
277 }
278 }
279
280 public void getAttributes(Node f){
281
282 HashMap UsedMap = new HashMap();
283 Element e = (Element) f;
284 NodeList metadataNode = e.getElementsByTagName("metadata");
285 int length = metadataNode.getLength();
286
287 for (int j = 0; j < length; j++){
288 Node aNode = metadataNode.item(j);
289 NamedNodeMap NodeMap = aNode.getAttributes();
290 Node AttributeNode = NodeMap.item(0);
291 String att_name = AttributeNode.getNodeValue();
292 if(att_name.indexOf("dls.")!=-1){getAvailableMetadataSets(DLS_SET,UsedMap,"dls");}
293 else if(att_name.indexOf("dc.")!=-1){getAvailableMetadataSets(DC_SET,UsedMap,"dc");}
294 }
295
296 adjust(UsedMap,e);
297 UsedMap = null;
298 e = null;
299 System.gc();
300 }
301
302
/*
 * MATCH UP THE ELEMENTS FOUND IN THE DOCUMENTS WITH THOSE OF THE PRE-DEFINED METADATA SETS.
 * ADD AN ELEMENT TO USEDMAP IF IT HAS NOT BEEN DISCOVERED BEFORE.
 * INCREASE THE COUNTER EACH TIME THE ELEMENT IS RECOGNIZED.
 * COUNT HOW MANY TIMES EACH ELEMENT IS USED AND STORE THE RESULT IN THE LIST.
 */
309
310 private void adjust(HashMap UsedMap, Element response){
311
312 String DocID;;
313 NodeList nList = response.getElementsByTagName("documentNode");
314 int length = nList.getLength();
315
316 for (int j = 0; j <length; j++){
317
318 Node aNode = nList.item(j);
319 NamedNodeMap NodeMap = aNode.getAttributes();
320 Node AttributeNode = NodeMap.item(0);
321 String att_name = AttributeNode.getNodeValue();
322 DocID = att_name;
323
324 NodeList childList = aNode.getChildNodes();
325 Node nNode = childList.item(0);
326 NodeList grandChildList = nNode.getChildNodes();
327
328 int length1 = grandChildList.getLength();
329
330 for(int e = 0; e<length1 ; e++){
331
332 Node xNode = grandChildList.item(e);
333 NamedNodeMap xNodeMap = xNode.getAttributes();
334 Node xAttributeNode = xNodeMap.item(0);
335 String xatt_name = xAttributeNode.getNodeValue();
336 NodeList cList = xNode.getChildNodes();
337 String ActValue = cList.item(0).getNodeValue();
338 char firstchart = xatt_name.charAt(0);
339
340 if(xatt_name.indexOf('^')!=-1){
341 xatt_name = xatt_name.replace('^', '.');
342 }
343
344 if(UsedMap.containsKey(xatt_name)){
345
346 MetadataElement met = (MetadataElement) UsedMap.get(xatt_name);
347 met.IncreaseFrequency();
348 HashMap metadataMap = met.getMetadataList();
349
350 if(metadataMap.containsKey(DocID)){
351 DocumentInfo dc = (DocumentInfo)metadataMap.get(DocID);
352 dc.IncreaseFrequence();
353 dc.addActualValue(ActValue);
354 metadataMap.put(DocID, dc);
355 }
356 else{
357 DocumentInfo dc = new DocumentInfo();
358 dc.IncreaseFrequence();
359 dc.setDocumentID(DocID);
360 dc.addActualValue(ActValue);
361 metadataMap.put(DocID, dc);
362 }
363
364 HashMap valueMap = met.getValueList();
365
366 if(valueMap.containsKey(ActValue)){
367 Integer f = (Integer)valueMap.get(ActValue);
368 int fx = f.intValue();
369 fx++;
370 valueMap.put(ActValue,new Integer(fx));
371 }
372 else{
373 valueMap.put(ActValue, new Integer(1));
374 }
375
376 }
377 else if( (!UsedMap.containsKey(xatt_name)) && xatt_name.equals("archivedir")){
378
379 String SetAbbr = "ex";
380 String SetName = "extracted";
381 MetadataSet ms = new MetadataSet();
382 ms.setName(SetName);
383 ms.setAbb(SetAbbr);
384
385 MetadataElement me = new MetadataElement();
386 me.setMetadataName(xatt_name);
387 me.IncreaseFrequency();
388 HashMap metadataMap = me.getMetadataList();
389
390 if(metadataMap.containsKey(DocID)){
391 DocumentInfo dc = (DocumentInfo)metadataMap.get(DocID);
392 dc.IncreaseFrequence();
393 dc.addActualValue(ActValue);
394 metadataMap.put(DocID, dc);
395 }
396 else{
397 DocumentInfo dc = new DocumentInfo();
398 dc.IncreaseFrequence();
399 dc.setDocumentID(DocID);
400 dc.addActualValue(ActValue);
401 metadataMap.put(DocID, dc);
402 }
403
404 HashMap valueMap = me.getValueList();
405 if(valueMap.containsKey(ActValue)){
406 Integer f = (Integer)valueMap.get(ActValue);
407 int fx = f.intValue();
408 fx++;
409 valueMap.put(ActValue,new Integer(fx));
410 }
411 else{
412
413 valueMap.put(ActValue, new Integer(1));
414 }
415 if(!metadataNameList.contains(me.getMetadataName())){
416 metadataNameList.add(me.getMetadataName());
417 ms.addIndex(me.getMetadataName());
418 }
419
420 UsedMap.put(me.getMetadataName(), me);
421 if(!MDS_list.containsKey(ms.getName())){
422 MDS_list.put(ms.getName(), ms);}
423 else{
424 MetadataSet msx = (MetadataSet)MDS_list.get(ms.getName());
425 msx.addIndex(me.getMetadataName());
426 MDS_list.put(ms.getName(), msx);
427 }
428 }
429 else if((!UsedMap.containsKey(xatt_name)) && xatt_name.indexOf(".")!=-1 && xatt_name.indexOf("dc.")==-1){
430 int dotLocation = xatt_name.indexOf(".");
431 String SetAbbr = xatt_name.substring(0,dotLocation);
432 String SetName = SetAbbr;
433
434 MetadataSet ms = new MetadataSet();
435 ms.setName(SetName);
436 ms.setAbb(SetAbbr);
437
438 MetadataElement me = new MetadataElement();
439 me.setMetadataName(xatt_name);
440 me.IncreaseFrequency();
441 HashMap metadataMap = me.getMetadataList();
442
443 if(metadataMap.containsKey(DocID)){
444 DocumentInfo dc = (DocumentInfo)metadataMap.get(DocID);
445 dc.IncreaseFrequence();
446 dc.addActualValue(ActValue);
447 metadataMap.put(DocID, dc);
448 }
449 else{
450 DocumentInfo dc = new DocumentInfo();
451 dc.IncreaseFrequence();
452 dc.setDocumentID(DocID);
453 dc.addActualValue(ActValue);
454 metadataMap.put(DocID, dc);
455 }
456
457 HashMap valueMap = me.getValueList();
458 if(valueMap.containsKey(ActValue)){
459 Integer f = (Integer)valueMap.get(ActValue);
460 int fx = f.intValue();
461 fx++;
462 valueMap.put(ActValue,new Integer(fx));
463 }
464 else{
465
466 valueMap.put(ActValue, new Integer(1));
467 }
468 if(!metadataNameList.contains(me.getMetadataName())){
469 metadataNameList.add(me.getMetadataName());
470 ms.addIndex(me.getMetadataName());
471 }
472 UsedMap.put(me.getMetadataName(), me);
473
474 if(!MDS_list.containsKey(ms.getName())){
475 MDS_list.put(ms.getName(), ms);
476 }
477 else{
478 MetadataSet msx = (MetadataSet)MDS_list.get(ms.getName());
479 msx.addIndex(me.getMetadataName());
480 MDS_list.put(ms.getName(), msx);
481 }
482 }
483
484 else if ((!UsedMap.containsKey(xatt_name)) && (xatt_name.indexOf("dc.")==0) && xatt_name.indexOf("dc.Description")!=0){
485
486 String SetName = "dublin";
487 xatt_name = xatt_name.replace('^', '.');
488
489 MetadataElement me = new MetadataElement();
490 me.setMetadataName(xatt_name);
491 me.IncreaseFrequency();
492 HashMap metadataMap = me.getMetadataList();
493
494 DocumentInfo dc = new DocumentInfo();
495 dc.IncreaseFrequence();
496 dc.setDocumentID(DocID);
497 dc.addActualValue(ActValue);
498 metadataMap.put(DocID, dc);
499
500 HashMap valueMap = me.getValueList();
501 valueMap.put(ActValue, new Integer(1));
502
503 if(!metadataNameList.contains(me.getMetadataName())){
504 metadataNameList.add(me.getMetadataName());
505 }
506
507 UsedMap.put(me.getMetadataName(), me);
508 MetadataSet msx = (MetadataSet)MDS_list.get(SetName);
509 msx.addIndex(me.getMetadataName());
510 MDS_list.put(SetName, msx);
511 }
512 }
513 }
514
515 int counter = 0;
516 Set s = UsedMap.keySet();
517 Iterator is = s.iterator();
518
519 while(is.hasNext()){
520 String fileName = (String)is.next();
521 MetadataElement me = (MetadataElement)UsedMap.get(fileName);
522 HashMap hp = me.getMetadataList();
523 Collection ks = hp.values();
524 Iterator iks = ks.iterator();
525
526 try{
527 DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
528 DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
529 Document doc = docBuilder.newDocument();
530 Element root = doc.createElement("root");
531 boolean fileExist = (new File(StatsDirectory+fileName+".xml")).exists();
532
533 if(!fileExist){
534 root = doc.createElement("root");
535 }
536 else{
537 doc = docBuilder.parse (new File(StatsDirectory+fileName+".xml"));
538 root = doc.getDocumentElement();
539 }
540
541 while(iks.hasNext()){
542 DocumentInfo dc = (DocumentInfo)iks.next();
543 Element docID = doc.createElement("Document");
544 docID.setAttribute("id", dc.getDocumentID());
545
546 Element freq = doc.createElement("Frequency");
547 Text text = doc.createTextNode(dc.getFrequence()+"");
548 freq.appendChild(text);
549 docID.appendChild(freq);
550
551 Element actValue = doc.createElement("ActualValue");
552 ArrayList alist = dc.getActualValue();
553
554 if(alist.size()==0){
555 text = doc.createTextNode(" ");
556 actValue = doc.createElement("ActualValue");
557 actValue.appendChild(text);
558 docID.appendChild(actValue);
559 }
560
561 for(int i = 0; i<alist.size(); i++){
562 actValue = doc.createElement("ActualValue");
563 String utf8String = new String(((String)alist.get(i)).getBytes(),"UTF-8");
564 Text text1 = doc.createTextNode(utf8String);
565 actValue.appendChild(text1);
566 docID.appendChild(actValue);
567 }
568
569 root.appendChild(docID);
570 docID = null;
571 counter++;
572 }
573
574 TransformerFactory tf= TransformerFactory.newInstance();
575 Transformer transformer= tf.newTransformer();
576 DOMSource source= new DOMSource(root);
577 transformer.setOutputProperty(OutputKeys.INDENT,"yes");
578
579 Writer pwx= new BufferedWriter(new OutputStreamWriter(new FileOutputStream(StatsDirectory+fileName+".xml"),"UTF-8"));
580 StreamResult result= new StreamResult(pwx);
581 transformer.transform(source,result);
582 pwx.close();
583
584 root = null;
585 docBuilderFactory = null;
586 docBuilder = null;
587 doc = null;
588
589 }catch (Exception e) {
590 System.out.println(e);
591 }
592 }
593 }
594
595 public int getDocNum(){
596 return TotalDoc;
597 }
598
599 public HashMap getMetadataSetMap(){
600 MDS_list.remove("extracted");
601 return (HashMap)MDS_list.clone();
602 }
603
604 public String getCollectionName(){
605 return collection_Name;
606 }
607
608 public void setOAIURL(String url){
609 OAI_URL = url;
610 }
611
612 public String getOAIURL(){
613 return OAI_URL;
614 }
615
616 public String getOaiPrefix(){
617 return oai_Prefix;
618 }
619
620 public ArrayList getMetadataNameList(){
621 return (ArrayList)metadataNameList.clone();
622 }
623 private String constructUpdateMessgae (){
624 String message = "<message><request type='system' to=''><system type='configure' subset=''/></request></message>";
625 return message;
626 }
627}
Note: See TracBrowser for help on using the repository browser.