source: other-projects/trunk/greenstone3-extension/mat/src/org/greenstone/gsdl3_extension/mat/MetadataStats.java@ 18000

Last change on this file since 18000 was 18000, checked in by cc108, 15 years ago

new source code

File size: 18.3 KB
Line 
1package org.greenstone.gsdl3_extension.mat;
2
3import org.w3c.dom.*;
4import javax.xml.parsers.*;
5import javax.xml.transform.*;
6import javax.xml.transform.dom.*;
7import javax.xml.transform.stream.*;
8
9import java.io.BufferedWriter;
10import java.io.File;
11import java.io.FileOutputStream;
12import java.io.OutputStreamWriter;
13import java.io.Writer;
14
15import java.util.ArrayList;
16import java.util.Collection;
17import java.util.HashMap;
18import java.util.Iterator;
19import java.util.Set;
20import java.util.StringTokenizer;
21
22import org.greenstone.gsdl3.core.MessageRouter;
23import org.greenstone.gsdl3.util.GDBMWrapper;
24import org.greenstone.gsdl3.util.GSFile;
25import org.greenstone.gsdl3.util.GSXML;
26import org.greenstone.gsdl3.util.XMLConverter;
27import org.greenstone.gsdl3.util.GlobalProperties;
28import org.greenstone.gsdl3.util.GSPath;
29
30import org.w3c.dom.Document;
31import org.w3c.dom.Element;
32import org.w3c.dom.NamedNodeMap;
33import org.w3c.dom.Node;
34import org.w3c.dom.NodeList;
35
36public class MetadataStats {
37
38 private Document doc=null;
39 private MessageRouter mr = null;
40 private XMLConverter converter=null;
41 private GDBMWrapper gdbm_src = null;
42 private GSPath gspath = null;
43 private GlobalProperties globalProperty = null;
44 private String site_name = "localsite";
45 private String DLS_SET = "dls";
46 private String DC_SET = "dublin";
47 private String OAI_URL = "";
48 private boolean status = false;
49 private String oai_Prefix ="";
50
51 String destination = "";
52
53 private int TotalDoc = 0;
54 private HashMap MDS_list = new HashMap();
55 public ArrayList metadataNameList = new ArrayList();
56 public String StatsDirectory;
57 public String HTMLDirectory;
58 private String collection_Name = null;
59 private String collection = null;
60 private final String DBType ="gdbm";
61 private final String fileSeparator = File.separator;
62 protected final String gsdl3Home = null;
63 MetadataElement me;
64
65 /*
66 The constructor connects to the database and retrieve
67 information for the collection
68 */
69
70 private static boolean deleteDir(File dir) {
71
72 if (dir.isDirectory()) {
73 String[] children = dir.list();
74 for (int i=0; i<children.length; i++) {
75 boolean success = deleteDir(new File(dir, children[i]));
76 if (!success) {
77 return false;
78 }
79 }
80 }
81 return dir.delete();
82 }
83
84 public MetadataStats(String site_home, String collection,String url,String oaiPrefix){
85
86 OAI_URL = url;
87 collection_Name = collection;
88 oai_Prefix = oaiPrefix;
89
90 try{
91 destination = globalProperty.getGSDL3Home()+fileSeparator+"mat"+fileSeparator+collection+fileSeparator;
92 }catch(Exception ex){
93 ex.printStackTrace();
94 }
95
96 HTMLDirectory = destination;
97 StatsDirectory = HTMLDirectory+"metadataStats"+fileSeparator;
98
99 if(new File(StatsDirectory).exists()){
100 deleteDir(new File(StatsDirectory));
101 }
102 new File(StatsDirectory).mkdirs();
103
104 String gdbm = GSFile.collectionDatabaseFile(site_home,collection,collection,DBType);
105
106 mr = new MessageRouter();
107 mr.setSiteName(this.site_name);
108 mr.configure();
109
110 this.collection = collection;
111 this.gdbm_src = new GDBMWrapper();
112 this.converter = new XMLConverter();
113 this.doc = this.converter.newDOM();
114 this.gspath = new GSPath();
115 this.globalProperty = new GlobalProperties();
116 Setup(gdbm);
117 }
118
119 public void getAvailableMetadataSets(String SetName,HashMap UsedMap, String SetAbbr){
120
121 String gsdl3Home = globalProperty.getGSDL3Home();
122 String os = "linux";
123 if(fileSeparator.equals("\\")){
124 gsdl3Home = gsdl3Home.replace("\\", "/");
125 os = "windows";
126 }
127 String metadataSetHome = gspath.removeLastLink(gsdl3Home)+fileSeparator+"gli"+fileSeparator+"metadata";
128 if(os.equals("windows")){
129 metadataSetHome = metadataSetHome.replace("/", "\\");
130 }
131 File metadata_directory = new File(metadataSetHome);
132
133 if (metadata_directory.exists()) {
134
135 File[] directory_files = metadata_directory.listFiles();
136
137 for (int i = 0; i < directory_files.length; i++) {
138 File child_file = directory_files[i];
139
140 if (!child_file.isDirectory() && child_file.getName().endsWith("mds")) {
141 String fileName = child_file.getName();
142
143 if(!MDS_list.containsKey(SetName) && fileName.equals(SetName+".mds")){
144
145 MetadataSet ms = new MetadataSet();
146 ms.setName(SetName);
147 ms.setAbb(SetAbbr);
148 converter.newDOM();
149
150 Document d = converter.getDOM(child_file);
151 NodeList e = d.getElementsByTagName("Element");
152
153 int length = e.getLength();
154
155 for(int y = 0; y<length; y++){
156 Node temp = e.item(y);
157 NamedNodeMap mmp = temp.getAttributes();
158
159 if(!mmp.item(0).getNodeValue().equals("Description")){
160 MetadataElement me = new MetadataElement();
161 me.setMetadataName((SetAbbr+"."+mmp.item(0).getNodeValue()));
162
163 if(!metadataNameList.contains(me.getMetadataName())){
164 metadataNameList.add(me.getMetadataName());
165 ms.addIndex(me.getMetadataName());
166 }
167 UsedMap.put(me.getMetadataName(),me);
168 }
169 }
170 MDS_list.put(ms.getName(), ms);
171 }
172 }
173 }
174 }
175 }
176
177 private void parseMetadata(ArrayList doc_list, String MetadataElement){
178
179 int counter = 0;
180 int docSize = doc_list.size();
181
182 while(true){
183
184 Node message = this.doc.createElement(GSXML.MESSAGE_ELEM);
185 Node request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS,collection+"/DocumentMetadataRetrieve","en", "");
186 Node param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
187 param_list.appendChild(GSXML.createParameter(this.doc, "metadata", "all"));
188 Node documentNode_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
189
190 if(docSize>=300){
191 docSize = docSize - 300;
192 for(int i = 0; i<300; i++){
193 Element documentNode = this.doc.createElement(GSXML.DOC_NODE_ELEM);
194 documentNode.setAttribute("nodeID",(String)doc_list.get(counter));
195 documentNode_list.appendChild(documentNode);
196 counter++;
197 }
198
199 request.appendChild(param_list);
200 request.appendChild(documentNode_list);
201 message.appendChild(request);
202 Node response = mr.process(message);
203 getAttributes(response);
204 response = null;
205 request = null;
206 param_list = null;
207 documentNode_list = null;
208 }
209
210 else if(docSize<300){
211 for(int i = 0; i<docSize; i++){
212 Element documentNode = this.doc.createElement(GSXML.DOC_NODE_ELEM);
213 documentNode.setAttribute("nodeID",(String)doc_list.get(counter));
214 documentNode_list.appendChild(documentNode);
215 counter++;
216 }
217
218 request.appendChild(param_list);
219 request.appendChild(documentNode_list);
220 message.appendChild(request);
221 Node response = mr.process(message);
222 getAttributes(response);
223 response = null;
224 request = null;
225 param_list = null;
226 documentNode_list = null;
227 break;
228 }
229 }
230 }
231
232 public boolean getStatus(){
233 return this.status;
234 }
235
236 private void Setup(String gdbm){
237
238 if (!this.gdbm_src.openDatabase(gdbm,GDBMWrapper.READ)) {
239 System.out.println("Could not open GDBM database!");
240 }
241
242 else{
243 String info = this.gdbm_src.getValue("browselist");
244
245 if (info == null) {
246 System.out.println("cannot locate the list");
247 }
248 else{
249 if (info == null) {
250 System.out.println("the db does not contain any info");
251 }
252
253 ArrayList children = new ArrayList();
254 StringTokenizer st = new StringTokenizer(info, ";");
255
256 while (st.hasMoreTokens()) {
257 String part = st.nextToken(";");
258
259 if(part.contains("<contains>")){
260 part = part.replace("<contains>", "");
261 }
262 else if(part.contains("<thistype>")){
263 int location = part.indexOf("<thistype>");
264 part = part.substring(0,location-1);
265 }
266 children.add(part);
267 }
268
269 gdbm_src.closeDatabase();
270 parseMetadata(children,"all");
271 TotalDoc = children.size();
272 System.out.println("Total Doc:"+TotalDoc);
273 status = true;
274 }
275 }
276 }
277
278 public void getAttributes(Node f){
279
280 HashMap UsedMap = new HashMap();
281 Element e = (Element) f;
282 NodeList metadataNode = e.getElementsByTagName("metadata");
283 int length = metadataNode.getLength();
284
285 for (int j = 0; j < length; j++){
286 Node aNode = metadataNode.item(j);
287 NamedNodeMap NodeMap = aNode.getAttributes();
288 Node AttributeNode = NodeMap.item(0);
289 String att_name = AttributeNode.getNodeValue();
290 if(att_name.indexOf("dls.")!=-1){getAvailableMetadataSets(DLS_SET,UsedMap,"dls");}
291 else if(att_name.indexOf("dc.")!=-1){getAvailableMetadataSets(DC_SET,UsedMap,"dc");}
292 }
293
294 adjust(UsedMap,e);
295 UsedMap = null;
296 e = null;
297 System.gc();
298 }
299
300
301 /*
302 * MATCH UP THE ELEMENTS FROM BOTH DOCUMENTS AND PRE-DEFINED METADATA SET
303 * MOVE THE ELEMENT FROM NOUSEDMAP TO USEDMAP IF THE ELEMENT HAS NOT BEEN DISCOVERED BEFORE
304 * INCREASE THE COUNTER ONCE THE ELEMENT IS RECONGNIZED
305 * CALCULATE THE TIMES OF ELEMENTS USED AND STORE IT INTO THE LIST
306 */
307
308 private void adjust(HashMap UsedMap, Element response){
309
310 String DocID;;
311 NodeList nList = response.getElementsByTagName("documentNode");
312 int length = nList.getLength();
313
314 for (int j = 0; j <length; j++){
315
316 Node aNode = nList.item(j);
317 NamedNodeMap NodeMap = aNode.getAttributes();
318 Node AttributeNode = NodeMap.item(0);
319 String att_name = AttributeNode.getNodeValue();
320 DocID = att_name;
321
322 NodeList childList = aNode.getChildNodes();
323 Node nNode = childList.item(0);
324 NodeList grandChildList = nNode.getChildNodes();
325
326 int length1 = grandChildList.getLength();
327
328 for(int e = 0; e<length1 ; e++){
329
330 Node xNode = grandChildList.item(e);
331 NamedNodeMap xNodeMap = xNode.getAttributes();
332 Node xAttributeNode = xNodeMap.item(0);
333 String xatt_name = xAttributeNode.getNodeValue();
334 NodeList cList = xNode.getChildNodes();
335 String ActValue = cList.item(0).getNodeValue();
336 char firstchart = xatt_name.charAt(0);
337
338 if(xatt_name.indexOf('^')!=-1){
339 xatt_name = xatt_name.replace('^', '.');
340 }
341
342 if(UsedMap.containsKey(xatt_name)){
343
344 MetadataElement met = (MetadataElement) UsedMap.get(xatt_name);
345 met.IncreaseFrequency();
346 HashMap metadataMap = met.getMetadataList();
347
348 if(metadataMap.containsKey(DocID)){
349 DocumentInfo dc = (DocumentInfo)metadataMap.get(DocID);
350 dc.IncreaseFrequence();
351 dc.addActualValue(ActValue);
352 metadataMap.put(DocID, dc);
353 }
354 else{
355 DocumentInfo dc = new DocumentInfo();
356 dc.IncreaseFrequence();
357 dc.setDocumentID(DocID);
358 dc.addActualValue(ActValue);
359 metadataMap.put(DocID, dc);
360 }
361
362 HashMap valueMap = met.getValueList();
363
364 if(valueMap.containsKey(ActValue)){
365 Integer f = (Integer)valueMap.get(ActValue);
366 int fx = f.intValue();
367 fx++;
368 valueMap.put(ActValue,new Integer(fx));
369 }
370 else{
371 valueMap.put(ActValue, new Integer(1));
372 }
373
374 }
375 else if( (!UsedMap.containsKey(xatt_name)) && xatt_name.equals("archivedir")){
376
377 String SetAbbr = "ex";
378 String SetName = "extracted";
379 MetadataSet ms = new MetadataSet();
380 ms.setName(SetName);
381 ms.setAbb(SetAbbr);
382
383 MetadataElement me = new MetadataElement();
384 me.setMetadataName(xatt_name);
385 me.IncreaseFrequency();
386 HashMap metadataMap = me.getMetadataList();
387
388 if(metadataMap.containsKey(DocID)){
389 DocumentInfo dc = (DocumentInfo)metadataMap.get(DocID);
390 dc.IncreaseFrequence();
391 dc.addActualValue(ActValue);
392 metadataMap.put(DocID, dc);
393 }
394 else{
395 DocumentInfo dc = new DocumentInfo();
396 dc.IncreaseFrequence();
397 dc.setDocumentID(DocID);
398 dc.addActualValue(ActValue);
399 metadataMap.put(DocID, dc);
400 }
401
402 HashMap valueMap = me.getValueList();
403 if(valueMap.containsKey(ActValue)){
404 Integer f = (Integer)valueMap.get(ActValue);
405 int fx = f.intValue();
406 fx++;
407 valueMap.put(ActValue,new Integer(fx));
408 }
409 else{
410
411 valueMap.put(ActValue, new Integer(1));
412 }
413 if(!metadataNameList.contains(me.getMetadataName())){
414 metadataNameList.add(me.getMetadataName());
415 ms.addIndex(me.getMetadataName());
416 }
417
418 UsedMap.put(me.getMetadataName(), me);
419 if(!MDS_list.containsKey(ms.getName())){
420 MDS_list.put(ms.getName(), ms);}
421 else{
422 MetadataSet msx = (MetadataSet)MDS_list.get(ms.getName());
423 msx.addIndex(me.getMetadataName());
424 MDS_list.put(ms.getName(), msx);
425 }
426 }
427 else if((!UsedMap.containsKey(xatt_name)) && xatt_name.indexOf(".")!=-1 && xatt_name.indexOf("dc.")==-1){
428 int dotLocation = xatt_name.indexOf(".");
429 String SetAbbr = xatt_name.substring(0,dotLocation);
430 String SetName = SetAbbr;
431
432 MetadataSet ms = new MetadataSet();
433 ms.setName(SetName);
434 ms.setAbb(SetAbbr);
435
436 MetadataElement me = new MetadataElement();
437 me.setMetadataName(xatt_name);
438 me.IncreaseFrequency();
439 HashMap metadataMap = me.getMetadataList();
440
441 if(metadataMap.containsKey(DocID)){
442 DocumentInfo dc = (DocumentInfo)metadataMap.get(DocID);
443 dc.IncreaseFrequence();
444 dc.addActualValue(ActValue);
445 metadataMap.put(DocID, dc);
446 }
447 else{
448 DocumentInfo dc = new DocumentInfo();
449 dc.IncreaseFrequence();
450 dc.setDocumentID(DocID);
451 dc.addActualValue(ActValue);
452 metadataMap.put(DocID, dc);
453 }
454
455 HashMap valueMap = me.getValueList();
456 if(valueMap.containsKey(ActValue)){
457 Integer f = (Integer)valueMap.get(ActValue);
458 int fx = f.intValue();
459 fx++;
460 valueMap.put(ActValue,new Integer(fx));
461 }
462 else{
463
464 valueMap.put(ActValue, new Integer(1));
465 }
466 if(!metadataNameList.contains(me.getMetadataName())){
467 metadataNameList.add(me.getMetadataName());
468 ms.addIndex(me.getMetadataName());
469 }
470 UsedMap.put(me.getMetadataName(), me);
471
472 if(!MDS_list.containsKey(ms.getName())){
473 MDS_list.put(ms.getName(), ms);
474 }
475 else{
476 MetadataSet msx = (MetadataSet)MDS_list.get(ms.getName());
477 msx.addIndex(me.getMetadataName());
478 MDS_list.put(ms.getName(), msx);
479 }
480 }
481
482 else if ((!UsedMap.containsKey(xatt_name)) && (xatt_name.indexOf("dc.")==0) && xatt_name.indexOf("dc.Description")!=0){
483
484 String SetName = "dublin";
485 xatt_name = xatt_name.replace('^', '.');
486
487 MetadataElement me = new MetadataElement();
488 me.setMetadataName(xatt_name);
489 me.IncreaseFrequency();
490 HashMap metadataMap = me.getMetadataList();
491
492 DocumentInfo dc = new DocumentInfo();
493 dc.IncreaseFrequence();
494 dc.setDocumentID(DocID);
495 dc.addActualValue(ActValue);
496 metadataMap.put(DocID, dc);
497
498 HashMap valueMap = me.getValueList();
499 valueMap.put(ActValue, new Integer(1));
500
501 if(!metadataNameList.contains(me.getMetadataName())){
502 metadataNameList.add(me.getMetadataName());
503 }
504
505 UsedMap.put(me.getMetadataName(), me);
506 MetadataSet msx = (MetadataSet)MDS_list.get(SetName);
507 msx.addIndex(me.getMetadataName());
508 MDS_list.put(SetName, msx);
509 }
510 }
511 }
512
513 int counter = 0;
514 Set s = UsedMap.keySet();
515 Iterator is = s.iterator();
516
517 while(is.hasNext()){
518 String fileName = (String)is.next();
519 MetadataElement me = (MetadataElement)UsedMap.get(fileName);
520 HashMap hp = me.getMetadataList();
521 Collection ks = hp.values();
522 Iterator iks = ks.iterator();
523
524 try{
525 DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
526 DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
527 Document doc = docBuilder.newDocument();
528 Element root = doc.createElement("root");
529 boolean fileExist = (new File(StatsDirectory+fileName+".xml")).exists();
530
531 if(!fileExist){
532 root = doc.createElement("root");
533 }
534 else{
535 doc = docBuilder.parse (new File(StatsDirectory+fileName+".xml"));
536 root = doc.getDocumentElement();
537 }
538
539 while(iks.hasNext()){
540 DocumentInfo dc = (DocumentInfo)iks.next();
541 Element docID = doc.createElement("Document");
542 docID.setAttribute("id", dc.getDocumentID());
543
544 Element freq = doc.createElement("Frequency");
545 Text text = doc.createTextNode(dc.getFrequence()+"");
546 freq.appendChild(text);
547 docID.appendChild(freq);
548
549 Element actValue = doc.createElement("ActualValue");
550 ArrayList alist = dc.getActualValue();
551
552 if(alist.size()==0){
553 text = doc.createTextNode(" ");
554 actValue = doc.createElement("ActualValue");
555 actValue.appendChild(text);
556 docID.appendChild(actValue);
557 }
558
559 for(int i = 0; i<alist.size(); i++){
560 actValue = doc.createElement("ActualValue");
561 String utf8String = new String(((String)alist.get(i)).getBytes(),"UTF-8");
562 Text text1 = doc.createTextNode(utf8String);
563 actValue.appendChild(text1);
564 docID.appendChild(actValue);
565 }
566
567 root.appendChild(docID);
568 docID = null;
569 counter++;
570 }
571
572 TransformerFactory tf= TransformerFactory.newInstance();
573 Transformer transformer= tf.newTransformer();
574 DOMSource source= new DOMSource(root);
575 transformer.setOutputProperty(OutputKeys.INDENT,"yes");
576
577 Writer pwx= new BufferedWriter(new OutputStreamWriter(new FileOutputStream(StatsDirectory+fileName+".xml"),"UTF-8"));
578 StreamResult result= new StreamResult(pwx);
579 transformer.transform(source,result);
580 pwx.close();
581
582 root = null;
583 docBuilderFactory = null;
584 docBuilder = null;
585 doc = null;
586
587 }catch (Exception e) {
588 System.out.println(e);
589 }
590 }
591 }
592
593 public int getDocNum(){
594 return TotalDoc;
595 }
596
597 public HashMap getMetadataSetMap(){
598 MDS_list.remove("extracted");
599 return (HashMap)MDS_list.clone();
600 }
601
602 public String getCollectionName(){
603 return collection_Name;
604 }
605
606 public void setOAIURL(String url){
607 OAI_URL = url;
608 }
609
610 public String getOAIURL(){
611 return OAI_URL;
612 }
613
614 public String getOaiPrefix(){
615 return oai_Prefix;
616 }
617
618 public ArrayList getMetadataNameList(){
619 return (ArrayList)metadataNameList.clone();
620 }
621 private String constructUpdateMessgae (){
622 String message = "<message><request type='system' to=''><system type='configure' subset=''/></request></message>";
623 return message;
624 }
625}
Note: See TracBrowser for help on using the repository browser.