package org.greenstone.mat.servlet; import org.w3c.dom.*; import javax.xml.parsers.*; import javax.xml.transform.*; import javax.xml.transform.dom.*; import javax.xml.transform.stream.*; import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.io.Writer; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.Set; import java.util.StringTokenizer; import org.greenstone.gsdl3.core.MessageRouter; import org.greenstone.gsdl3.util.GDBMWrapper; import org.greenstone.gsdl3.util.GSFile; import org.greenstone.gsdl3.util.GSXML; import org.greenstone.util.GlobalProperties; import org.greenstone.gsdl3.util.XMLConverter; import org.greenstone.gsdl3.util.GSPath; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; public class MetadataStats { private Document doc=null; private MessageRouter mr = null; private XMLConverter converter=null; private GDBMWrapper gdbm_src = null; private GSPath gspath = null; private GlobalProperties globalProperty = null; private String site_name = "localsite"; //private String node_id = "browselist"; private String DLS_SET = "dls"; private String DC_SET = "dublin"; private String OAI_URL = ""; private boolean status = false; private String oai_Prefix =""; private PrintWriter pw; public String StatsDirectory; public String HTMLDirectory; private String destination = ""; private int TotalDoc = 0; String linkIdentifier =""; private HashMap MDS_list = new HashMap(); private String collection = null; public ArrayList metadataNameList = new ArrayList(); private String collection_Name = null; private String DB_TYPE = "gdbm"; protected final String FrequencyNode = "Frequency"; protected final String DocumentNode = "Document"; protected final String ActualValueNode = "ActualValue"; protected final 
String RootDirectory = "archivedir"; protected final String fileSeparator = File.separator; MetadataElement me; private static boolean deleteDir(File dir) { if (dir.isDirectory()) { String[] children = dir.list(); for (int i=0; i"); while(true){ Node message = this.doc.createElement(GSXML.MESSAGE_ELEM); Node request = GSXML.createBasicRequest( this.doc, GSXML.REQUEST_TYPE_PROCESS,collection+"/DocumentMetadataRetrieve","en", ""); Node param_list = this.doc.createElement( GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER); param_list.appendChild(GSXML.createParameter( this.doc, "metadata", MetadataElement)); Node documentNode_list = this.doc.createElement( GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER); if(docSize>=500){ docSize = docSize - 500; for(int i = 0; i<500; i++){ Element documentNode = this.doc.createElement(GSXML.DOC_NODE_ELEM); documentNode.setAttribute("nodeID",(String)doc_list.get(counter)); documentNode_list.appendChild(documentNode); counter++; } request.appendChild(param_list); request.appendChild(documentNode_list); message.appendChild(request); //System.out.println(this.converter.getPrettyString(message)); //pw.write("send request"); //pw.flush(); Node response = mr.process(message); //pw.write("receive request"); //pw.flush(); //System.out.println(this.converter.getPrettyString(response)); getAttributes(response); response = null; request = null; param_list = null; documentNode_list = null; //System.gc(); } else if(docSize<500){ for(int i = 0; i")!=-1){ part = part.replaceAll("", ""); } else if(part.indexOf("")!=-1){ int location = part.indexOf(""); part = part.substring(0,location-1); } children.add(part); } gdbm_src.closeDatabase(); parseMetadata(children,"all"); TotalDoc = children.size(); status = true; } } }

    /**
     * Handles one response node: scans every {@code metadata} element below it,
     * asks {@code getAvailableMetadataSets} to populate a fresh map for each
     * element whose first attribute value mentions a known set prefix, and then
     * hands the map and the response to {@code adjust}.
     *
     * <p>The set is chosen from the value of the element's first attribute:
     * a value containing {@code "dls."} selects {@code DLS_SET}; otherwise a
     * value containing {@code "dc."} selects {@code DC_SET}. Elements that
     * carry no attributes are skipped.
     *
     * @param f the response node; it is cast to {@link Element}
     * @throws ClassCastException if {@code f} is not an {@link Element}
     * @throws NullPointerException if {@code f} is {@code null}
     */
    public void getAttributes(Node f){
        HashMap usedMap = new HashMap();
        Element e = (Element) f;

        NodeList metadataNodes = e.getElementsByTagName("metadata");
        int length = metadataNodes.getLength();
        for (int j = 0; j < length; j++){
            NamedNodeMap attributes = metadataNodes.item(j).getAttributes();
            // NamedNodeMap.item(0) yields null for an attribute-less element;
            // such an element cannot name a metadata set, so skip it instead
            // of failing the whole batch with a NullPointerException.
            Node firstAttribute = (attributes == null) ? null : attributes.item(0);
            if (firstAttribute == null) {
                continue;
            }

            String attName = firstAttribute.getNodeValue();
            if (attName.indexOf("dls.") != -1) {
                getAvailableMetadataSets(DLS_SET, usedMap, "dls");
            } else if (attName.indexOf("dc.") != -1) {
                getAvailableMetadataSets(DC_SET, usedMap, "dc");
            }
        }

        adjust(usedMap, e);

        // Drop the local references before the explicit collection request so
        // the per-response map and DOM tree are eligible for reclamation.
        usedMap = null;
        e = null;
        System.gc();
    }

    /* * MATCH UP THE ELEMENTS FROM BOTH DOCUMENTS AND PRE-DEFINED METADATA SET * MOVE THE ELEMENT FROM NOUSEDMAP TO USEDMAP IF THE ELEMENT HAS NOT BEEN DISCOVERED BEFORE * INCREASE THE COUNTER ONCE THE ELEMENT IS RECONGNIZED * CALCULATE THE TIMES OF ELEMENTS USED AND STORE IT INTO THE LIST */ private void adjust(HashMap UsedMap, Element response){ pw.write(""); pw.flush(); //try{ String DocID;; NodeList nList = response.getElementsByTagName("documentNode"); int length = nList.getLength(); for (int j = 0; j "); pw.flush(); //System.out.println(xatt_name+" "+UsedMap.containsKey(xatt_name)); if(xatt_name.indexOf("dc.")!=-1){ int position = xatt_name.indexOf("."); char x = xatt_name.charAt(position+1); x = Character.toUpperCase(x); xatt_name = "dc."+x+xatt_name.substring(position+2); } if(xatt_name.indexOf('^')!=-1){ xatt_name = xatt_name.replace('^', '.'); } if(UsedMap.containsKey(xatt_name)){ //pw.write("add element to list"); MetadataElement met = (MetadataElement) UsedMap.get(xatt_name); met.IncreaseFrequency(); HashMap metadataMap = met.getMetadataList(); if(metadataMap.containsKey(DocID)){ //pw.write("element contains ID"); DocumentInfo dc = (DocumentInfo)metadataMap.get(DocID); dc.IncreaseFrequence(); dc.addActualValue(ActValue); metadataMap.put(DocID, dc); } else{ //pw.write("element DOES NOT contain ID"); DocumentInfo dc = new DocumentInfo(); dc.IncreaseFrequence(); dc.setDocumentID(DocID); dc.addActualValue(ActValue); metadataMap.put(DocID, dc); } HashMap valueMap = met.getValueList(); if(valueMap.containsKey(ActValue)){ Integer f = 
(Integer)valueMap.get(ActValue); int fx = f.intValue(); fx++; valueMap.put(ActValue,new Integer(fx)); } else{ valueMap.put(ActValue, new Integer(1)); } } //else if ((!UsedMap.containsKey(xatt_name)) && (xatt_name.indexOf(".")==-1) && (!xatt_name.equals("prettymd")) && (!xatt_name.equals("Description")) && (Character.isLetter(firstchart)) && (xatt_name.indexOf("-")==-1)){ else if( (!UsedMap.containsKey(xatt_name)) && xatt_name.equals(RootDirectory)){ //pw.write("NEW EX element"); String SetAbbr = "ex"; String SetName = "extracted"; MetadataSet ms = new MetadataSet(); ms.setName(SetName); ms.setAbb(SetAbbr); MetadataElement me = new MetadataElement(); me.setMetadataName(xatt_name); me.IncreaseFrequency(); HashMap metadataMap = me.getMetadataList(); if(metadataMap.containsKey(DocID)){ DocumentInfo dc = (DocumentInfo)metadataMap.get(DocID); dc.IncreaseFrequence(); dc.addActualValue(ActValue); metadataMap.put(DocID, dc); } else{ DocumentInfo dc = new DocumentInfo(); dc.IncreaseFrequence(); dc.setDocumentID(DocID); dc.addActualValue(ActValue); metadataMap.put(DocID, dc); } HashMap valueMap = me.getValueList(); if(valueMap.containsKey(ActValue)){ Integer f = (Integer)valueMap.get(ActValue); int fx = f.intValue(); fx++; valueMap.put(ActValue,new Integer(fx)); } else{ //System.out.println(ActValue+" doesn't already exist"); valueMap.put(ActValue, new Integer(1)); } if(!metadataNameList.contains(me.getMetadataName())){ metadataNameList.add(me.getMetadataName()); ms.addIndex(me.getMetadataName()); } //pw.write("adding element"); UsedMap.put(me.getMetadataName(), me); if(!MDS_list.containsKey(ms.getName())){MDS_list.put(ms.getName(), ms);} else{ MetadataSet msx = (MetadataSet)MDS_list.get(ms.getName()); msx.addIndex(me.getMetadataName()); MDS_list.put(ms.getName(), msx); } } else if((!UsedMap.containsKey(xatt_name)) && xatt_name.indexOf(".")!=-1 && xatt_name.indexOf("dc.")==-1){ //pw.write("NEW EX element"); int dotLocation = xatt_name.indexOf("."); String SetAbbr = new 
String(); String SetName = new String(); if(dotLocation==0){ SetAbbr = "Other"; SetName = "Other"; } else{ SetAbbr = xatt_name.substring(0,dotLocation); SetName = SetAbbr; } MetadataSet ms = new MetadataSet(); ms.setName(SetName); ms.setAbb(SetAbbr); MetadataElement me = new MetadataElement(); me.setMetadataName(xatt_name); me.IncreaseFrequency(); HashMap metadataMap = me.getMetadataList(); if(metadataMap.containsKey(DocID)){ DocumentInfo dc = (DocumentInfo)metadataMap.get(DocID); dc.IncreaseFrequence(); dc.addActualValue(ActValue); metadataMap.put(DocID, dc); } else{ DocumentInfo dc = new DocumentInfo(); dc.IncreaseFrequence(); dc.setDocumentID(DocID); dc.addActualValue(ActValue); metadataMap.put(DocID, dc); } HashMap valueMap = me.getValueList(); if(valueMap.containsKey(ActValue)){ Integer f = (Integer)valueMap.get(ActValue); int fx = f.intValue(); fx++; valueMap.put(ActValue,new Integer(fx)); } else{ //System.out.println(ActValue+" doesn't already exist"); valueMap.put(ActValue, new Integer(1)); } if(!metadataNameList.contains(me.getMetadataName())){ metadataNameList.add(me.getMetadataName()); ms.addIndex(me.getMetadataName()); } //pw.write("adding element"); UsedMap.put(me.getMetadataName(), me); if(!MDS_list.containsKey(ms.getName())){MDS_list.put(ms.getName(), ms);} else{ MetadataSet msx = (MetadataSet)MDS_list.get(ms.getName()); msx.addIndex(me.getMetadataName()); MDS_list.put(ms.getName(), msx); } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// else if ((!UsedMap.containsKey(xatt_name)) && (xatt_name.indexOf("dc.")==0) && xatt_name.indexOf("dc.Description")!=0){ //pw.write("QUALIFIED DC element"); String SetName = "dublin"; xatt_name = xatt_name.replace('^', '.'); MetadataElement me = new MetadataElement(); me.setMetadataName(xatt_name); me.IncreaseFrequency(); HashMap metadataMap = me.getMetadataList(); DocumentInfo dc = new DocumentInfo(); dc.IncreaseFrequence(); 
dc.setDocumentID(DocID); dc.addActualValue(ActValue); metadataMap.put(DocID, dc); HashMap valueMap = me.getValueList(); valueMap.put(ActValue, new Integer(1)); if(!metadataNameList.contains(me.getMetadataName())){ metadataNameList.add(me.getMetadataName()); } //pw.write("adding element"); UsedMap.put(me.getMetadataName(), me); MetadataSet msx = (MetadataSet)MDS_list.get(SetName); msx.addIndex(me.getMetadataName()); MDS_list.put(SetName, msx); } ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// }// element }//doc id //////////////////// //write file int counter = 0; //System.out.println("write"); //pw.write("write files" + UsedMap.size()+ metadataNameList.size() ); //pw.flush(); Set s = UsedMap.keySet(); Iterator is = s.iterator(); //System.out.println(UsedMap.size()); while(is.hasNext()){ String fileName = (String)is.next(); MetadataElement me = (MetadataElement)UsedMap.get(fileName); HashMap hp = me.getMetadataList(); Collection ks = hp.values(); Iterator iks = ks.iterator(); try{ DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder(); Document doc = docBuilder.newDocument(); Element root = doc.createElement("root"); boolean fileExist = (new File(StatsDirectory+fileName+".xml")).exists(); if(!fileExist){ root = doc.createElement("root"); } else{ doc = docBuilder.parse (new File(StatsDirectory+fileName+".xml")); root = doc.getDocumentElement(); } while(iks.hasNext()){ DocumentInfo dc = (DocumentInfo)iks.next(); Element docID = doc.createElement(DocumentNode); docID.setAttribute("id", dc.getDocumentID()); //pw.write("doc id"+dc.getDocumentID()); Element freq = doc.createElement(FrequencyNode); Text text = doc.createTextNode(dc.getFrequence()+""); freq.appendChild(text); docID.appendChild(freq); Element actValue = doc.createElement(ActualValueNode); ArrayList alist = dc.getActualValue(); 
if(alist.size()==0){ text = doc.createTextNode(" "); actValue = doc.createElement(ActualValueNode); actValue.appendChild(text); docID.appendChild(actValue); } for(int i = 0; i