Changeset 14483


Ignore:
Timestamp:
2007-09-06T11:19:44+12:00 (14 years ago)
Author:
xiao
Message:

make MGPPRetrieveWrapper and MGPPSearchWrapper static variables; synchronize the method findPhraseNumberFromWord() for search and getPhraseData() for retrieve.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • greenstone3/trunk/src/java/org/greenstone/gsdl3/service/PhindPhraseBrowse.java

    r13270 r14483  
    2222
    2323import org.greenstone.mgpp.*;
    24 import org.w3c.dom.Document; 
    25 import org.w3c.dom.Node; 
    26 import org.w3c.dom.Element; 
    27 import org.w3c.dom.Text; 
     24import org.w3c.dom.Document;
     25import org.w3c.dom.Node;
     26import org.w3c.dom.Element;
     27import org.w3c.dom.Text;
    2828
    2929import java.util.Vector;
     
    3333import org.apache.log4j.*;
    3434
    35 /** 
     35/**
    3636 * PhindServices - the phind phrase browsing service
    37  * 
     37 *
    3838 * @author <a href="mailto:kjdon@cs.waikato.ac.nz">Katherine Don</a>
    3939 * @version $Revision$
    4040 */
    4141public class PhindPhraseBrowse
    42     extends ServiceRack {
    43    
    44      static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.PhindPhraseBrowse.class.getName());
    45 
    46     // the services on offer
    47     private static final String PHIND_SERVICE = "PhindApplet";
    48 
    49     private MGPPWrapper mgpp_src=null;
    50     private String basepath = null;
    51 
    52     private Element applet_description = null;
    53    
    /**
     * Creates the service together with its MGPP wrapper and applies the
     * default query settings to that wrapper.
     */
    public PhindPhraseBrowse() {
        MGPPWrapper wrapper = new MGPPWrapper();
        // default params
        wrapper.setQueryLevel("Document");
        wrapper.setReturnLevel("Document");
        wrapper.setMaxDocs(5);
        wrapper.setStem(false);
        wrapper.setCase(true);
        this.mgpp_src = wrapper;
    }
    63 
    64     public void cleanUp() {
    65     super.cleanUp();
    66     this.mgpp_src.unloadIndexData();
    67     }
    68 
    69     /** configure the service module
    70      *
    71      * @param info a DOM Element containing any config info for the service
    72      * @return true if configured
    73      */
    74     public boolean configure(Element info, Element extra_info) {
    75 
    76     if (!super.configure(info, extra_info)){
    77         return false;
    78     }
    79 
    80     logger.info("configuring PhindPhraseBrowse");
    81 
    82     // set up short_service_info_ - for now just has name and type
    83     Element e = this.doc.createElement(GSXML.SERVICE_ELEM);
    84     e.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
    85     e.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
    86     this.short_service_info.appendChild(e);
    87 
    88     // set up the static applet description
    89 
    90     applet_description = this.doc.createElement(GSXML.SERVICE_ELEM);
    91     applet_description.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
    92     applet_description.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
    93 
    94     // add in the applet info for the phind applet
    95     // need to make this dynamic - library names etc
    96     // change the applet params - have a single param with the library name
    97     // this is left blank at this end, and must be filled in by applet action - if the library name is not needed, this param is left out
    98     // phindcgi param now is not complete - library must be prepended to it.
    99     String app_info = "<"+GSXML.APPLET_ELEM+" CODEBASE='applet' CODE='org.greenstone.applet.phind.Phind.class' ARCHIVE='phind.jar, xercesImpl.jar, xml-apis.jar' WIDTH='500' HEIGHT='400'><PARAM NAME='library' VALUE=''/> <PARAM NAME='phindcgi' VALUE='?";
    100     app_info += GSParams.ACTION +"=a&amp;"+GSParams.REQUEST_TYPE +"=r&amp;"+GSParams.SERVICE+"="+PHIND_SERVICE+"&amp;"+GSParams.OUTPUT+"=xml&amp;"+GSParams.RESPONSE_ONLY+"=1'/>";
    101     app_info +="<PARAM NAME='collection'   VALUE='";
    102     app_info += this.cluster_name;
    103     app_info += "'/> <PARAM NAME='classifier' VALUE='1'/>  <PARAM NAME='orientation'  VALUE='vertical'/> <PARAM NAME='depth' VALUE='2'/> <PARAM NAME='resultorder' VALUE='L,l,E,e,D,d'/> <PARAM NAME='backdrop' VALUE='interfaces/default/images/phindbg1.jpg'/><PARAM NAME='fontsize' VALUE='10'/> <PARAM NAME='blocksize'    VALUE='10'/>The Phind java applet.</"+GSXML.APPLET_ELEM+">";
    104    
    105     Document dom = this.converter.getDOM(app_info);
    106     if (dom==null) {
    107         logger.error("Couldn't parse applet info");
    108         return false;
    109     }
    110     Element app_elem = dom.getDocumentElement();
    111     applet_description.appendChild(this.doc.importNode(app_elem, true));
    112 
    113     return true;
    114     }
    115 
    116     protected Element getServiceDescription(String service, String lang, String subset) {
    117     if (!service.equals(PHIND_SERVICE)) {
    118         return null;
    119     }
    120     Element describe = (Element) applet_description.cloneNode(true);
    121     describe.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_NAME,  getTextString(PHIND_SERVICE+".name", lang)));
    122     describe.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_DESCRIPTION,  getTextString(PHIND_SERVICE+".description", lang)));
    123     return describe;
    124     }
    125 
    126     protected Element processPhindApplet(Element request) {
    127    
    128     Element param_elem = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
    129     HashMap params = GSXML.extractParams(param_elem, false);
    130    
    131     long first_e = Long.parseLong((String)params.get("pfe"));
    132     long last_e = Long.parseLong((String)params.get("ple"));
    133     long first_l = Long.parseLong((String)params.get("pfl"));
    134     long last_l = Long.parseLong((String)params.get("pll"));
    135     long first_d = Long.parseLong((String)params.get("pfd"));
    136     long last_d = Long.parseLong((String)params.get("pld"));
    137    
    138     long phrase;
    139     String phrase_str = (String)params.get("ppnum");
    140     if (phrase_str == null || phrase_str.equals("")) {
    141         phrase=0;
    142     } else {
    143         phrase = Long.parseLong(phrase_str);
    144     }
    145     String word = (String)params.get("pptext");
    146     String phind_index = (String)params.get("pc");
    147     // the location of the mgpp database files
    148     this.basepath = GSFile.phindBaseDir(this.site_home, this.cluster_name, phind_index);
    149 
    150     // the result element
    151     Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
    152     result.setAttribute(GSXML.FROM_ATT, PHIND_SERVICE);
    153     result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
    154 
    155     // applet result info must be in appletInfo element
    156     Element applet_data = this.doc.createElement(GSXML.APPLET_DATA_ELEM);
    157     result.appendChild(applet_data);
    158     Element phind_data = this.doc.createElement("phindData");
    159     applet_data.appendChild(phind_data);
    160 
    161 
    162     // if we dont know the phrase number, look it up
    163     if (phrase == 0) {
    164         if (word==null || word.equals("")) {
    165         Element error = phindError("no word or phrase");
    166         phind_data.appendChild(error);
    167         return result;
    168         }
    169         phrase = findPhraseNumberFromWord( word);
    170     }
    171     if (phrase==0) {
    172         // the word is not in the collection
    173         // return a phind error string
    174         Element error = phindError("the term "+word+" is not in the collection");
    175         phind_data.appendChild(error);
    176         return result;
    177     }
    178    
    179     // get the phrase data into the phind_data node
    180     getPhraseData(phind_data, phrase, first_l, last_l,
    181               first_e, last_e,  first_d, last_d);
    182     return result;
    183    
    184    
    185     }// processPhindApplet
    186    
    187     protected long findPhraseNumberFromWord(String word) {
    188 
    189     // set the mgpp index data - we are looking up pword
    190     this.mgpp_src.loadIndexData(this.basepath+File.separatorChar+"pword");
    191 
    192     this.mgpp_src.runQuery(word);
    193 
    194     MGPPQueryResult res = this.mgpp_src.getQueryResult();
    195     Vector docs = res.getDocs();
    196     if (docs.size()==0) {
    197         // phrase not found
    198         return 0;
    199     }
    200     MGPPDocInfo doc = (MGPPDocInfo)docs.firstElement();
    201     return doc.num_;
    202     }
    203 
    204     protected boolean getPhraseData(Element phind_data,
    205                     long phrase, long first_l, long last_l,
    206                     long first_e, long last_e, long first_d,
    207                     long last_d) {
    208 
    209     String record = this.mgpp_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
    210                           phrase);
    211     if (record.equals("")) {
    212         Element error = phindError("somethings gone wrong - we haven't got a record for phrase number "+phrase);
    213         phind_data.appendChild(error);
    214         return false;
    215     }
    216    
    217     // parse the record - its in gordons cryptic form
    218     // ":word:tf:ef:df:el:dl:lf:ll"
    219     // el: e,e,e
    220     // dl: d;f,d;f,
    221     // lf and ll may be null
    222     // l: type,dest, dest; type,dest,dest
    223 
    224     // ignore everything up to and including first colon (has
    225     // <Document>3505: at the start)
    226     record = record.substring(record.indexOf(':')+1);
    227 
    228     // split on ':'
    229     String [] fields = record.split(":");
    230     String word = fields[0];
    231     String tf = fields[1];
    232     String ef = fields[2];
    233     String df = fields[3];
    234 
    235    
    236     String expansions = fields[4];
    237     String documents = fields[5];
    238     String lf = "0";
    239     String linklist = "";
    240     if (fields.length > 7) {// have thesaurus stuff
    241         lf =fields[6];
    242         linklist = fields[7];
    243     }
    244    
    245     // the phindData attributes and phrase
    246     phind_data.setAttribute("id", Long.toString(phrase));
    247     phind_data.setAttribute("df", df);
    248     phind_data.setAttribute("ef", ef);
    249     phind_data.setAttribute("lf", lf);
    250     phind_data.setAttribute("tf", tf);
    251     GSXML.createTextElement(this.doc, "phrase", word);
    252 
    253     addExpansionList(phind_data, expansions, word, ef, first_e, last_e);
    254     addDocumentList(phind_data, documents, word, df, first_d, last_d);
    255     if (!lf.equals("0")) {
    256         addThesaurusList(phind_data, linklist, word, lf, first_l, last_l);
    257     }
    258     return true;
    259     }
    260 
    261     protected boolean addExpansionList( Element phind_data, String record,
    262                        String word,
    263                        String freq,
    264                        long first, long last) {
    265 
    266     Element expansion_list = this.doc.createElement("expansionList");
    267     phind_data.appendChild(expansion_list);
    268     expansion_list.setAttribute("length", freq);
    269     expansion_list.setAttribute("start", Long.toString(first));
    270     expansion_list.setAttribute("end", Long.toString(last));
    271 
    272     // get the list of strings
    273     String [] expansions = record.split(",");
    274     int length = expansions.length;
    275     if (length < last) last = length;
    276     for (long i = first; i < last; i++) {
    277         long num  = Long.parseLong(expansions[(int)i]);
    278         Element expansion = getExpansion( num, word);
    279         expansion.setAttribute("num", Long.toString(i));
    280         expansion_list.appendChild(expansion);
    281     }
    282     return true;
    283     }
    284    
    285     protected Element getExpansion(long phrase_num,
    286                    String orig_phrase) {
    287    
    288     // look up the phrase in the pdata thingy
    289     String record = this.mgpp_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
    290                           phrase_num);
    291 
    292     if (record ==null || record.equals("")) return null;
    293 
    294     // ignore everything up to and including first colon
    295     record = record.substring(record.indexOf(':')+1);
    296 
    297     String [] fields = record.split(":");
    298     String phrase = fields[0];
    299     String tf = fields[1];
    300     //String ef = fields[2]; dont use this
    301     String df = fields[3];
    302 
    303     Element expansion = this.doc.createElement("expansion");
    304     expansion.setAttribute("tf", tf);
    305     expansion.setAttribute("df", df);
    306     expansion.setAttribute("id", Long.toString(phrase_num));
    307 
    308     // get teh suffix and prefix
    309     String [] ends = splitPhraseOnWord(phrase, orig_phrase);
    310     if (!ends[0].equals("")) {
    311         expansion.appendChild(GSXML.createTextElement(this.doc, "prefix", ends[0]));
    312     }
    313     if (!ends[1].equals("")) {
    314         expansion.appendChild(GSXML.createTextElement(this.doc, "suffix", ends[1]));
    315     }
    316 
    317     return expansion;
    318 
    319     }
    320 
    321     protected boolean addDocumentList(Element phind_data, String record,
    322                       String word,
    323                       String freq,
    324                       long first, long last) {
    325 
    326     Element document_list = this.doc.createElement("documentList");
    327     phind_data.appendChild(document_list);
    328     document_list.setAttribute("length", freq);
    329     document_list.setAttribute("start", Long.toString(first));
    330     document_list.setAttribute("end", Long.toString(last));
    331 
    332     // get the list of doc,freq
    333     String [] doc_freqs = record.split(";");
    334     int length = doc_freqs.length;
    335     if (length<last) last=length;
    336 
    337     for (long i = first; i < last; i++) {
    338         String doc_elem = doc_freqs[(int)i];
    339         int p = doc_elem.indexOf(',');
    340         long doc_num;
    341         String doc_freq;
    342         if (p == -1) { // there is no freq in the record
    343         doc_num =Long.parseLong(doc_elem);
    344         doc_freq = "1";
    345         } else {
    346         doc_num = Long.parseLong(doc_elem.substring(0,p));
    347         doc_freq = doc_elem.substring(p+1);
    348         }
    349         Element document = getDocument( doc_num);
    350         document.setAttribute("freq", doc_freq);
    351         document.setAttribute("num", Long.toString(i));
    352         document_list.appendChild(document);
    353     }
    354 
    355    
    356     return true;
    357     }
    358 
    359 
    360     protected Element getDocument(long doc_num) {
    361    
    362     // look up the phrase in the docs thingy
    363     String record = this.mgpp_src.getDocument(this.basepath+File.separatorChar+"docs", "Document",
    364                           doc_num);
    365    
    366     if (record ==null || record.equals("")) return null;
    367    
    368     // ignore everything up to and including first \t
    369     record = record.substring(record.indexOf('\t')+1);
    370 
    371     String [] fields = record.split("\t");
    372     String hash = fields[0];
    373     String title = fields[1];
    374 
    375     Element d = this.doc.createElement("document");
    376     d.setAttribute("hash", hash);
    377     d.appendChild(GSXML.createTextElement(this.doc, "title", title));
    378    
    379     return d;
    380 
    381     }
    382     protected boolean addThesaurusList(Element phind_data, String record,
    383                        String word,
    384                        String freq,
    385                        long first, long last) {
    386 
    387 
    388     Element thesaurus_list = this.doc.createElement("thesaurusList");
    389     phind_data.appendChild(thesaurus_list);
    390     thesaurus_list.setAttribute("length", freq);
    391     thesaurus_list.setAttribute("start", Long.toString(first));
    392     thesaurus_list.setAttribute("end", Long.toString(last));
    393    
    394     // get the list of type,dest,dest
    395     String [] links = record.split(";");
    396     int length = links.length;
    397     long index = 0;
    398     for (int i = 0; i < length; i++) { // go through the entries
    399         String link_info = links[(int)i];
    400         String [] items = link_info.split(",");
    401         // the first entry is teh type
    402         String type = items[0];
    403         for (int j = 1; j<items.length; j++, index++) {
    404         if (index >= first && index < last) { // only output the ones we want
    405             long phrase = Long.parseLong(items[j]);
    406             Element t = getThesaurus(phrase);
    407             t.setAttribute("type", type);
    408             thesaurus_list.appendChild(t);
    409         }
    410         }
    411     }
    412 
    413     return true;
    414     }
    415 
    416     protected Element getThesaurus(long phrase_num) {
    417 
    418     // look up the phrase in the pdata thingy
    419     String record = this.mgpp_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
    420                           phrase_num);
    421 
    422     if (record ==null || record.equals("")) return null;
    423 
    424     // ignore everything up to and including first colon
    425     record = record.substring(record.indexOf(':')+1);
    426 
    427     String [] fields = record.split(":");
    428     String phrase = fields[0];
    429     String tf = fields[1];
    430     //String ef = fields[2]; dont use this
    431     String df = fields[3];
    432 
    433     Element thesaurus = this.doc.createElement("thesaurus");
    434     thesaurus.setAttribute("tf", tf);
    435     thesaurus.setAttribute("df", df);
    436     thesaurus.setAttribute("id", Long.toString(phrase_num));
    437     thesaurus.appendChild(GSXML.createTextElement(this.doc, "phrase", phrase));
    438     return thesaurus;
    439 
    440     }
    441 
    442     /** returns an array of two elements - the prefix and the suffix*/
    443     protected String [] splitPhraseOnWord(String phrase, String word) {
    444    
    445     if (word.equals("")) {
    446        
    447         String [] res =  {phrase, ""};
    448         return res;
    449     }
    450     // use 2 so that we only split on the first occurrance. trailing empty strings should be included
    451     String [] result = phrase.split(word, 2);
    452     return result;
    453    
    454     }
    455 
    456     protected Element phindError(String message) {
    457     Element e = this.doc.createElement("phindError");
    458     Text t = this.doc.createTextNode(message);
    459     e.appendChild(t);
    460     return e;
    461     }
    462    
     42  extends ServiceRack {
     43 
     44  static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.PhindPhraseBrowse.class.getName());
     45 
     46  // the services on offer
     47  private static final String PHIND_SERVICE = "PhindApplet";
     48 
     49  private static MGPPRetrieveWrapper mgpp_retrieve_src=null;
     50  private static MGPPSearchWrapper mgpp_search_src=null;
     51  private String basepath = null;
     52 
     53  private Element applet_description = null;
     54 
     55  public PhindPhraseBrowse() {
     56    if(this.mgpp_retrieve_src == null) {
     57      this.mgpp_retrieve_src = new MGPPRetrieveWrapper();
     58    }
     59    if(this.mgpp_search_src == null) {
     60      this.mgpp_search_src = new MGPPSearchWrapper();
     61    }
     62    // set up the default params
     63    this.mgpp_search_src.setQueryLevel("Document");
     64    this.mgpp_search_src.setReturnLevel("Document");
     65    this.mgpp_search_src.setMaxDocs(5);
     66    this.mgpp_search_src.setStem(false);
     67    this.mgpp_search_src.setCase(true);
     68  }
     69 
     70  public void cleanUp() {
     71    super.cleanUp();
     72    this.mgpp_retrieve_src.unloadIndexData();
     73    this.mgpp_search_src.unloadIndexData();
     74  }
     75 
     76  /** configure the service module
     77   *
     78   * @param info a DOM Element containing any config info for the service
     79   * @return true if configured
     80   */
     81  public boolean configure(Element info, Element extra_info) {
     82   
     83    if (!super.configure(info, extra_info)){
     84      return false;
     85    }
     86   
     87    logger.info("configuring PhindPhraseBrowse");
     88   
     89    // set up short_service_info_ - for now just has name and type
     90    Element e = this.doc.createElement(GSXML.SERVICE_ELEM);
     91    e.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
     92    e.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
     93    this.short_service_info.appendChild(e);
     94   
     95    // set up the static applet description
     96   
     97    applet_description = this.doc.createElement(GSXML.SERVICE_ELEM);
     98    applet_description.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
     99    applet_description.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
     100   
     101    // add in the applet info for the phind applet
     102    // need to make this dynamic - library names etc
     103    // change the applet params - have a single param with the library name
     104    // this is left blank at this end, and must be filled in by applet action - if the library name is not needed, this param is left out
     105    // phindcgi param now is not complete - library must be prepended to it.
     106    String app_info = "<"+GSXML.APPLET_ELEM+" CODEBASE='applet' CODE='org.greenstone.applet.phind.Phind.class' ARCHIVE='phind.jar, xercesImpl.jar, xml-apis.jar' WIDTH='500' HEIGHT='400'><PARAM NAME='library' VALUE=''/> <PARAM NAME='phindcgi' VALUE='?";
     107    app_info += GSParams.ACTION +"=a&amp;"+GSParams.REQUEST_TYPE +"=r&amp;"+GSParams.SERVICE+"="+PHIND_SERVICE+"&amp;"+GSParams.OUTPUT+"=xml&amp;"+GSParams.RESPONSE_ONLY+"=1'/>";
     108    app_info +="<PARAM NAME='collection'   VALUE='";
     109    app_info += this.cluster_name;
     110    app_info += "'/> <PARAM NAME='classifier' VALUE='1'/>  <PARAM NAME='orientation'  VALUE='vertical'/> <PARAM NAME='depth' VALUE='2'/> <PARAM NAME='resultorder' VALUE='L,l,E,e,D,d'/> <PARAM NAME='backdrop' VALUE='interfaces/default/images/phindbg1.jpg'/><PARAM NAME='fontsize' VALUE='10'/> <PARAM NAME='blocksize'    VALUE='10'/>The Phind java applet.</"+GSXML.APPLET_ELEM+">";
     111   
     112    Document dom = this.converter.getDOM(app_info);
     113    if (dom==null) {
     114      logger.error("Couldn't parse applet info");
     115      return false;
     116    }
     117    Element app_elem = dom.getDocumentElement();
     118    applet_description.appendChild(this.doc.importNode(app_elem, true));
     119   
     120    return true;
     121  }
     122 
     123  protected Element getServiceDescription(String service, String lang, String subset) {
     124    if (!service.equals(PHIND_SERVICE)) {
     125      return null;
     126    }
     127    Element describe = (Element) applet_description.cloneNode(true);
     128    describe.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_NAME,  getTextString(PHIND_SERVICE+".name", lang)));
     129    describe.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_DESCRIPTION,  getTextString(PHIND_SERVICE+".description", lang)));
     130    return describe;
     131  }
     132 
     133  protected Element processPhindApplet(Element request) {
     134   
     135    Element param_elem = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
     136    HashMap params = GSXML.extractParams(param_elem, false);
     137   
     138    long first_e = Long.parseLong((String)params.get("pfe"));
     139    long last_e = Long.parseLong((String)params.get("ple"));
     140    long first_l = Long.parseLong((String)params.get("pfl"));
     141    long last_l = Long.parseLong((String)params.get("pll"));
     142    long first_d = Long.parseLong((String)params.get("pfd"));
     143    long last_d = Long.parseLong((String)params.get("pld"));
     144   
     145    long phrase;
     146    String phrase_str = (String)params.get("ppnum");
     147    if (phrase_str == null || phrase_str.equals("")) {
     148      phrase=0;
     149    } else {
     150      phrase = Long.parseLong(phrase_str);
     151    }
     152    String word = (String)params.get("pptext");
     153    String phind_index = (String)params.get("pc");
     154    // the location of the mgpp database files
     155    this.basepath = GSFile.phindBaseDir(this.site_home, this.cluster_name, phind_index);
     156   
     157    // the result element
     158    Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
     159    result.setAttribute(GSXML.FROM_ATT, PHIND_SERVICE);
     160    result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
     161   
     162    // applet result info must be in appletInfo element
     163    Element applet_data = this.doc.createElement(GSXML.APPLET_DATA_ELEM);
     164    result.appendChild(applet_data);
     165    Element phind_data = this.doc.createElement("phindData");
     166    applet_data.appendChild(phind_data);
     167   
     168   
     169    // if we dont know the phrase number, look it up
     170    if (phrase == 0) {
     171      if (word==null || word.equals("")) {
     172        Element error = phindError("no word or phrase");
     173        phind_data.appendChild(error);
     174        return result;
     175      }
     176      phrase = findPhraseNumberFromWord( word);
     177    }
     178    if (phrase==0) {
     179      // the word is not in the collection
     180      // return a phind error string
     181      Element error = phindError("the term "+word+" is not in the collection");
     182      phind_data.appendChild(error);
     183      return result;
     184    }
     185   
     186    // get the phrase data into the phind_data node
     187    getPhraseData(phind_data, phrase, first_l, last_l,
     188      first_e, last_e,  first_d, last_d);
     189    return result;
     190   
     191   
     192  }// processPhindApplet
     193 
     194  protected long findPhraseNumberFromWord(String word) {
     195    synchronized (mgpp_search_src) {
     196        // set the mgpp index data - we are looking up pword
     197        mgpp_search_src.loadIndexData(this.basepath+File.separatorChar+"pword");
     198       
     199        mgpp_search_src.runQuery(word);
     200       
     201        MGPPQueryResult res = mgpp_search_src.getQueryResult();
     202        Vector docs = res.getDocs();
     203        if (docs.size()==0) {
     204            // phrase not found
     205            return 0;
     206        }
     207        MGPPDocInfo doc = (MGPPDocInfo)docs.firstElement();
     208        return doc.num_;
     209    }
     210  }
     211 
     212  protected boolean getPhraseData(Element phind_data,
     213    long phrase, long first_l, long last_l,
     214    long first_e, long last_e, long first_d,
     215    long last_d) {
     216   
     217      synchronized (mgpp_retrieve_src) {
     218    String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
     219      phrase);
     220    if (record.equals("")) {
     221      Element error = phindError("somethings gone wrong - we haven't got a record for phrase number "+phrase);
     222      phind_data.appendChild(error);
     223      return false;
     224    }
     225   
     226    // parse the record - its in gordons cryptic form
     227    // ":word:tf:ef:df:el:dl:lf:ll"
     228    // el: e,e,e
     229    // dl: d;f,d;f,
     230    // lf and ll may be null
     231    // l: type,dest, dest; type,dest,dest
     232   
     233    // ignore everything up to and including first colon (has
     234    // <Document>3505: at the start)
     235    record = record.substring(record.indexOf(':')+1);
     236   
     237    // split on ':'
     238    String [] fields = record.split(":");
     239    String word = fields[0];
     240    String tf = fields[1];
     241    String ef = fields[2];
     242    String df = fields[3];
     243   
     244   
     245    String expansions = fields[4];
     246    String documents = fields[5];
     247    String lf = "0";
     248    String linklist = "";
     249    if (fields.length > 7) {// have thesaurus stuff
     250      lf =fields[6];
     251      linklist = fields[7];
     252    }
     253   
     254    // the phindData attributes and phrase
     255    phind_data.setAttribute("id", Long.toString(phrase));
     256    phind_data.setAttribute("df", df);
     257    phind_data.setAttribute("ef", ef);
     258    phind_data.setAttribute("lf", lf);
     259    phind_data.setAttribute("tf", tf);
     260    GSXML.createTextElement(this.doc, "phrase", word);
     261   
     262    addExpansionList(phind_data, expansions, word, ef, first_e, last_e);
     263    addDocumentList(phind_data, documents, word, df, first_d, last_d);
     264    if (!lf.equals("0")) {
     265      addThesaurusList(phind_data, linklist, word, lf, first_l, last_l);
     266    }
     267    return true;
     268      }
     269  }
     270 
     271  protected boolean addExpansionList( Element phind_data, String record,
     272    String word,
     273    String freq,
     274    long first, long last) {
     275   
     276    Element expansion_list = this.doc.createElement("expansionList");
     277    phind_data.appendChild(expansion_list);
     278    expansion_list.setAttribute("length", freq);
     279    expansion_list.setAttribute("start", Long.toString(first));
     280    expansion_list.setAttribute("end", Long.toString(last));
     281   
     282    // get the list of strings
     283    String [] expansions = record.split(",");
     284    int length = expansions.length;
     285    if (length < last) last = length;
     286    for (long i = first; i < last; i++) {
     287      long num  = Long.parseLong(expansions[(int)i]);
     288      Element expansion = getExpansion( num, word);
     289      expansion.setAttribute("num", Long.toString(i));
     290      expansion_list.appendChild(expansion);
     291    }
     292    return true;
     293  }
     294 
     295  protected Element getExpansion(long phrase_num,
     296    String orig_phrase) {
     297   
     298    // look up the phrase in the pdata thingy
     299    String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
     300      phrase_num);
     301   
     302    if (record ==null || record.equals("")) return null;
     303   
     304    // ignore everything up to and including first colon
     305    record = record.substring(record.indexOf(':')+1);
     306   
     307    String [] fields = record.split(":");
     308    String phrase = fields[0];
     309    String tf = fields[1];
     310    //String ef = fields[2]; dont use this
     311    String df = fields[3];
     312   
     313    Element expansion = this.doc.createElement("expansion");
     314    expansion.setAttribute("tf", tf);
     315    expansion.setAttribute("df", df);
     316    expansion.setAttribute("id", Long.toString(phrase_num));
     317   
     318    // get teh suffix and prefix
     319    String [] ends = splitPhraseOnWord(phrase, orig_phrase);
     320    if (!ends[0].equals("")) {
     321      expansion.appendChild(GSXML.createTextElement(this.doc, "prefix", ends[0]));
     322    }
     323    if (!ends[1].equals("")) {
     324      expansion.appendChild(GSXML.createTextElement(this.doc, "suffix", ends[1]));
     325    }
     326   
     327    return expansion;
     328   
     329  }
     330 
     331  protected boolean addDocumentList(Element phind_data, String record,
     332    String word,
     333    String freq,
     334    long first, long last) {
     335   
     336    Element document_list = this.doc.createElement("documentList");
     337    phind_data.appendChild(document_list);
     338    document_list.setAttribute("length", freq);
     339    document_list.setAttribute("start", Long.toString(first));
     340    document_list.setAttribute("end", Long.toString(last));
     341   
     342    // get the list of doc,freq
     343    String [] doc_freqs = record.split(";");
     344    int length = doc_freqs.length;
     345    if (length<last) last=length;
     346   
     347    for (long i = first; i < last; i++) {
     348      String doc_elem = doc_freqs[(int)i];
     349      int p = doc_elem.indexOf(',');
     350      long doc_num;
     351      String doc_freq;
     352      if (p == -1) { // there is no freq in the record
     353        doc_num =Long.parseLong(doc_elem);
     354        doc_freq = "1";
     355      } else {
     356        doc_num = Long.parseLong(doc_elem.substring(0,p));
     357        doc_freq = doc_elem.substring(p+1);
     358      }
     359      Element document = getDocument( doc_num);
     360      document.setAttribute("freq", doc_freq);
     361      document.setAttribute("num", Long.toString(i));
     362      document_list.appendChild(document);
     363    }
     364   
     365   
     366    return true;
     367  }
     368 
     369 
     370  protected Element getDocument(long doc_num) {
     371   
     372    // look up the phrase in the docs thingy
     373    String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"docs", "Document",
     374      doc_num);
     375   
     376    if (record ==null || record.equals("")) return null;
     377   
     378    // ignore everything up to and including first \t
     379    record = record.substring(record.indexOf('\t')+1);
     380   
     381    String [] fields = record.split("\t");
     382    String hash = fields[0];
     383    String title = fields[1];
     384   
     385    Element d = this.doc.createElement("document");
     386    d.setAttribute("hash", hash);
     387    d.appendChild(GSXML.createTextElement(this.doc, "title", title));
     388   
     389    return d;
     390   
     391  }
     392  protected boolean addThesaurusList(Element phind_data, String record,
     393    String word,
     394    String freq,
     395    long first, long last) {
     396   
     397   
     398    Element thesaurus_list = this.doc.createElement("thesaurusList");
     399    phind_data.appendChild(thesaurus_list);
     400    thesaurus_list.setAttribute("length", freq);
     401    thesaurus_list.setAttribute("start", Long.toString(first));
     402    thesaurus_list.setAttribute("end", Long.toString(last));
     403   
     404    // get the list of type,dest,dest
     405    String [] links = record.split(";");
     406    int length = links.length;
     407    long index = 0;
     408    for (int i = 0; i < length; i++) { // go through the entries
     409      String link_info = links[(int)i];
     410      String [] items = link_info.split(",");
     411      // the first entry is teh type
     412      String type = items[0];
     413      for (int j = 1; j<items.length; j++, index++) {
     414        if (index >= first && index < last) { // only output the ones we want
     415          long phrase = Long.parseLong(items[j]);
     416          Element t = getThesaurus(phrase);
     417          t.setAttribute("type", type);
     418          thesaurus_list.appendChild(t);
     419        }
     420      }
     421    }
     422   
     423    return true;
     424  }
     425 
     426  protected Element getThesaurus(long phrase_num) {
     427   
     428    // look up the phrase in the pdata thingy
     429    String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
     430      phrase_num);
     431   
     432    if (record ==null || record.equals("")) return null;
     433   
     434    // ignore everything up to and including first colon
     435    record = record.substring(record.indexOf(':')+1);
     436   
     437    String [] fields = record.split(":");
     438    String phrase = fields[0];
     439    String tf = fields[1];
     440    //String ef = fields[2]; dont use this
     441    String df = fields[3];
     442   
     443    Element thesaurus = this.doc.createElement("thesaurus");
     444    thesaurus.setAttribute("tf", tf);
     445    thesaurus.setAttribute("df", df);
     446    thesaurus.setAttribute("id", Long.toString(phrase_num));
     447    thesaurus.appendChild(GSXML.createTextElement(this.doc, "phrase", phrase));
     448    return thesaurus;
     449   
     450  }
     451 
     452  /** returns an array of two elements - the prefix and the suffix*/
     453  protected String [] splitPhraseOnWord(String phrase, String word) {
     454   
     455    if (word.equals("")) {
     456     
     457      String [] res =  {phrase, ""};
     458      return res;
     459    }
     460    // use 2 so that we only split on the first occurrance. trailing empty strings should be included
     461    String [] result = phrase.split(word, 2);
     462    return result;
     463   
     464  }
     465 
     466  protected Element phindError(String message) {
     467    Element e = this.doc.createElement("phindError");
     468    Text t = this.doc.createTextNode(message);
     469    e.appendChild(t);
     470    return e;
     471  }
     472 
    463473}
    464474
Note: See TracChangeset for help on using the changeset viewer.