/********************************************************************** * * phindaction.cpp -- * * Copyright 2001 Gordon W. Paynter * Copyright 2001 The New Zealand Digital Library Project * * A component of the Greenstone digital library software * from the New Zealand Digital Library Project at the * University of Waikato, New Zealand. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * *********************************************************************/ #include "gsdl_modules_cfg.h" #ifdef GSDL_USE_PHIND_ACTION // Note that this action uses mgpp to retrieve phind info, calling MGQuery // etc. directly, not through the protocol. This breaks our receptionist - // collection server separation and should be fixed some day I guess. 
#include "phindaction.h"
#include "fileutil.h"
#include "gsdlunicode.h"

// Register the CGI arguments understood by the phind action.
//
// All ten arguments are single-valued, have a weak default and are never
// saved (cgiarginfo::mustnot); only the names, the multiple-character flag
// and the default value vary, so those are kept in a table.
phindaction::phindaction () {

  struct phindarg {
    const char *shortname;
    const char *longname;
    bool multiplechar;
    const char *argdefault;   // NULL selects g_EmptyText
  };

  static const phindarg phindargs[] = {
    { "pc",     "phind classifier",    true,  NULL },
    { "pxml",   "phind XML mode",      false, "0"  },
    { "ppnum",  "phind phrase number", true,  "0"  },
    { "pptext", "phind phrase text",   true,  NULL },
    { "pfe",    "phind first_e",       true,  "0"  },
    { "ple",    "phind last_e",        true,  "10" },
    { "pfl",    "phind first_l",       true,  "0"  },
    { "pll",    "phind last_l",        true,  "10" },
    { "pfd",    "phind first_d",       true,  "0"  },
    { "pld",    "phind last_d",        true,  "10" }
  };
  const unsigned int numargs = sizeof(phindargs) / sizeof(phindargs[0]);

  // One cgiarginfo object is reused for every registration; each pass
  // assigns every field that is set anywhere in this constructor.
  cgiarginfo arg_ainfo;
  for (unsigned int i = 0; i < numargs; ++i) {
    arg_ainfo.shortname = phindargs[i].shortname;
    arg_ainfo.longname = phindargs[i].longname;
    arg_ainfo.multiplechar = phindargs[i].multiplechar;
    arg_ainfo.multiplevalue = false;
    arg_ainfo.defaultstatus = cgiarginfo::weak;
    if (phindargs[i].argdefault == NULL) {
      arg_ainfo.argdefault = g_EmptyText;
    } else {
      arg_ainfo.argdefault = phindargs[i].argdefault;
    }
    arg_ainfo.savedarginfo = cgiarginfo::mustnot;
    argsinfo.addarginfo (NULL, arg_ainfo);
  }
}

phindaction::~phindaction () {
}

// Report the response type: always `content`, with a MIME type of text/xml
// when the "pxml" argument is "1" and text/html otherwise.
void phindaction::get_cgihead_info (cgiargsclass &args, recptprotolistclass * /*protos*/,
				    response_t &response,text_t &response_data,
				    ostream &/*logout*/) {
  response = content;
  if (args["pxml"] == "1") {
    response_data = "text/xml";
  } else {
    response_data = "text/html";
  }
}

// Answer a phind request.
//
// Reads the phrase number ("ppnum") or phrase text ("pptext") and the
// first/last ranges for expansions, thesaurus links and documents from args,
// looks the phrase up in the collection's phind index and writes the result
// to textout, in XML mode when "pxml" is "1" and in HTML mode otherwise.
//
// Errors are reported to the client through output_error().  Every path,
// including the error paths, returns true.
//
// NOTE(review): in the file as received, several string literals in this
// function are empty or contain raw line breaks, and the vector declarations
// have no element type.  It looks as if markup between angle brackets was
// stripped from the file.  Those tokens are reproduced exactly as received
// and need restoring from version control.
bool phindaction::do_action (cgiargsclass &args, recptprotolistclass *protos,
			     browsermapclass * /*browsers*/, displayclass &disp,
			     outconvertclass &outconvert, ostream &textout,
			     ostream &logout) {

  unsigned long count_l, count_e, count_d;
  unsigned long phrase = args["ppnum"].getulong();
  text_t &word = args["pptext"];
  unsigned long first_e = args["pfe"].getulong();
  unsigned long last_e = args["ple"].getulong();
  unsigned long first_l = args["pfl"].getulong();
  unsigned long last_l = args["pll"].getulong();
  unsigned long first_d = args["pfd"].getulong();
  unsigned long last_d = args["pld"].getulong();
  bool XMLmode = false;
  if (args["pxml"] == "1") XMLmode = true;

  // must have a valid collection server
  recptproto *collectproto = protos->getrecptproto (args["c"], logout);
  if (collectproto == NULL) {
    output_error("phindaction: ERROR: collection not set", textout, outconvert, disp, logout, XMLmode);
    return true;
  }

  // the frequency and occurances of the phrase
  unsigned long tf;
  vector el, linkdest, docNums, docfreq;
  vector linktype;
  // the number of occurances to display
  unsigned long ef, lf, df;

  text_t basepath = filename_cat(collecthome, args["c"], "index", "phind" + args["pc"]);

  // If we don't know the phrase number, look it up
  if (phrase == 0) {

    if (word.empty()) {
      output_error("phindaction: ERROR: no phrase number or word", textout, outconvert, disp, logout, XMLmode);
      return true;
    }

    DocNumArray result;
    /** In order to prevent browser crashing problems, any method which
     *  previously suffered a silent fatal error, now instead returns false
     *  to indicate a fatal error has occured. We can then dispatch an
     *  appropriate error tag to the Phind applet (rather than leave it
     *  whiling away the milliseconds until the end of existence - or at
     *  least your browser - in an infinite loop!)
     *  DLConsulting 12-07-2004
     */
    if(!find_phrase_number_from_word(basepath, word, result)) {
      output_error("phindaction: Fatal Error! Couldn't load index information in find_phrase_number_from_word()", textout, outconvert, disp, logout, XMLmode);
      return true;
    }

    if (result.empty()) {
      output_error("phindaction: The search term ("+word+") does not occur in the collection", textout, outconvert, disp, logout, XMLmode);
      return true;
    } else {
      phrase = result[0];
    }
  }

  // Create a TextData object to read the phrase data (pdata)
  TextData textdata;

  text_t fullpath = filename_cat(basepath, "pdata");
  char *fullpathc = fullpath.getcstr();
#if defined __WIN32__
  char *base = "";
#else
  char *base = "/";
#endif

  // The path buffer is only needed for the LoadData call, so release it
  // before looking at the result; it used to leak when loading failed.
  const bool loaded = textdata.LoadData (base, fullpathc);
  delete []fullpathc;

  if (!loaded) {
    //    FatalError (1, "Couldn't load text information for \"%s\"", fullpathc);
    //exit(0);
    /** We must return something to the client, whether this error is fatal or
     *  no, otherwise we risk sending their browser into an infinite loop!
     *  DLConsulting 12-07-2004
     */
    output_error("phindaction: Fatal Error! Couldn't load text information for collection", textout, outconvert, disp, logout, XMLmode);
    return true;
  }

  // From here on the phrase data is loaded, so every error return unloads it
  // first, mirroring the success path at the end of the function.

  /** Another previously silent method can now cry out.
   *  DLConsulting 12-07-2004
   */
  if(!get_phrase_all_data(textdata, phrase, word, tf, ef, lf, df, el,
			  linkdest, linktype, docNums, docfreq)) {
    textdata.UnloadData ();
    output_error( "phindaction: Fatal Error! Couldn't parse phrase in get_phrase_all_data()", textout, outconvert, disp, logout, XMLmode);
    return true;
  }

  // Output the header
  if (XMLmode) {
    textout << "\n";
  } else {
    textout << "" << word << "\n" << "
\n" << "

" << word << "

\n" << "

"<< word << " occurs " << tf << " times in " << df << " documents\n";
  }

  // Output the thesaurus links
  if ((lf > 0) && (first_l < last_l)) {

    // figure out the number of phrases to output
    if (last_l > lf) {
      last_l = lf;
    }
    // Clamping last_l can leave it at or below first_l; keep the unsigned
    // subtraction below from wrapping around.
    if (first_l > last_l) {
      first_l = last_l;
    }
    count_l = last_l - first_l;

    if (XMLmode) {
      textout << "\n";
      /** DLConsulting 12-07-2004 */
      if(!print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
				first_l, last_l, disp, outconvert, textout)) {
	textdata.UnloadData ();
	output_error( "phindaction: Fatal Error! Couldn't get phrase in get_phrase_freq_data()", textout, outconvert, disp, logout, XMLmode);
	return true;
      }
      textout << "\n";
    }

    // output links as HTML
    else {
      if (count_l == lf) {
	textout << "

" << count_l << " thesaurus links\n";
      } else {
	textout << "

" << count_l << " of " << lf << " thesaurus links\n";
      }

      textout << "

\n";
      /** DLConsulting 12-07-2004 */
      if(!print_thesaurus_links(args["c"], XMLmode, textdata, linkdest, linktype,
				first_l, last_l, disp, outconvert, textout)) {
	textdata.UnloadData ();
	output_error( "phindaction: Fatal Error! Couldn't get phrase in get_phrase_freq_data()", textout, outconvert, disp, logout, XMLmode);
	return true;
      }
      textout << "
typetopicfreqdocs
\n";

      if (last_l < lf) {
	if ((last_l + 10) < lf) {
	  textout << outconvert << disp << "
Get more thesaurus links\n";
	}
	textout << outconvert << disp << "
Get every thesaurus link\n" ;
      }
    }
  }

  // Output the expansions
  if ((ef > 0) && (first_e < last_e)) {

    // figure out the number of phrases to output
    if (last_e > el.size()) {
      last_e = el.size();
    }
    // see the thesaurus section: avoid unsigned wrap-around
    if (first_e > last_e) {
      first_e = last_e;
    }
    count_e = last_e - first_e;

    // output expansions as XML
    if (XMLmode) {
      textout << "" << endl;
      print_expansions(args["c"], XMLmode, word, textdata, el, first_e, last_e, disp, outconvert, textout);
      textout << "\n";
    }

    // output expansions as HTML
    else {
      if (count_e == el.size()) {
	textout << "

" << count_e << " expansions\n";
      } else {
	textout << "

" << count_e << " of " << ef << " expansions\n";
      }

      textout << "

\n";
      print_expansions(args["c"], XMLmode, word, textdata, el, first_e, last_e, disp, outconvert, textout);
      textout << "
phrasefreqdocs
\n";

      if (last_e < ef) {
	if ((last_e + 10) < ef) {
	  textout << outconvert << disp << "
Get more expansions\n";
	}
	textout << outconvert << disp << "
Get every expansion\n";
      }
    }
  }

  // Output the document occurances
  if ((df > 0) && (first_d < last_d)) {

    // figure out the phrases to output
    if (last_d > docNums.size()) {
      last_d = docNums.size();
    }
    // see the thesaurus section: avoid unsigned wrap-around
    if (first_d > last_d) {
      first_d = last_d;
    }
    count_d = last_d - first_d;

    // output document list as XML
    if (XMLmode) {
      textout << "\n";
      if(!print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
			  first_d, last_d, disp, outconvert, textout)) {
	textdata.UnloadData ();
	output_error( "phindaction: Fatal Error! Couldn't load text information in print_documents() or get_document_all_data()", textout, outconvert, disp, logout, XMLmode);
	return true;
      }
      textout << "\n";
    }

    // output document list as HTML
    else {
      if (count_d == docNums.size()) {
	textout << "

" << count_d << " documents\n";
      } else {
	textout << "

" << count_d << " of " << df << " documents\n";
      }

      textout << "

\n";
      if(!print_documents(XMLmode, basepath, args["c"], docNums, docfreq,
			  first_d, last_d, disp, outconvert, textout)) {
	textdata.UnloadData ();
	output_error( "phindaction: Fatal Error! Couldn't load text information in print_documents()", textout, outconvert, disp, logout, XMLmode);
	return true;
      }
      textout << "
documentfreq
\n";

      if (last_d < df) {
	if ((last_d + 10) < df) {
	  textout << outconvert << disp << "
Get more documents\n";
	}
	textout << outconvert << disp << "
Get every document\n";
      }
    }
  }

  // Close the document
  if (XMLmode) {
    textout << "\n";
  } else {
    textout << "

\n";
  }

  textdata.UnloadData ();

  return true;
}

// Find the phrase number of a word in the index file
//
// Loads the "pword" index under basepath, runs query against it (asking for
// at most 5 documents, sorted by rank) and copies the matching document
// numbers into result.
//
// Returns false if the index could not be loaded, true otherwise; result may
// be empty when true is returned.
bool phindaction::find_phrase_number_from_word(const text_t &basepath,
					       const text_t &query,
					       DocNumArray &result) {

  // Open the index file for searching
  IndexData indexData;

  text_t fullpath = filename_cat(basepath, "pword");
  char *fullpathc = fullpath.getcstr();
#if defined __WIN32__
  char *base = "";
#else
  char *base = "/";
#endif

  // Release the path buffer on both outcomes; it used to leak when the
  // index failed to load.
  const bool loaded = indexData.LoadData (base, fullpathc);
  delete []fullpathc;

  if (!loaded) {
    //    FatalError (1, "Couldn't load index information for \"%s\"", fullpathc);
    //exit(0);
    /** Don't handle fatal errors here anymore.
     *  DLConsulting 12-07-2004
     */
    return false; // Indicates something very bad has happened
  }

  // set up the query object
  QueryInfo queryInfo;
  SetCStr (queryInfo.docLevel, "Document", 8);
  queryInfo.maxDocs = 5;
  queryInfo.sortByRank = true;
  queryInfo.exactWeights = false;
  queryInfo.needRankInfo = true;
  queryInfo.needTermFreqs = true;

  // mode 1 = casefolded, unstemmed search
  UCArray ucquery;
  // greenstone gives us the query encoded in unicode. We want utf8.
  char* utf8querystring=to_utf8(query).getcstr();
  SetCStr(ucquery, utf8querystring);
  delete []utf8querystring;

  //toUCArray(query, ucquery);
  QueryNode *queryTree = ParseQuery(ucquery, 1, 1, 4);

  // perform the query
  ExtQueryResult queryResult;
  MGQuery (indexData, queryInfo, queryTree, queryResult);
  // cout << "-- word lookup result -- " << endl << queryResult << endl ;

  result.clear();
  result = queryResult.docs;

  // delete the query
  if (queryTree != NULL) delete queryTree;

  indexData.UnloadData();

  /** This method now returns a boolean, so...
   *  DLConsulting 12-07-2004
   */
  return true; // Indicates that what happened is all good, baby.
}

// Get all the data about a phrase
//
// The phrase is stored in textData as record phrase.
// We retrieve: // word - the text of the phrase // tf - the total frequency of the phrase // ef - the expansion frequency of the phrase // lf - the thesaurus link frequency of the phrase // df - the document frequency of the phrase // el - the list of phrases that are expansions of phrase // ll - the list of phrases that are thesaurus links // dl - the list of documents that contain phrase bool phindaction::get_phrase_all_data(TextData &textdata, unsigned long phrase, text_t &word, unsigned long &tf, unsigned long &ef, unsigned long &lf, unsigned long &df, vector &el, vector &linkdest, vector &linktype, vector &docnum, vector &docfrq) { UCArray text; UCArray docLevel; SetCStr(docLevel, "Document", 8); // Look the word up in the textData if (!GetDocText (textdata, docLevel, phrase, text)) { // FatalError (1, "Error while trying to get phrase %u", phrase); //exit(0); return false; // Something very bad has happened. } // Ignore everything up to the first colon UCArray::iterator next = text.begin(); while (*next++ != ':'); // ignore training carriage returns while (text.back() == '\n') { text.pop_back(); } // Get the word word.clear(); for (; *next != ':'; ++next) { word.push_back(*next); } // Get total frequency tf = 0; for (++next; *next != ':'; ++next) { tf *= 10; tf += (*next - '0'); } // Get expansion frequency ef = 0; for (++next; *next != ':'; ++next) { ef *= 10; ef += (*next - '0'); } // Get document frequency df = 0; for (++next; *next != ':'; ++next) { df *= 10; df += (*next - '0'); } // Get expansion list el.clear(); unsigned long e = 0; for (++next; *next != ':'; ++next) { if (*next == ',') { el.push_back(e); e = 0; } else { e *= 10; e += (*next - '0'); } } // Get document list & the document frequency list docnum.clear(); docfrq.clear(); bool readnum = false; unsigned long d = 0; for (++next; *next != ':'; ++next) { if (*next == ',') { docnum.push_back(d); readnum = true; d = 0; } else if (*next == ';') { if (readnum) { docfrq.push_back(d); } else { 
docnum.push_back(d); docfrq.push_back(1); } readnum = false; d = 0; } else { d *= 10; d += (*next - '0'); } } // Get thesaurus link frequency & link list text.push_back(':'); text.push_back(':'); // link frequency lf = 0; for (++next; *next != ':'; ++next) { lf *= 10; lf += (*next - '0'); } // two lists of link data linkdest.clear(); linktype.clear(); UCArray thistype; thistype.clear(); bool typedone = false; unsigned long l = 0; for (++next; *next != ':'; ++next) { if (!typedone) { // first read the link type, a charactor string if (*next == ',') { typedone = true; } else { thistype.push_back(*next); } } else { // having read the link type, read the list of link destinations if (*next == ',') { linkdest.push_back(l); linktype.push_back(thistype); l = 0; } else if (*next == ';') { linkdest.push_back(l); linktype.push_back(thistype); l = 0; thistype.clear(); typedone = false; } else { l *= 10; l += (*next - '0'); } } } return true; // Indicates that what happened is all good, baby. } bool phindaction::print_thesaurus_links(const text_t &collection, bool XMLmode, TextData &textdata, vector &linkdest, vector &linktype, unsigned long first, unsigned long last, displayclass &disp, outconvertclass &outconvert, ostream &textout) { // information describing each link in the list unsigned long phrase, tf, ef, df; UCArray type, text; for (unsigned long l = first; l < last; ++l) { // get the phrase data phrase = linkdest[l]; type = linktype[l]; /** DLConsulting 12-07-2004 */ if(!get_phrase_freq_data(textdata, phrase, text, tf, ef, df)) { return false; } if (XMLmode) { textout << "\n"; } else { textout << "" << type << ""; textout << outconvert << disp << "" << text << "" << "" << tf << "" << df << "\n"; } } /** DLConsulting 12-07-2004 */ return true; } // Get the frequency data about a phrase // // The phrase is stored in textData as record phrase. 
// We retrieve: // word - the text of the phrase // tf - the total frequency of the phrase // ef - the expansion frequency of the phrase // df - the document frequency of the phrase /** * Returns: * false if the method suffered a fatal error, true otherwise */ bool phindaction::get_phrase_freq_data(TextData &textdata, unsigned long phrase, UCArray &word, unsigned long &tf, unsigned long &ef, unsigned long &df) { UCArray text; UCArray docLevel; SetCStr(docLevel, "Document", 8); // Look the word up in the textData if (!GetDocText (textdata, docLevel, phrase, text)) { // FatalError (1, "Error while trying to get phrase %u", phrase); //exit(0); /** DLConsulting 12-07-2004 */ return false; } // Ignore everything up to the first colon UCArray::iterator next = text.begin(); while (*next++ != ':'); // Get the word word.clear(); for (; *next != ':'; ++next) { word.push_back(*next); } // Get total frequency tf = 0; for (++next; *next != ':'; ++next) { tf *= 10; tf += (*next - '0'); } // Get expansion frequency ef = 0; for (++next; *next != ':'; ++next) { ef *= 10; ef += (*next - '0'); } // Get document frequency df = 0; for (++next; *next != ':'; ++next) { df *= 10; df += (*next - '0'); } /** DLConsulting 12-07-2004 */ return true; } // Print a list of expansions // // Given the textData and a list of phrase numbers, print out each of the // expansions. 
void phindaction::print_expansions(const text_t &collection, bool XMLmode, const text_t &body, TextData &textdata, const vector &elist, unsigned long first, unsigned long last, displayclass &disp, outconvertclass &outconvert, ostream &textout) { UCArray word; unsigned long phrase, tf, df, ef; UCArray suffix, prefix, ucbody; toUCArray(body, ucbody); for (unsigned long e = first; e < last; ++e) { phrase = elist[e]; get_phrase_freq_data(textdata, phrase, word, tf, ef, df); split_phrase(word, ucbody, prefix, suffix); if (XMLmode) { // body is always the same as the text of the phrase, so no need to send it textout << "\n"; } else { textout << outconvert << disp << ""; textout << prefix << ""; textout <" << body << "" << ""; textout << suffix << "" << "" << tf << "" << df << "\n"; } } } // split an expansion into prefix and suffix void phindaction::split_phrase(const UCArray &word, const UCArray &body, UCArray &prefix, UCArray &suffix) { prefix.clear(); suffix.clear(); bool readingPrefix = true; UCArray::const_iterator here = word.begin(); UCArray::const_iterator end = word.end(); while (here != end) { // if we've not read all the prefix, add the next char to the prefix if (readingPrefix) { if (phrase_match(body, here, end)) { readingPrefix = false; // trim whitespace from end of prefix & start of suffix if (!prefix.empty()) { prefix.pop_back(); } if ((here != end) && (*here == ' ')) { ++here; } } else { prefix.push_back(*here); ++here; } } // if we've finished with the prefix, update the suffix else { suffix.push_back(*here); ++here; } } } // phrase_match // // compare two strings, one represented as an UCArray, the other as two // UCArray iterators. // // Return true if the UCArray is the same as the phrase the iterators point // to for the length of the UCArray. 
bool phindaction::phrase_match(const UCArray &text, UCArray::const_iterator &here, UCArray::const_iterator end) { UCArray::const_iterator one_here = text.begin(); UCArray::const_iterator one_end = text.end(); UCArray::const_iterator two_here = here; // iterate over the length of the first string, comparing each element to // the corresponding element in the second string. while (one_here != one_end) { if (two_here == end) { return false; } else if (*one_here != *two_here) { return false; } ++one_here; ++two_here; } here = two_here; return true; } bool phindaction::print_documents(bool XMLmode, const text_t &basepath, const text_t &collection, const vector &docNums, const vector &docFreq, unsigned long first, unsigned long last, displayclass &disp, outconvertclass &outconvert, ostream &textout) { // Create a TextData object to read the document data TextData docdata; text_t fullpath = filename_cat(basepath, "docs"); char *fullpathc = fullpath.getcstr(); #if defined __WIN32__ char *base = ""; #else char *base = "/"; #endif if (!docdata.LoadData (base, fullpathc)) { // FatalError (1, "Couldn't load text information for \"%s\"", fullpathc); //exit(0); /** DLConsulting 12-07-2004 */ return false; } delete []fullpathc; UCArray title, hash; unsigned long freq, doc; for (unsigned long d = first; d < last; ++d) { doc = docNums[d]; freq = docFreq[d]; /** DLConsulting 13-07-2004 */ if(!get_document_all_data(docdata, doc, title, hash)) { return false; } if (XMLmode) { textout << "\n"; } else { textout << outconvert << disp << "" << title << "" << "" << freq << "\n"; } } docdata.UnloadData(); /** DLConsulting 12-07-2004 */ return true; } // Get all the data about a docment // // The document's details are stored in docData as record docNum. 
// We retrieve: // title - the document's title // hash - the document's unique OID /** Returns: * false if a fatal error occured, true otherwise * DLConsulting 12-07-2004 */ bool phindaction::get_document_all_data(TextData &docdata, unsigned long docNum, UCArray &title, UCArray &hash) { UCArray text; UCArray docLevel; SetCStr(docLevel, "Document", 8); // Look the word up in the textData if (!GetDocText (docdata, docLevel, docNum, text)) { // FatalError (1, "Error while trying to get document %u", docNum); //exit(0); /** DLConsulting 13-07-2004 */ return false; } // Ignore everything up to the first colon UCArray::iterator next = text.begin(); while (*next++ != '\t'); // Get the document OID (hash) hash.clear(); for (; *next != '\t'; ++next) { hash.push_back(*next); } // Get the title text.push_back('\n'); title.clear(); for (++next; *next != '\n'; ++next) { title.push_back(*next); } /** DLConsulting 13-07-2004 */ return true; } void phindaction::toUCArray(const text_t &in, UCArray &out) { out.clear(); if (out.capacity() < in.size() + 1) { out.reserve(in.size() + 1); } text_t::const_iterator here = in.begin(); text_t::const_iterator end = in.end(); while (here != end) { out.push_back((unsigned char) *here); ++here; } } void phindaction::output_error (const text_t &message, ostream &textout, outconvertclass &outconvert, displayclass & disp, ostream &logout, bool XMLmode) { logout << outconvert << message << "\n"; if (XMLmode) { textout << outconvert << "\n" << "" << message << "\n" << "\n"; } else { textout << outconvert << disp << "_header_\n" << message << "_footer_\n"; } } #endif //GSDL_USE_PHIND_ACTION