########################################################################### # # help.mg.src -- Source for the help command # Copyright (C) 1994 Neil Sharman # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # # @(#)help.mg.src 1.8 21 Mar 1994 # ########################################################################### # # The help file for mgquery. # Lines starting with '#' are treated as comments and are discarded when # the 'help.mg.h' file is produced # ########################################################################### HELP for mgquery ================================ This text is a summary of the information in the "mgquery" manual pages. The input to 'mgquery' consists of a series of input lines. The backslash character ("\") is used at the end of lines to indicate that input continues on the next line. Input lines on which the first character is a dot (".") are commands to the mgquery program. Input lines that do not start with a dot are queries. A query consists of two parts. One part is a boolean or ranked query that identifies documents. The second part is a post-processing pattern matching operation. Any text between the first speech mark (") and the last speech mark is considered to be a post-processing pattern. The following commands are available :- .help - displays this text. .quit - quits the program. 
.set name value - sets parameter "name" to "value". If the parameter is a boolean parameter and value is omitted the parameter will be inverted (i.e. if it is true it will change to false, if it is false it will change to true). .unset name - deletes parameter "name". .reset - sets all the parameters to their initial state. .display - displays the values of all the current parameters. .push - pushes the current parameters on to a stack. .pop - destroys the current parameters and pops a new set of parameters off the stack. .output arg - This is used to specify where to send the documents. Arg may be one of the following: > filename : Send output to the specified file. >> filename : Append output to the specified file. | command : The output is piped into command, which is executed by sh. .input arg - This is used to specify where input comes from. Arg may be one of the following: < filename : Get the input from the specified file. | command : The input comes from the standard output of command, which is executed by sh. On startup the mgquery program reads from the file .mgrc a sequence of commands (NOTE: The .mgrc file may not contain any queries). mgquery first looks for .mgrc in the current directory and then in the user's home directory. Lines starting with a '#' in the .mgrc file are considered to be comments and are ignored. The following parameters (used in the .set and .unset commands) are predefined and have special significance :- accumulator_method = `array' This parameter is used during ranking, and specifies how the weight for each document should be accumulated. The following methods are available `array', `splay_tree', `hash_table', and `list'. briefstats = `off' This is a boolean parameter that determines whether the totals for disk, memory and time usage statistics will be displayed at the end of each query. NOTE: this takes precedence over the parameters "diskstats", "memstats" and "timestats". 
This parameter may take the values `yes', `no', `true', `false', `on' or `off'. buffer = `1048576' When the documents are being read in they are read into a buffer of this size and then displayed from this buffer. If the documents are larger than this buffer the buffer is expanded automatically. Having a large buffer gives a very slight performance improvement because it allows the order of disk operations to be optimised. The buffer size is measured in bytes. diskstats = `off' This is a boolean parameter that determines whether the disk usage statistics for the preceding query will be displayed after each query. This parameter may take the values `yes', `no', `true', `false', `on' or `off'. doc_sepstr = `---------------------------------- %n\n' This specifies the string that will be used to separate documents when they are displayed for `boolean' or `docnums' queries. The standard C escape character sequences (see the man page) may be used to place special characters in the string. For example, a newline would be `\n'. To include a `%' use the sequence `%%'. To include the MG document number use the sequence `%n'. expert = `false' If this is true then a lot of the waffle that the program spits out is suppressed. This parameter may take the values `yes', `no', `true', `false', `on' or `off'. hash_tbl_size = `1000' One of the options during ranking queries is to use a hash table to accumulate the weights for each document. The hash table is a simple chained type. This parameter specifies the size of the hash table and may take any value between 8 and 268435456. heads_length = `50' When the mode is `heads' this specifies the number of characters that will be output for each document. maxdocs = `all' The maximum number of documents to display in response to a query. This parameter may take on a numeric value between 1 and 429467295 or the word `all'. maxparas = `1000' The maximum number of paragraphs to identify during a ranked query with paragraph indexing. 
After the paragraphs have been identified the paragraphs are converted into documents, and because some of the paragraphs may refer to the same documents the final number of answers may be less than maxparas. The maxdocs parameter will then be applied. This parameter may take on a numeric value between 1 and 429467295. max_accumulators = `50000' This parameter limits the number of different paragraph/document numbers to be accumulated during ranked queries when the parameter `accumulator_method' is set to `splay_tree', `hash_table', or `list'. This parameter may take any value between 8 and 268435456. max_terms = `all' This parameter limits the number of terms that will actually be used during a ranked query. If more terms than the number specified by max_terms are entered, then the extra terms will be discarded. If `sorted_terms' is on then the limiting will be done after the terms have been sorted. This parameter may take any value between 1 and 429467295 or the word `all'. memstats = `off' This is a boolean parameter that determines whether the memory usage statistics for the preceding query will be displayed after each query. This parameter may take the values `yes', `no', `true', `false', `on' or `off'. mgdir = `.' This specifies the directory where the MG files may be found. If the environment variable `MGDATA' is set then `mgdir' is initialised to the value in `MGDATA'. mgname = `' This specifies the name of the MG database to process. mode = `text' This specifies how documents should be displayed when they are retrieved. It may take five different values `text', `docnums', `silent', `heads' or `count'. `text' displays the contents of the document. `docnums' displays only the document numbers. `Silent' retrieves all the documents but displays nothing except how many documents were retrieved. This mode is intended to be used in timing experiments. `Heads' is used to print out the head of each document. 
`Count' does the minimum amount of work required to determine how many documents would be retrieved, but does not retrieve them. pager = `more' This is the name of the program that will be used to display the help and the retrieved documents. If the environment variable "PAGER" is defined then `pager' takes on that value. para_sepstr = `\n######## PARAGRAPH %n ########\n' This specifies the string that will be used to separate paragraphs. The standard C escape character sequences (see the man page) may be used to place special characters in the string. For example, a newline would be `\n'. To include a `%' use the sequence `%%'. To include the paragraph number within the document use the sequence `%n'. para_start = `***** Weight = %w *****\n' This specifies the string that will be used at the head of paragraphs for a paragraph level index following a ranked query. The standard C escape character sequences (see the man page) may be used to place special characters in the string. For example, a newline would be `\n'. To include a `%' use the sequence `%%'. To include the paragraph weight use the sequence `%w'. qfreq = `true' This determines whether the ranked queries will take into account the number of times each query term is specified. When this is `true' the number of times a term appears in the query is used in the ranking. When this is `false' all query terms are assumed to occur only once. This parameter may take the values `yes', `no', `true', `false', `on' or `off'. query = `boolean' This specifies the type of queries that are to be specified. It can take four different values `boolean', `ranked', `docnums' or `approx-ranked'. `boolean' is for boolean queries. The yacc grammar for boolean queries is as follows :- query : or; or : or '|' and | and ; and : and '&' not | and not | not ; not : term | '!' not ; term : TERM | '(' or ')' ; `ranked' and `approx-ranked' are for queries ranked by the cosine measure. 
`approx-ranked' uses only the low precision document lengths, and therefore only produces an approximation to full cosine ranking. query : TERM | query TERM ; `docnums' allows the entry of document numbers. Multiple numbers separated by spaces may be specified or ranges separated by hyphens. query : range | query range ; range : num | num '-' num ; ranked_doc_sepstr = `---------------------------------- %n %w\n' This specifies the string that will be used to separate documents when they are displayed for `ranked' or `approx-ranked' queries. The standard C escape character sequences (see the man page) may be used to place special characters in the string. For example, a newline would be `\n'. To include a `%' use the sequence `%%'. To include the MG document number use the sequence `%n'. To include the document weight use the sequence `%w'. sizestats = `false' If this is true then various numbers are output at the end of each query indicating what went on during the query. This parameter may take the values `yes', `no', `true', `false', `on' or `off'. skip_dump = `skips.%d' If this parameter is set then during ranked queries on skipped inverted files when `accumulator_method' is set to `splay_tree', `hash_table', or `list' a file will be produced in the current directory. The name of the file is the value of this parameter, a `%d' in the file name will be replaced with the process id of mgquery. This file will contain information about the usage of skips during the query processing. This option is expensive; use `.unset skip_dump' to obtain optimal performance. sorted_terms = `on' This specifies whether or not the terms should be sorted into decreasing occurrence in documents so that the least often occurring terms are processed first when ranked queries are being done. When this is true the terms are sorted. When this is false the terms are not sorted and are instead processed in order of occurrence. 
This parameter may take the values `yes', `no', `true', `false', `on' or `off'. stop_at_max_accum = `on' This specifies what should happen when the maximum number of accumulators set by `max_accumulators' is reached. When this is true the processing of terms is stopped at the completion of the current term. When this is false processing continues but no new accumulators are created. This parameter may take the values `yes', `no', `true', `false', `on' or `off'. terminator = `' This specifies the string that will be output after the last document from the previous query has been output. The standard C escape character sequences (see the man page) may be used to place special characters in the string. For example, a newline would be `\n'. To include a `%' use the sequence `%%'. timestats = `false' If this is true then the time to process a query is displayed in both real time and CPU time. This parameter may take the values `yes', `no', `true', `false', `on' or `off'. verbatim = `off' This is a boolean parameter that determines whether the program should attempt to do a regular expression match on the retrieved text. If verbatim is `on' and a post-processing string is specified with the query then the post-processing string will be searched for in the documents just before they are displayed. If the string is found the document will be displayed, if not the document will not be displayed. If verbatim is `off' the post-processing string will be considered a regular expression like in `vi' or `egrep'. E.G. If verbatim is `on', "and.*the" will look for the 8 character sequence "and.*the". If verbatim is `off', "and.*the" will look for the sequence "and" followed somewhere later in the document by the sequence "the". This parameter may take the values `yes', `no', `true', `false', `on' or `off'.