Changeset 6584


Ignore:
Timestamp:
2004-01-22T14:17:30+13:00 (20 years ago)
Author:
kjdon
Message:

Fiddled around with segmenting for Chinese text. Haven't changed how the
segmentation is done, or what character ranges are used.
But when it's done is now controlled by collect.cfg. There is a new
option, separate_cjk, with values true or false (default false). Segmentation
is only done if this is set to true. This is passed as a global option to
all plugins by the import.pl script, so the user just needs to add it
once to the config file, not as an option to all plugins.
The queryaction uses this option too, to determine whether or not to segment
the query.

Location:
trunk/gsdl
Files:
14 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/import.pl

    r6407 r6584  
    348348    # options must be known before we read the collect.cfg))
    349349    my $plugins = [];
     350    my @global_opts = ();
     351   
    350352    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    351353    if (-e $configfilename) {
     
    414416        $gli = 1;
    415417    }
     418
     419    # global plugin stuff
     420    if (defined $collectcfg->{'separate_cjk'}&& $collectcfg->{'separate_cjk'} =~ /^true$/i) {
     421        push @global_opts, "-separate_cjk";
     422    }
     423   
    416424
    417425    } else {
     
    433441
    434442    # load all the plugins
    435     $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog);
     443    $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts);
    436444    if (scalar(@$pluginfo) == 0) {
    437445    print $out &lookup_string("{import.no_plugins_loaded}") . "\n";
  • trunk/gsdl/perllib/colcfg.pm

    r4818 r6584  
    4444# {'buildtype'}->string
    4545# {'maxnumeric'}->string
     46# {'separate_cjk'}->string
    4647
    4748# {'maintainer'}->array of strings
     
    7677                    q/verbosity|allclassifications|OIDtype|maxdocs|/ .
    7778                    q/groupsize|sortmeta|debug|mode|create_images|/ .
    78                     q/maxnumeric)$/,
     79                    q/maxnumeric|separate_cjk)$/,
    7980                    q/(maintainer|languages|indexsubcollections|/ .
    8081                       q/indexes|dontbuild|dontgdbm|mirror|phind|levels|searchtype)$/,
     
    9192                 q/archivedir|cachedir|builddir|removeold|/ .
    9293                 q/textcompress|buildtype|collectdir|no_text|/ .
    93                  q/allclassifications|maxnumeric)$/,
     94                 q/allclassifications|maxnumeric|separate_cjk)$/,
    9495                 q/^(maintainer|languages|indexsubcollections|/ .
    9596                 q/indexes|dontbuild|dontgdbm|levels|searchtype)$/,
  • trunk/gsdl/perllib/mgbuilder.pm

    r6545 r6584  
    151151   
    152152    # load all the plugins
    153     $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, $failhandle);
     153
     154    #build up the extra global options for the plugins
     155    my @global_opts = ();
     156    if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
     157    push @global_opts, "-separate_cjk";
     158    }
     159    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, $failhandle, \@global_opts);
    154160    if (scalar(@{$self->{'pluginfo'}}) == 0) {
    155161    print $outhandle "No plugins were loaded.\n";
  • trunk/gsdl/perllib/mgppbuilder.pm

    r6545 r6584  
    211211    print $outhandle "doclevel = ". $self->{'doc_level'}."\n";
    212212    # get the list of plugins for this collection
     213
     214    #build up the extra global options for the plugins
     215    my @global_opts = ();
     216    if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
     217    push @global_opts, "-separate_cjk";
     218    }
     219
    213220    my $plugins = [];
    214221    if (defined $self->{'collect_cfg'}->{'plugin'}) {
     
    217224   
    218225    # load all the plugins
    219     $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
     226    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, \@global_opts);
    220227    if (scalar(@{$self->{'pluginfo'}}) == 0) {
    221228    print $outhandle "No plugins were loaded.\n";
  • trunk/gsdl/perllib/plugin.pm

    r6407 r6584  
    4141}
    4242
    43 
     43#globaloptions contains any options that should be passed to all plugins
    4444sub load_plugins {
    4545    my ($plugin_list) = shift @_;
    46     ($verbosity, $outhandle, $failhandle) = @_; # globals
     46    ($verbosity, $outhandle, $failhandle, $globaloptions) = @_; # globals
    4747    my @plugin_objects = ();
    4848
     
    5050    $outhandle = STDERR unless defined $outhandle;
    5151    $failhandle = STDERR unless defined $failhandle;
     52
     53    map { $_ = "\"$_\""; } @$globaloptions;
     54    my $globals = join (",", @$globaloptions);
    5255
    5356    foreach $pluginoptions (@$plugin_list) {
     
    7073    map { $_ = "\"$_\""; } @$pluginoptions;
    7174    my $options = join (",", @$pluginoptions);
     75    if ($globals) {
     76        if (@$pluginoptions) {
     77        $options .= ",";
     78        }
     79        $options .= "$globals";
     80    }
    7281    $options =~ s/\$/\\\$/g;
     82   
    7383    eval ("\$plugobj = new \$pluginname($options)");
    7484    die "$@" if $@;
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r6408 r6584  
    405405             qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
    406406             q^cover_image^, \$self->{'cover_image'},
     407             q^separate_cjk^, \$self->{'separate_cjk'},
    407408             "allow_extra_options")) {
    408409
     
    615616    $reader->read_file ($textref);
    616617
    617     if ($language eq "zh") {
     618    #Now segments chinese if the separate_cjk option is set
     619    if ($self->{'separate_cjk'}) {
    618620        # segment the Chinese words
    619621        $$textref = &cnseg::segment($$textref);
  • trunk/gsdl/src/colservr/collectserver.cpp

    r5868 r6584  
    101101      collectinfo.searchTypes = cfgline;
    102102    }
    103 
     103    else if (key == "separate_cjk") {
     104      if (value == "true") collectinfo.isSegmented = true;
     105      else collectinfo.isSegmented = false;
     106    }
    104107    // What have we set in our collect.cfg file :  document or collection ?
    105108    else if (key == "authenticate") collectinfo.authenticate = value;
  • trunk/gsdl/src/recpt/comtypes.cpp

    r5024 r6584  
    5353  isPublic=true;
    5454  isBeta=false;
     55  isSegmented=false;
    5556  languages.erase(languages.begin(), languages.end());
    5657  ccsCols.erase(ccsCols.begin(), ccsCols.end());
  • trunk/gsdl/src/recpt/comtypes.h

    r5024 r6584  
    9090   bool isPublic;
    9191   bool isBeta;
     92  bool isSegmented;
    9293   unsigned long buildDate;
    9394   text_tarray ccsCols;    // empty if collection does not use cross-collection searching
  • trunk/gsdl/src/recpt/documentaction.cpp

    r5917 r6584  
    10081008 
    10091009  if (!args["q"].empty() && args.getintarg("hl")) {
     1010
     1011    ColInfoResponse_t *cinfo = recpt->get_collectinfo_ptr (collectproto, collection, logout);
     1012    bool segment = false;
     1013    if (cinfo != NULL) {
     1014      segment = cinfo->isSegmented;
     1015    }
    10101016    FilterRequest_t request;
    10111017    comerror_t err;
    10121018    request.filterResultOptions = FRmatchTerms;
    10131019    text_t formattedstring = args["q"];
    1014     format_querystring (formattedstring, args.getintarg("b"));
     1020    format_querystring (formattedstring, args.getintarg("b"), segment);
    10151021    set_queryfilter_options (request, formattedstring, args);
    10161022    collectproto->filter (args["c"], request, queryresponse, err, logout);
  • trunk/gsdl/src/recpt/queryaction.cpp

    r5762 r6584  
    632632
    633633// sets the selection box macros _hselection_, _jselection_, _nselection_ _gselection_, fqfselection_
    634 void queryaction::set_option_macro (const text_t &macroname, text_t current_value, bool display_single, bool add_js_update,
    635                     const FilterOption_t &option, displayclass &disp) {
     634void queryaction::set_option_macro (const text_t &macroname,
     635                    text_t current_value,
     636                    bool display_single,
     637                    bool add_js_update,
     638                    const FilterOption_t &option,
     639                    displayclass &disp) {
    636640 
    637641  if (option.validValues.empty()) return;
     
    10821086  isapprox isApprox = Exact;
    10831087
     1088  // what to do about segmentation for multiple colls??
     1089  bool segment = false;
    10841090  text_t formattedstring = "";
    1085   get_formatted_query_string(formattedstring, args, disp, logout);
     1091  get_formatted_query_string(formattedstring, segment, args, disp, logout);
    10861092
    10871093  if (formattedstring.empty()) {
     
    12881294  }
    12891295   
     1296  bool segment = cinfo->isSegmented;
    12901297  browserclass *bptr = browsers->getbrowser (browsertype);
    12911298
     
    13071314  request.filterResultOptions = FROID | FRmetadata | FRtermFreq;
    13081315  text_t formattedstring = "";
    1309   get_formatted_query_string(formattedstring, args, disp, logout);
     1316  get_formatted_query_string(formattedstring, segment, args, disp, logout);
    13101317
    13111318
     
    13541361// also adds dates if appropriate in text search
    13551362void queryaction::get_formatted_query_string (text_t &formattedstring,
     1363                          bool segment,
    13561364                          cgiargsclass &args,
    13571365                          displayclass &disp,
     
    13601368    formattedstring = args["q"];
    13611369    // remove & | ! for simple search, insert spaces for chinese
    1362     format_querystring (formattedstring, args.getintarg("b"));
     1370    format_querystring (formattedstring, args.getintarg("b"), segment);
    13631371    if (args["ct"]=="1") { // mgpp - we need to add in the field info
    13641372      format_field_info(formattedstring, args["fqf"]);
  • trunk/gsdl/src/recpt/queryaction.h

    r4937 r6584  
    7373                int numDocs, isapprox isApprox);
    7474
    75   void get_formatted_query_string (text_t &formattedstring, cgiargsclass &args,
     75  void get_formatted_query_string (text_t &formattedstring, bool segment,
     76                   cgiargsclass &args,
    7677                   displayclass &disp, ostream &logout);
    7778  void define_query_interface(displayclass &disp, cgiargsclass &args,
  • trunk/gsdl/src/recpt/querytools.cpp

    r4757 r6584  
    154154}
    155155
    156 void format_querystring (text_t &querystring, int querymode) {
     156void format_querystring (text_t &querystring, int querymode, bool segment) {
    157157  text_t formattedstring;
    158158
     159  if (querymode == 1 && !segment) return;
     160 
    159161  text_t::const_iterator here = querystring.begin();
    160162  text_t::const_iterator end = querystring.end();
     
    171173                 *here == '!' || *here == '&')) {
    172174      formattedstring.push_back(' ');
    173     } else {
     175    } else if (segment) {
    174176      if ((*here >= 0x4e00 && *here <= 0x9fa5) ||
    175177      (*here >= 0xf900 && *here <= 0xfa2d)) {
     
    184186    space = false;
    185187      }
     188   
     189    } else {
     190      formattedstring.push_back (*here);
    186191    }
    187192    here ++;
  • trunk/gsdl/src/recpt/querytools.h

    r4757 r6584  
    4040void set_more_queryfilter_options (FilterRequest_t &request, cgiargsclass &args);
    4141
    42 void format_querystring (text_t &querystring, int querymode);
     42void format_querystring (text_t &querystring, int querymode, bool segment);
    4343
    4444void add_dates(text_t &querystring, int startdate, int enddate,
Note: See TracChangeset for help on using the changeset viewer.