Changeset 7460
- Timestamp:
- 2004-05-27T13:50:24+12:00 (20 years ago)
- Location:
- trunk
- Files:
-
- 9 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl3/packages/mg/java/org/greenstone/mg/MGPassesWrapper.java
r7452 r7460 20 20 21 21 22 /** java wrapper class for access to mg_passes in C22 /** java wrapper class for access to gs3_mg_passes in C 23 23 * 24 24 * the native side implemented in MGPassesWrapperImpl.c … … 32 32 } 33 33 34 private final char END_OF_DOCUMENT = (char) 2; 34 static final public char INVF_LEVEL_1 = '1'; 35 static final public char INVF_LEVEL_2 = '2'; 36 static final public char INVF_LEVEL_3 = '3'; 37 38 static final public int TEXT_PASS_1 = 0; 39 static final public int TEXT_PASS_2 = 1; 40 static final public int INDEX_PASS_1 = 2; 41 static final public int INDEX_PASS_2 = 3; 42 static final public int SPECIAL_PASS = 4; 43 44 static final public int NO_STEM_OR_CASE = 0; 45 static final public int CASE_ONLY = 1; 46 static final public int STEM_ONLY = 2; 47 static final public int STEM_AND_CASE = 3; 48 49 static final public String STEMMER_ENGLISH = "english"; 50 static final public String STEMMER_FRENCH = "french"; 51 static final public String STEMMER_LOVIN = "lovin"; 52 static final public String STEMMER_SIMPLE_FRENCH = "simple-french"; 53 54 static final private char END_OF_DOCUMENT = (char) 2; 35 55 36 56 public MGPassesWrapper() { … … 38 58 } 39 59 40 /** initialises field and method IDs for java side to enable access on C side */41 private static native void initIDs();42 43 /** initialises any C side stuff */44 private native boolean initCSide();45 46 60 /** initialise the pass through the documents */ 47 61 public native boolean init(); 48 62 49 63 /** add a pass declaration */ 50 public native void addPass(char pass_type, char pass_num); 51 64 public void addPass(int pass) { 65 switch (pass) { 66 case TEXT_PASS_1: 67 addPass('T','1'); 68 break; 69 case TEXT_PASS_2: 70 addPass('T','2'); 71 break; 72 case INDEX_PASS_1: 73 addPass('I','1'); 74 break; 75 case INDEX_PASS_2: 76 addPass('I','2'); 77 break; 78 case SPECIAL_PASS: 79 addPass('S','1'); 80 break; 81 } 82 } 52 83 /** set the base path */ 53 84 public native void setBasePath(String basepath); … … 59 90 public native void setInvfLevel(char level); 60 91 92 /** Specify the size of the document buffer in kilobytes. 93 If any document is larger than bufsize, the program 94 will abort with an error message. 95 */ 96 public native void setBufferSize(long bufsize); 97 98 /** Maximum amount of memory to use for the index pass-2 file 99 inversion in megabytes. 100 */ 101 public native void setInversionMemLimit(int limit); 61 102 103 /** If true, treat SGML tags as non-words when building the 104 inverted file. 105 */ 106 public native void ignoreSGMLTags(boolean ignore); 107 108 /** if mg_passes fails, the document that caused teh failure will be 109 output to teh trace file or STDERR. 110 */ 111 public native void dumpFailedDocument(boolean dump); 112 113 /** output statistics on the compression performance to a file 114 called *.compression.stats. frequency specifies the interval 115 (in kilobytes of source text) between outputting each line of 116 statistics. 117 */ 118 public native void outputCompStats(int frequency); 119 120 /** activate tracing, a line will be output every tracepos input bytes */ 121 public native void enableTracing(int tracepos); 62 122 /** process a Greenstone document, which may consist of many MG documents (seeparated by ^B */ 63 123 public boolean processDocument(String docs_text) { … … 73 133 return true; 74 134 } 75 /** process a MG document */76 public native boolean processMGDocument(byte[] text);77 135 78 136 /** finalise the pass through the documents */ 79 137 public native boolean finish(); 138 139 140 /** initialises field and method IDs for java side to enable access on C side */ 141 private static native void initIDs(); 142 143 /** initialises any C side stuff */ 144 private native boolean initCSide(); 145 146 private native void addPass(char pass_type, char pass_num); 147 148 /** process a MG document */ 149 private native boolean processMGDocument(byte[] text); 150 80 151 } -
trunk/gsdl3/packages/mg/jni/MGPassesWrapperImpl.c
r7452 r7460 38 38 } 39 39 40 40 /* add a pass type T1, T2, I1, I2, S */ 41 41 JNIEXPORT void JNICALL 42 42 Java_org_greenstone_mg_MGPassesWrapper_addPass(JNIEnv *j_env, … … 54 54 /* Set the filename */ 55 55 JNIEXPORT void JNICALL 56 Java_org_greenstone_mg_MGPassesWrapper_setFileName(JNIEnv *j_env, jobject j_obj, 57 jstring j_filename) 56 Java_org_greenstone_mg_MGPassesWrapper_setFileName(JNIEnv *j_env, 57 jobject j_obj, 58 jstring j_filename) 58 59 { 59 60 /* Get the filename as a C string */ … … 79 80 80 81 set_basepath(basepath); 81 82 82 83 83 /* Release the string */ … … 86 86 } 87 87 88 88 /* set the level for the inverted file */ 89 89 JNIEXPORT void JNICALL 90 90 Java_org_greenstone_mg_MGPassesWrapper_setInvfLevel(JNIEnv *j_env, … … 98 98 } 99 99 100 /* */100 /* set the stemmer and stem method */ 101 101 JNIEXPORT void JNICALL 102 102 Java_org_greenstone_mg_MGPassesWrapper_setStemOptions(JNIEnv *j_env, … … 111 111 assert(stemmer != NULL); 112 112 set_stem_options(stemmer, method); 113 114 /* Release the string */ 115 (*j_env)->ReleaseStringUTFChars(j_env, j_stemmer, stemmer); 113 116 } 114 117 118 /** Specify the size of the document buffer in kilobytes. 119 If any document is larger than bufsize, the program 120 will abort with an error message. 121 */ 122 JNIEXPORT void JNICALL 123 Java_org_greenstone_mg_MGPassesWrapper_setBufferSize(JNIEnv *j_env, 124 jobject j_obj, 125 jlong j_bufsize){ 126 long buffer = j_bufsize; 127 set_buffer_size(buffer); 128 } 129 130 /** Maximum amount of memory to use for the index pass-2 file 131 inversion in megabytes. 132 */ 133 JNIEXPORT void JNICALL 134 Java_org_greenstone_mg_MGPassesWrapper_setInversionMemLimit(JNIEnv *j_env, 135 jobject j_obj, 136 jint j_limit) { 137 int limit = j_limit; 138 set_inversion_limit(limit); 139 } 140 141 /** If true, treat SGML tags as non-words when building the 142 inverted file. 143 */ 144 JNIEXPORT void JNICALL 145 Java_org_greenstone_mg_MGPassesWrapper_ignoreSGMLTags(JNIEnv *j_env, 146 jobject j_obj, 147 jboolean j_ignore){ 148 int ignore = j_ignore; 149 ignore_sgml_tags(ignore); 150 } 151 152 /** if mg_passes fails, the document that caused the failure will be 153 output to the trace file or STDERR. 154 */ 155 JNIEXPORT void JNICALL 156 Java_org_greenstone_mg_MGPassesWrapper_dumpFailedDocument(JNIEnv *j_env, 157 jobject j_obj, 158 jboolean j_dump) { 159 160 } 161 162 /** output statistics on the compression performance to a file 163 called *.compression.stats. frequency specifies the interval 164 (in kilobytes of source text) between outputting each line of 165 statistics. 166 */ 167 JNIEXPORT void JNICALL 168 Java_org_greenstone_mg_MGPassesWrapper_outputCompStats(JNIEnv *j_env, 169 jobject j_obj, 170 jint j_frequency){ 171 int comp_stat_point = j_frequency; 172 set_comp_stat_point(comp_stat_point); 173 174 } 175 /** activate tracing, a line will be output every tracepos input bytes */ 176 JNIEXPORT void JNICALL 177 Java_org_greenstone_mg_MGPassesWrapper_enableTracing(JNIEnv *j_env, 178 jobject j_obj, 179 jint j_tracepos){ 180 int tracepos = j_tracepos; 181 set_trace_point(tracepos); 182 } 183 184 /** specify the name of the trace file */ 185 JNIEXPORT void JNICALL 186 Java_org_greenstone_mg_MGPassesWrapper_setTraceFile(JNIEnv *j_env, 187 jobject j_obj, 188 jstring j_tracefile){ 189 190 const char* tracefile = (*j_env)->GetStringUTFChars(j_env, j_tracefile, NULL); 191 assert(tracefile != NULL); 192 set_trace_file(tracefile); 193 /* Release the string */ 194 (*j_env)->ReleaseStringUTFChars(j_env, j_tracefile, tracefile); 195 } 196 197 /* initialise the pass through the documents. must be called after all 198 the set methods 199 */ 115 200 JNIEXPORT jboolean JNICALL 116 201 Java_org_greenstone_mg_MGPassesWrapper_init(JNIEnv *j_env, … … 121 206 } 122 207 123 JNIEXPORT jboolean JNICALL 124 Java_org_greenstone_mg_MGPassesWrapper_finish(JNIEnv *j_env, 125 jobject j_obj) { 126 127 finalise_driver(); 128 return 1; 129 } 130 208 209 /* process one document */ 131 210 JNIEXPORT jboolean JNICALL 132 211 Java_org_greenstone_mg_MGPassesWrapper_processMGDocument(JNIEnv *j_env, … … 142 221 } 143 222 144 145 223 /* finalise the pass through the documents */ 224 JNIEXPORT jboolean JNICALL 225 Java_org_greenstone_mg_MGPassesWrapper_finish(JNIEnv *j_env, 226 jobject j_obj) { 227 228 finalise_driver(); 229 return 1; 230 } 231 232 -
trunk/gsdl3/packages/mg/src/text/mg_passes_4jni.c
r7455 r7460 1 1 /************************************************************************** 2 2 * 3 * mg_passes .c -- Driver for the various passes3 * mg_passes_4jni.c -- Driver for the various passes 4 4 * Copyright (C) 1994 Neil Sharman 5 5 * … … 35 35 #include "stemmer.h" 36 36 37 38 37 #include "mg_files.h" 39 38 #include "mg.h" … … 113 112 }; 114 113 115 static char *usage_str = "\nUSAGE:\n" 116 " %s [-h] [-G] [-D] [-1|-2|-3] [-T1] [-T2] [-I1] [-I2] [-N1]\n" 117 " %*s [-N2] [-W] [-S] [-b buffer-size] [-d dictionary-directory]\n" 118 " %*s [-t trace-point Mb] [-m invf-memory] [-c chunk-limit]\n" 119 " %*s [-n trace-name] [-C comp-stat-size] [-s stem_method]\n" 120 " %*s [-a stemmer] [-M max-numeric] -f doc-collection-name\n"; 121 122 123 static void 124 usage (char *err) 125 { 126 if (err) 127 Message (err); 128 fprintf (stderr, usage_str, msg_prefix, strlen (msg_prefix), "", 129 strlen (msg_prefix), "",strlen (msg_prefix), "", 130 strlen (msg_prefix),""); 131 exit (1); 132 } 133 134 135 136 137 #if 0 138 static char * 139 str_comma (unsigned long u) 140 { 141 static char buf[20]; 142 unsigned long a, b, c, d; 143 a = u / 1000000000; 144 u -= a * 1000000000; 145 b = u / 1000000; 146 u -= b * 1000000; 147 c = u / 1000; 148 u -= c * 1000; 149 d = u; 150 151 if (a) 152 sprintf (buf, "%u,%03u,%03u,%03u", a, b, c, d); 153 else if (b) 154 sprintf (buf, "%u,%03u,%03u", b, c, d); 155 else if (c) 156 sprintf (buf, "%u,%03u", c, d); 157 else 158 sprintf (buf, "%u", d); 159 return (buf); 160 } 161 #endif 162 163 164 165 /* 166 int 167 open_next_file (int in_fd) 168 { 169 if (in_fd > 0) 170 close (in_fd); 171 if (num_files == 0) 172 return (-1); 173 if ((in_fd = open (files[0], O_RDONLY)) == -1) 174 FatalError (1, "Cannot open %s", files[0]); 175 files++; 176 num_files--; 177 return (in_fd); 178 } 179 */ 180 114 115 /* clear all the settings from one mg_passes run to the next */ 181 116 void clear_variables() { 182 117 … … 207 142 208 143 } 209 void set_invf_level(char level) { 210 211 switch (level) { 212 case '1': 213 InvfLevel = 1; 214 break; 215 case '2': 216 InvfLevel = 2; 217 break; 218 case '3': 219 InvfLevel = 3; 220 break; 221 } 222 223 } 224 void set_inversion_limit(int limit) { 225 invf_buffer_size = limit * 1024 * 1024; 226 } 227 228 void ignore_sgml_tags(int ignore) { 229 if (ignore) { 230 SkipSGML = 1; 231 } else { 232 SkipSGML = 0; 233 } 234 } 235 236 void set_buffer_size(long size) { 237 buf_size = size * 1024; 238 if (buf_size < MIN_BUF) { 239 buf_size = MIN_BUF; 240 } 241 } 242 243 void set_stem_options(char * stemmer, int method) { 244 stemmer_num = stemmernumber (stemmer); 245 stem_method = method & STEMMER_MASK; 246 247 } 248 249 void set_filename(char * filen) { 250 int len = strlen(filen); 251 if (filename) { 252 Xfree (filename); 253 filename = NULL; 254 } 255 filename = Xstrdup (filen); 256 // put this here for now 257 Dump=1; 258 trace = 512; 259 if (!trace_name) 260 trace_name = make_name (filename, TRACE_SUFFIX, NULL); 261 if (!(Trace = fopen (trace_name, "a"))) 262 Message ("Unable to open \"%s\". No tracing will be done.", trace_name); 263 else 264 setbuf (Trace, NULL); 265 266 } 267 268 144 145 /* ################################################## */ 146 /* the following are methods to set all the variables that used to be 147 set by command line args */ 148 149 /* -S, -T1, -T2, -I1, -I2, args to mg_passes */ 269 150 void add_pass (char pass_type, char pass_num) { 270 151 … … 291 172 break; 292 173 } 293 294 } 174 } 175 176 /* -D arg to mg_passes */ 177 void dump_failed_document(int dump) { 178 Dump = dump; 179 } 180 181 /* -G arg to mg_passes */ 182 void ignore_sgml_tags(int ignore) { 183 if (ignore) { 184 SkipSGML = 1; 185 } else { 186 SkipSGML = 0; 187 } 188 } 189 190 /* -b arg to mg_passes */ 191 void set_buffer_size(long size) { 192 buf_size = size * 1024; 193 if (buf_size < MIN_BUF) { 194 buf_size = MIN_BUF; 195 } 196 } 197 198 /* -c arg to mg_passes */ 199 void set_chunk_limit(long chunk_limit) { 200 ChunkLimit = chunk_limit; 201 } 202 203 /* -C arg to mg_passes */ 204 void set_comp_stat_point(int stat_point) { 205 comp_stat_point = stat_point * 1024; 206 } 207 208 /* -f arg to mg_passes */ 209 void set_filename(char * filen) { 210 int len = strlen(filen); 211 if (filename) { 212 Xfree (filename); 213 filename = NULL; 214 } 215 filename = Xstrdup (filen); 216 } 217 218 /* -m arg to mg_passes */ 219 void set_inversion_limit(int limit) { 220 invf_buffer_size = limit * 1024 * 1024; 221 } 222 223 /* -1, -2, -3 args to mg_passes */ 224 void set_invf_level(char level) { 225 switch (level) { 226 case '1': 227 InvfLevel = 1; 228 break; 229 case '2': 230 InvfLevel = 2; 231 break; 232 case '3': 233 InvfLevel = 3; 234 break; 235 } 236 } 237 238 /* -W arg to mg_passes */ 239 void set_make_weights(int make_w) { 240 MakeWeights = make_w; 241 } 242 243 /* -M arg to mg_passes */ 244 void set_max_numeric(int max_numeric) { 245 SetEnv ("maxnumeric", max_numeric, NULL); 246 } 247 248 /* -a, -s args to mg_passes */ 249 void set_stem_options(char * stemmer, int method) { 250 stemmer_num = stemmernumber (stemmer); 251 stem_method = method & STEMMER_MASK; 252 } 253 254 /* -t arg to mg_passes */ 255 void set_trace_point(int tracepos) { 256 trace = (unsigned long) (tracepos * 1024 * 1024); 257 } 258 259 /* -n arg to mg_passes */ 260 void set_trace_file(char * filen) { 261 int len = strlen(filen); 262 if (trace_name) { 263 Xfree (trace_name); 264 trace_name = NULL; 265 } 266 trace_name = Xstrdup (filen); 267 } 268 269 /* ############################################### */ 270 /* The old driver method has been split into 3: 271 init_driver, process_document (called numdocs times), 272 finalise_driver. 273 The above set vars methods should all be called before init_driver. 274 */ 275 276 295 277 ProgTime StartTime, InitTime, ProcTime, DoneTime; 296 278 … … 298 280 init_driver () 299 281 { 282 if (!filename || *filename == '\0') 283 FatalError (1, "A document collection name must be specified."); 284 285 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2)) 286 FatalError (1, "I1 and I2 cannot be done simultaneously."); 287 288 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2)) 289 FatalError (1, "T1 and T2 cannot be done simultaneously."); 290 291 if (!Passes) 292 FatalError (1, "S, T1, T2, I1 or I2 must be specified."); 293 294 if (trace) 295 { 296 if (!trace_name) 297 trace_name = make_name (filename, TRACE_SUFFIX, NULL); 298 if (!(Trace = fopen (trace_name, "a"))) 299 Message ("Unable to open \"%s\". No tracing will be done.", trace_name); 300 else 301 setbuf (Trace, NULL); 302 } 303 else 304 Trace = NULL; 305 306 if (comp_stat_point) 307 { 308 char *name = make_name (filename, COMPRESSION_STATS_SUFFIX, NULL); 309 if (!(Comp_Stats = fopen (name, "wb"))) /* [RPAP - Feb 97: WIN32 Port] */ 310 Message ("Unable to open \"%s\". No comp. stats. will be generated.", 311 name); 312 } 313 314 if (Trace) 315 { 316 int i; 317 fprintf (Trace, "\n\n\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n"); 318 /* print out the args to mg_passes */ 319 fprintf (Trace, "\n\n"); 320 } 321 300 322 int pass; 301 323 … … 338 360 bytes_processed += len; 339 361 340 printf("process doc, len=%d\n",len);341 362 #ifndef QUIET 342 363 if (!len) … … 501 522 (double) bytes_processed / (ProcTime.CPUTime - InitTime.CPUTime) / 1024); 502 523 //free (buffer); 503 } 504 505 506 507 int main (int argc, char **argv) 508 { 509 int ch, in_fd; 510 511 msg_prefix = argv[0]; 512 513 opterr = 0; 514 while ((ch = getopt (argc, argv, "hC:WGSD123f:d:b:T:I:t:m:N:c:n:s:a:M:")) != -1) 515 { 516 switch (ch) 517 { 518 case 'G': 519 SkipSGML = 1; 520 break; 521 case 'S': 522 Passes |= SPECIAL; 523 break; 524 case '1': 525 InvfLevel = 1; 526 break; 527 case '2': 528 InvfLevel = 2; 529 break; 530 case '3': 531 InvfLevel = 3; 532 break; 533 case 'f': 534 filename = optarg; 535 break; 536 case 'n': 537 trace_name = optarg; 538 break; 539 case 'D': 540 Dump = 1; 541 break; 542 case 'W': 543 MakeWeights = 1; 544 break; 545 case 'd': 546 set_basepath (optarg); 547 break; 548 case 'a': 549 stemmer_num = stemmernumber (optarg); 550 break; 551 case 's': 552 stem_method = atoi (optarg) & STEMMER_MASK; 553 break; 554 case 'b': 555 buf_size = atoi (optarg) * 1024; 556 break; 557 case 'C': 558 comp_stat_point = atoi (optarg) * 1024; 559 break; 560 case 'c': 561 ChunkLimit = atoi (optarg); 562 break; 563 case 'm': 564 invf_buffer_size = (int) (atof (optarg) * 1024 * 1024); 565 break; 566 case 'I': 567 case 'N': /* N kept for compatability */ 568 if (*optarg == '1') 569 Passes |= IVF_PASS_1; 570 else if (*optarg == '2') 571 Passes |= IVF_PASS_2; 572 else 573 usage ("Invalid pass number"); 574 break; 575 case 'T': 576 if (*optarg == '1') 577 Passes |= TEXT_PASS_1; 578 else if (*optarg == '2') 579 Passes |= TEXT_PASS_2; 580 else 581 usage ("Invalid pass number"); 582 break; 583 case 't': 584 trace = (unsigned long) (atof (optarg) * 1024 * 1024); 585 break; 586 case 'M': 587 SetEnv ("maxnumeric", optarg, NULL); 588 break; 589 case 'h': 590 case '?': 591 usage (NULL); 592 } 593 } 594 595 if (!filename || *filename == '\0') 596 FatalError (1, "A document collection name must be specified."); 597 598 if (buf_size < MIN_BUF) 599 FatalError (1, "The buffer size must exceed 1024 bytes."); 600 601 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2)) 602 FatalError (1, "I1 and I2 cannot be done simultaneously."); 603 604 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2)) 605 FatalError (1, "T1 and T2 cannot be done simultaneously."); 606 607 if (!Passes) 608 FatalError (1, "S, T1, T2, I1 or I2 must be specified."); 609 610 if (optind < argc) 611 { 612 if ((in_fd = open (argv[optind], O_RDONLY)) == -1) 613 FatalError (1, "Cannot open %s", argv[optind]); 614 files = &argv[optind + 1]; 615 num_files = argc - (optind + 1); 616 } 617 else 618 in_fd = 0; /* stdin */ 619 620 621 if (trace) 622 { 623 if (!trace_name) 624 trace_name = make_name (filename, TRACE_SUFFIX, NULL); 625 if (!(Trace = fopen (trace_name, "a"))) 626 Message ("Unable to open \"%s\". No tracing will be done.", trace_name); 627 else 628 setbuf (Trace, NULL); 629 } 630 else 631 Trace = NULL; 632 633 if (comp_stat_point) 634 { 635 char *name = make_name (filename, COMPRESSION_STATS_SUFFIX, NULL); 636 if (!(Comp_Stats = fopen (name, "wb"))) /* [RPAP - Feb 97: WIN32 Port] */ 637 Message ("Unable to open \"%s\". No comp. stats. will be generated.", 638 name); 639 } 640 641 642 if (Trace) 643 { 644 int i; 645 fprintf (Trace, "\n\n\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n"); 646 for (i = 0; i < argc; i++) 647 fprintf (Trace, "%s ", argv[i]); 648 fprintf (Trace, "\n\n"); 649 } 650 651 init_driver (); 652 /* here we have to do something to process docs from stdin */ 653 finalise_driver(); 524 654 525 if (Trace) 655 526 fclose (Trace); … … 658 529 fclose (Comp_Stats); 659 530 660 return 0; 661 } 531 } 532 -
trunk/gsdl3/src/packages/mg/java/org/greenstone/mg/MGPassesWrapper.java
r7452 r7460 20 20 21 21 22 /** java wrapper class for access to mg_passes in C22 /** java wrapper class for access to gs3_mg_passes in C 23 23 * 24 24 * the native side implemented in MGPassesWrapperImpl.c … … 32 32 } 33 33 34 private final char END_OF_DOCUMENT = (char) 2; 34 static final public char INVF_LEVEL_1 = '1'; 35 static final public char INVF_LEVEL_2 = '2'; 36 static final public char INVF_LEVEL_3 = '3'; 37 38 static final public int TEXT_PASS_1 = 0; 39 static final public int TEXT_PASS_2 = 1; 40 static final public int INDEX_PASS_1 = 2; 41 static final public int INDEX_PASS_2 = 3; 42 static final public int SPECIAL_PASS = 4; 43 44 static final public int NO_STEM_OR_CASE = 0; 45 static final public int CASE_ONLY = 1; 46 static final public int STEM_ONLY = 2; 47 static final public int STEM_AND_CASE = 3; 48 49 static final public String STEMMER_ENGLISH = "english"; 50 static final public String STEMMER_FRENCH = "french"; 51 static final public String STEMMER_LOVIN = "lovin"; 52 static final public String STEMMER_SIMPLE_FRENCH = "simple-french"; 53 54 static final private char END_OF_DOCUMENT = (char) 2; 35 55 36 56 public MGPassesWrapper() { … … 38 58 } 39 59 40 /** initialises field and method IDs for java side to enable access on C side */41 private static native void initIDs();42 43 /** initialises any C side stuff */44 private native boolean initCSide();45 46 60 /** initialise the pass through the documents */ 47 61 public native boolean init(); 48 62 49 63 /** add a pass declaration */ 50 public native void addPass(char pass_type, char pass_num); 51 64 public void addPass(int pass) { 65 switch (pass) { 66 case TEXT_PASS_1: 67 addPass('T','1'); 68 break; 69 case TEXT_PASS_2: 70 addPass('T','2'); 71 break; 72 case INDEX_PASS_1: 73 addPass('I','1'); 74 break; 75 case INDEX_PASS_2: 76 addPass('I','2'); 77 break; 78 case SPECIAL_PASS: 79 addPass('S','1'); 80 break; 81 } 82 } 52 83 /** set the base path */ 53 84 public native void setBasePath(String basepath); … … 59 90 public native void setInvfLevel(char level); 60 91 92 /** Specify the size of the document buffer in kilobytes. 93 If any document is larger than bufsize, the program 94 will abort with an error message. 95 */ 96 public native void setBufferSize(long bufsize); 97 98 /** Maximum amount of memory to use for the index pass-2 file 99 inversion in megabytes. 100 */ 101 public native void setInversionMemLimit(int limit); 61 102 103 /** If true, treat SGML tags as non-words when building the 104 inverted file. 105 */ 106 public native void ignoreSGMLTags(boolean ignore); 107 108 /** if mg_passes fails, the document that caused teh failure will be 109 output to teh trace file or STDERR. 110 */ 111 public native void dumpFailedDocument(boolean dump); 112 113 /** output statistics on the compression performance to a file 114 called *.compression.stats. frequency specifies the interval 115 (in kilobytes of source text) between outputting each line of 116 statistics. 117 */ 118 public native void outputCompStats(int frequency); 119 120 /** activate tracing, a line will be output every tracepos input bytes */ 121 public native void enableTracing(int tracepos); 62 122 /** process a Greenstone document, which may consist of many MG documents (seeparated by ^B */ 63 123 public boolean processDocument(String docs_text) { … … 73 133 return true; 74 134 } 75 /** process a MG document */76 public native boolean processMGDocument(byte[] text);77 135 78 136 /** finalise the pass through the documents */ 79 137 public native boolean finish(); 138 139 140 /** initialises field and method IDs for java side to enable access on C side */ 141 private static native void initIDs(); 142 143 /** initialises any C side stuff */ 144 private native boolean initCSide(); 145 146 private native void addPass(char pass_type, char pass_num); 147 148 /** process a MG document */ 149 private native boolean processMGDocument(byte[] text); 150 80 151 } -
trunk/gsdl3/src/packages/mg/jni/MGPassesWrapperImpl.c
r7452 r7460 38 38 } 39 39 40 40 /* add a pass type T1, T2, I1, I2, S */ 41 41 JNIEXPORT void JNICALL 42 42 Java_org_greenstone_mg_MGPassesWrapper_addPass(JNIEnv *j_env, … … 54 54 /* Set the filename */ 55 55 JNIEXPORT void JNICALL 56 Java_org_greenstone_mg_MGPassesWrapper_setFileName(JNIEnv *j_env, jobject j_obj, 57 jstring j_filename) 56 Java_org_greenstone_mg_MGPassesWrapper_setFileName(JNIEnv *j_env, 57 jobject j_obj, 58 jstring j_filename) 58 59 { 59 60 /* Get the filename as a C string */ … … 79 80 80 81 set_basepath(basepath); 81 82 82 83 83 /* Release the string */ … … 86 86 } 87 87 88 88 /* set the level for the inverted file */ 89 89 JNIEXPORT void JNICALL 90 90 Java_org_greenstone_mg_MGPassesWrapper_setInvfLevel(JNIEnv *j_env, … … 98 98 } 99 99 100 /* */100 /* set the stemmer and stem method */ 101 101 JNIEXPORT void JNICALL 102 102 Java_org_greenstone_mg_MGPassesWrapper_setStemOptions(JNIEnv *j_env, … … 111 111 assert(stemmer != NULL); 112 112 set_stem_options(stemmer, method); 113 114 /* Release the string */ 115 (*j_env)->ReleaseStringUTFChars(j_env, j_stemmer, stemmer); 113 116 } 114 117 118 /** Specify the size of the document buffer in kilobytes. 119 If any document is larger than bufsize, the program 120 will abort with an error message. 121 */ 122 JNIEXPORT void JNICALL 123 Java_org_greenstone_mg_MGPassesWrapper_setBufferSize(JNIEnv *j_env, 124 jobject j_obj, 125 jlong j_bufsize){ 126 long buffer = j_bufsize; 127 set_buffer_size(buffer); 128 } 129 130 /** Maximum amount of memory to use for the index pass-2 file 131 inversion in megabytes. 132 */ 133 JNIEXPORT void JNICALL 134 Java_org_greenstone_mg_MGPassesWrapper_setInversionMemLimit(JNIEnv *j_env, 135 jobject j_obj, 136 jint j_limit) { 137 int limit = j_limit; 138 set_inversion_limit(limit); 139 } 140 141 /** If true, treat SGML tags as non-words when building the 142 inverted file. 143 */ 144 JNIEXPORT void JNICALL 145 Java_org_greenstone_mg_MGPassesWrapper_ignoreSGMLTags(JNIEnv *j_env, 146 jobject j_obj, 147 jboolean j_ignore){ 148 int ignore = j_ignore; 149 ignore_sgml_tags(ignore); 150 } 151 152 /** if mg_passes fails, the document that caused the failure will be 153 output to the trace file or STDERR. 154 */ 155 JNIEXPORT void JNICALL 156 Java_org_greenstone_mg_MGPassesWrapper_dumpFailedDocument(JNIEnv *j_env, 157 jobject j_obj, 158 jboolean j_dump) { 159 160 } 161 162 /** output statistics on the compression performance to a file 163 called *.compression.stats. frequency specifies the interval 164 (in kilobytes of source text) between outputting each line of 165 statistics. 166 */ 167 JNIEXPORT void JNICALL 168 Java_org_greenstone_mg_MGPassesWrapper_outputCompStats(JNIEnv *j_env, 169 jobject j_obj, 170 jint j_frequency){ 171 int comp_stat_point = j_frequency; 172 set_comp_stat_point(comp_stat_point); 173 174 } 175 /** activate tracing, a line will be output every tracepos input bytes */ 176 JNIEXPORT void JNICALL 177 Java_org_greenstone_mg_MGPassesWrapper_enableTracing(JNIEnv *j_env, 178 jobject j_obj, 179 jint j_tracepos){ 180 int tracepos = j_tracepos; 181 set_trace_point(tracepos); 182 } 183 184 /** specify the name of the trace file */ 185 JNIEXPORT void JNICALL 186 Java_org_greenstone_mg_MGPassesWrapper_setTraceFile(JNIEnv *j_env, 187 jobject j_obj, 188 jstring j_tracefile){ 189 190 const char* tracefile = (*j_env)->GetStringUTFChars(j_env, j_tracefile, NULL); 191 assert(tracefile != NULL); 192 set_trace_file(tracefile); 193 /* Release the string */ 194 (*j_env)->ReleaseStringUTFChars(j_env, j_tracefile, tracefile); 195 } 196 197 /* initialise the pass through the documents. must be called after all 198 the set methods 199 */ 115 200 JNIEXPORT jboolean JNICALL 116 201 Java_org_greenstone_mg_MGPassesWrapper_init(JNIEnv *j_env, … … 121 206 } 122 207 123 JNIEXPORT jboolean JNICALL 124 Java_org_greenstone_mg_MGPassesWrapper_finish(JNIEnv *j_env, 125 jobject j_obj) { 126 127 finalise_driver(); 128 return 1; 129 } 130 208 209 /* process one document */ 131 210 JNIEXPORT jboolean JNICALL 132 211 Java_org_greenstone_mg_MGPassesWrapper_processMGDocument(JNIEnv *j_env, … … 142 221 } 143 222 144 145 223 /* finalise the pass through the documents */ 224 JNIEXPORT jboolean JNICALL 225 Java_org_greenstone_mg_MGPassesWrapper_finish(JNIEnv *j_env, 226 jobject j_obj) { 227 228 finalise_driver(); 229 return 1; 230 } 231 232 -
trunk/gsdl3/src/packages/mg/src/text/mg_passes_4jni.c
r7455 r7460 1 1 /************************************************************************** 2 2 * 3 * mg_passes .c -- Driver for the various passes3 * mg_passes_4jni.c -- Driver for the various passes 4 4 * Copyright (C) 1994 Neil Sharman 5 5 * … … 35 35 #include "stemmer.h" 36 36 37 38 37 #include "mg_files.h" 39 38 #include "mg.h" … … 113 112 }; 114 113 115 static char *usage_str = "\nUSAGE:\n" 116 " %s [-h] [-G] [-D] [-1|-2|-3] [-T1] [-T2] [-I1] [-I2] [-N1]\n" 117 " %*s [-N2] [-W] [-S] [-b buffer-size] [-d dictionary-directory]\n" 118 " %*s [-t trace-point Mb] [-m invf-memory] [-c chunk-limit]\n" 119 " %*s [-n trace-name] [-C comp-stat-size] [-s stem_method]\n" 120 " %*s [-a stemmer] [-M max-numeric] -f doc-collection-name\n"; 121 122 123 static void 124 usage (char *err) 125 { 126 if (err) 127 Message (err); 128 fprintf (stderr, usage_str, msg_prefix, strlen (msg_prefix), "", 129 strlen (msg_prefix), "",strlen (msg_prefix), "", 130 strlen (msg_prefix),""); 131 exit (1); 132 } 133 134 135 136 137 #if 0 138 static char * 139 str_comma (unsigned long u) 140 { 141 static char buf[20]; 142 unsigned long a, b, c, d; 143 a = u / 1000000000; 144 u -= a * 1000000000; 145 b = u / 1000000; 146 u -= b * 1000000; 147 c = u / 1000; 148 u -= c * 1000; 149 d = u; 150 151 if (a) 152 sprintf (buf, "%u,%03u,%03u,%03u", a, b, c, d); 153 else if (b) 154 sprintf (buf, "%u,%03u,%03u", b, c, d); 155 else if (c) 156 sprintf (buf, "%u,%03u", c, d); 157 else 158 sprintf (buf, "%u", d); 159 return (buf); 160 } 161 #endif 162 163 164 165 /* 166 int 167 open_next_file (int in_fd) 168 { 169 if (in_fd > 0) 170 close (in_fd); 171 if (num_files == 0) 172 return (-1); 173 if ((in_fd = open (files[0], O_RDONLY)) == -1) 174 FatalError (1, "Cannot open %s", files[0]); 175 files++; 176 num_files--; 177 return (in_fd); 178 } 179 */ 180 114 115 /* clear all the settings from one mg_passes run to the next */ 181 116 void clear_variables() { 182 117 … … 207 142 208 143 } 209 void set_invf_level(char level) { 210 211 switch (level) { 212 case '1': 213 InvfLevel = 1; 214 break; 215 case '2': 216 InvfLevel = 2; 217 break; 218 case '3': 219 InvfLevel = 3; 220 break; 221 } 222 223 } 224 void set_inversion_limit(int limit) { 225 invf_buffer_size = limit * 1024 * 1024; 226 } 227 228 void ignore_sgml_tags(int ignore) { 229 if (ignore) { 230 SkipSGML = 1; 231 } else { 232 SkipSGML = 0; 233 } 234 } 235 236 void set_buffer_size(long size) { 237 buf_size = size * 1024; 238 if (buf_size < MIN_BUF) { 239 buf_size = MIN_BUF; 240 } 241 } 242 243 void set_stem_options(char * stemmer, int method) { 244 stemmer_num = stemmernumber (stemmer); 245 stem_method = method & STEMMER_MASK; 246 247 } 248 249 void set_filename(char * filen) { 250 int len = strlen(filen); 251 if (filename) { 252 Xfree (filename); 253 filename = NULL; 254 } 255 filename = Xstrdup (filen); 256 // put this here for now 257 Dump=1; 258 trace = 512; 259 if (!trace_name) 260 trace_name = make_name (filename, TRACE_SUFFIX, NULL); 261 if (!(Trace = fopen (trace_name, "a"))) 262 Message ("Unable to open \"%s\". No tracing will be done.", trace_name); 263 else 264 setbuf (Trace, NULL); 265 266 } 267 268 144 145 /* ################################################## */ 146 /* the following are methods to set all the variables that used to be 147 set by command line args */ 148 149 /* -S, -T1, -T2, -I1, -I2, args to mg_passes */ 269 150 void add_pass (char pass_type, char pass_num) { 270 151 … … 291 172 break; 292 173 } 293 294 } 174 } 175 176 /* -D arg to mg_passes */ 177 void dump_failed_document(int dump) { 178 Dump = dump; 179 } 180 181 /* -G arg to mg_passes */ 182 void ignore_sgml_tags(int ignore) { 183 if (ignore) { 184 SkipSGML = 1; 185 } else { 186 SkipSGML = 0; 187 } 188 } 189 190 /* -b arg to mg_passes */ 191 void set_buffer_size(long size) { 192 buf_size = size * 1024; 193 if (buf_size < MIN_BUF) { 194 buf_size = MIN_BUF; 195 } 196 } 197 198 /* -c arg to mg_passes */ 199 void set_chunk_limit(long chunk_limit) { 200 ChunkLimit = chunk_limit; 201 } 202 203 /* -C arg to mg_passes */ 204 void set_comp_stat_point(int stat_point) { 205 comp_stat_point = stat_point * 1024; 206 } 207 208 /* -f arg to mg_passes */ 209 void set_filename(char * filen) { 210 int len = strlen(filen); 211 if (filename) { 212 Xfree (filename); 213 filename = NULL; 214 } 215 filename = Xstrdup (filen); 216 } 217 218 /* -m arg to mg_passes */ 219 void set_inversion_limit(int limit) { 220 invf_buffer_size = limit * 1024 * 1024; 221 } 222 223 /* -1, -2, -3 args to mg_passes */ 224 void set_invf_level(char level) { 225 switch (level) { 226 case '1': 227 InvfLevel = 1; 228 break; 229 case '2': 230 InvfLevel = 2; 231 break; 232 case '3': 233 InvfLevel = 3; 234 break; 235 } 236 } 237 238 /* -W arg to mg_passes */ 239 void set_make_weights(int make_w) { 240 MakeWeights = make_w; 241 } 242 243 /* -M arg to mg_passes */ 244 void set_max_numeric(int max_numeric) { 245 SetEnv ("maxnumeric", max_numeric, NULL); 246 } 247 248 /* -a, -s args to mg_passes */ 249 void set_stem_options(char * stemmer, int method) { 250 stemmer_num = stemmernumber (stemmer); 251 stem_method = method & STEMMER_MASK; 252 } 253 254 /* -t arg to mg_passes */ 255 void set_trace_point(int tracepos) { 256 trace = (unsigned long) (tracepos * 1024 * 1024); 257 } 258 259 /* -n arg to mg_passes */ 260 void set_trace_file(char * filen) { 261 int len = strlen(filen); 262 if (trace_name) { 263 Xfree (trace_name); 264 trace_name = NULL; 265 } 266 trace_name = Xstrdup (filen); 267 } 268 269 /* ############################################### */ 270 /* The old driver method has been split into 3: 271 init_driver, process_document (called numdocs times), 272 finalise_driver. 273 The above set vars methods should all be called before init_driver. 274 */ 275 276 295 277 ProgTime StartTime, InitTime, ProcTime, DoneTime; 296 278 … … 298 280 init_driver () 299 281 { 282 if (!filename || *filename == '\0') 283 FatalError (1, "A document collection name must be specified."); 284 285 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2)) 286 FatalError (1, "I1 and I2 cannot be done simultaneously."); 287 288 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2)) 289 FatalError (1, "T1 and T2 cannot be done simultaneously."); 290 291 if (!Passes) 292 FatalError (1, "S, T1, T2, I1 or I2 must be specified."); 293 294 if (trace) 295 { 296 if (!trace_name) 297 trace_name = make_name (filename, TRACE_SUFFIX, NULL); 298 if (!(Trace = fopen (trace_name, "a"))) 299 Message ("Unable to open \"%s\". No tracing will be done.", trace_name); 300 else 301 setbuf (Trace, NULL); 302 } 303 else 304 Trace = NULL; 305 306 if (comp_stat_point) 307 { 308 char *name = make_name (filename, COMPRESSION_STATS_SUFFIX, NULL); 309 if (!(Comp_Stats = fopen (name, "wb"))) /* [RPAP - Feb 97: WIN32 Port] */ 310 Message ("Unable to open \"%s\". No comp. stats. will be generated.", 311 name); 312 } 313 314 if (Trace) 315 { 316 int i; 317 fprintf (Trace, "\n\n\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n"); 318 /* print out the args to mg_passes */ 319 fprintf (Trace, "\n\n"); 320 } 321 300 322 int pass; 301 323 … … 338 360 bytes_processed += len; 339 361 340 printf("process doc, len=%d\n",len);341 362 #ifndef QUIET 342 363 if (!len) … … 501 522 (double) bytes_processed / (ProcTime.CPUTime - InitTime.CPUTime) / 1024); 502 523 //free (buffer); 503 } 504 505 506 507 int main (int argc, char **argv) 508 { 509 int ch, in_fd; 510 511 msg_prefix = argv[0]; 512 513 opterr = 0; 514 while ((ch = getopt (argc, argv, "hC:WGSD123f:d:b:T:I:t:m:N:c:n:s:a:M:")) != -1) 515 { 516 switch (ch) 517 { 518 case 'G': 519 SkipSGML = 1; 520 break; 521 case 'S': 522 Passes |= SPECIAL; 523 break; 524 case '1': 525 InvfLevel = 1; 526 break; 527 case '2': 528 InvfLevel = 2; 529 break; 530 case '3': 531 InvfLevel = 3; 532 break; 533 case 'f': 534 filename = optarg; 535 break; 536 case 'n': 537 trace_name = optarg; 538 break; 539 case 'D': 540 Dump = 1; 541 break; 542 case 'W': 543 MakeWeights = 1; 544 break; 545 case 'd': 546 set_basepath (optarg); 547 break; 548 case 'a': 549 stemmer_num = stemmernumber (optarg); 550 break; 551 case 's': 552 stem_method = atoi (optarg) & STEMMER_MASK; 553 break; 554 case 'b': 555 buf_size = atoi (optarg) * 1024; 556 break; 557 case 'C': 558 comp_stat_point = atoi (optarg) * 1024; 559 break; 560 case 'c': 561 ChunkLimit = atoi (optarg); 562 break; 563 case 'm': 564 invf_buffer_size = (int) (atof (optarg) * 1024 * 1024); 565 break; 566 case 'I': 567 case 'N': /* N kept for compatability */ 568 if (*optarg == '1') 569 Passes |= IVF_PASS_1; 570 else if (*optarg == '2') 571 Passes |= IVF_PASS_2; 572 else 573 usage ("Invalid pass number"); 574 break; 575 case 'T': 576 if (*optarg == '1') 577 Passes |= TEXT_PASS_1; 578 else if (*optarg == '2') 579 Passes |= TEXT_PASS_2; 580 else 581 usage ("Invalid pass number"); 582 break; 583 case 't': 584 trace = (unsigned long) (atof (optarg) * 1024 * 1024); 585 break; 586 case 'M': 587 SetEnv ("maxnumeric", optarg, NULL); 588 break; 589 case 'h': 590 case '?': 591 usage (NULL); 592 } 593 } 594 595 if (!filename || *filename == '\0') 596 FatalError (1, "A document collection name must be specified."); 597 598 if (buf_size < MIN_BUF) 599 FatalError (1, "The buffer size must exceed 1024 bytes."); 600 601 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2)) 602 FatalError (1, "I1 and I2 cannot be done simultaneously."); 603 604 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2)) 605 FatalError (1, "T1 and T2 cannot be done simultaneously."); 606 607 if (!Passes) 608 FatalError (1, "S, T1, T2, I1 or I2 must be specified."); 609 610 if (optind < argc) 611 { 612 if ((in_fd = open (argv[optind], O_RDONLY)) == -1) 613 FatalError (1, "Cannot open %s", argv[optind]); 614 files = &argv[optind + 1]; 615 num_files = argc - (optind + 1); 616 } 617 else 618 in_fd = 0; /* stdin */ 619 620 621 if (trace) 622 { 623 if (!trace_name) 624 trace_name = make_name (filename, TRACE_SUFFIX, NULL); 625 if (!(Trace = fopen (trace_name, "a"))) 626 Message ("Unable to open \"%s\". No tracing will be done.", trace_name); 627 else 628 setbuf (Trace, NULL); 629 } 630 else 631 Trace = NULL; 632 633 if (comp_stat_point) 634 { 635 char *name = make_name (filename, COMPRESSION_STATS_SUFFIX, NULL); 636 if (!(Comp_Stats = fopen (name, "wb"))) /* [RPAP - Feb 97: WIN32 Port] */ 637 Message ("Unable to open \"%s\". No comp. stats. will be generated.", 638 name); 639 } 640 641 642 if (Trace) 643 { 644 int i; 645 fprintf (Trace, "\n\n\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n"); 646 for (i = 0; i < argc; i++) 647 fprintf (Trace, "%s ", argv[i]); 648 fprintf (Trace, "\n\n"); 649 } 650 651 init_driver (); 652 /* here we have to do something to process docs from stdin */ 653 finalise_driver(); 524 654 525 if (Trace) 655 526 fclose (Trace); … … 658 529 fclose (Comp_Stats); 659 530 660 return 0; 661 } 531 } 532 -
trunk/indexers/mg/java/org/greenstone/mg/MGPassesWrapper.java
r7452 r7460 20 20 21 21 22 /** java wrapper class for access to mg_passes in C22 /** java wrapper class for access to gs3_mg_passes in C 23 23 * 24 24 * the native side implemented in MGPassesWrapperImpl.c … … 32 32 } 33 33 34 private final char END_OF_DOCUMENT = (char) 2; 34 static final public char INVF_LEVEL_1 = '1'; 35 static final public char INVF_LEVEL_2 = '2'; 36 static final public char INVF_LEVEL_3 = '3'; 37 38 static final public int TEXT_PASS_1 = 0; 39 static final public int TEXT_PASS_2 = 1; 40 static final public int INDEX_PASS_1 = 2; 41 static final public int INDEX_PASS_2 = 3; 42 static final public int SPECIAL_PASS = 4; 43 44 static final public int NO_STEM_OR_CASE = 0; 45 static final public int CASE_ONLY = 1; 46 static final public int STEM_ONLY = 2; 47 static final public int STEM_AND_CASE = 3; 48 49 static final public String STEMMER_ENGLISH = "english"; 50 static final public String STEMMER_FRENCH = "french"; 51 static final public String STEMMER_LOVIN = "lovin"; 52 static final public String STEMMER_SIMPLE_FRENCH = "simple-french"; 53 54 static final private char END_OF_DOCUMENT = (char) 2; 35 55 36 56 public MGPassesWrapper() { … … 38 58 } 39 59 40 /** initialises field and method IDs for java side to enable access on C side */41 private static native void initIDs();42 43 /** initialises any C side stuff */44 private native boolean initCSide();45 46 60 /** initialise the pass through the documents */ 47 61 public native boolean init(); 48 62 49 63 /** add a pass declaration */ 50 public native void addPass(char pass_type, char pass_num); 51 64 public void addPass(int pass) { 65 switch (pass) { 66 case TEXT_PASS_1: 67 addPass('T','1'); 68 break; 69 case TEXT_PASS_2: 70 addPass('T','2'); 71 break; 72 case INDEX_PASS_1: 73 addPass('I','1'); 74 break; 75 case INDEX_PASS_2: 76 addPass('I','2'); 77 break; 78 case SPECIAL_PASS: 79 addPass('S','1'); 80 break; 81 } 82 } 52 83 /** set the base path */ 53 84 public native void setBasePath(String basepath); … … 59 90 public native void setInvfLevel(char level); 60 91 92 /** Specify the size of the document buffer in kilobytes. 93 If any document is larger than bufsize, the program 94 will abort with an error message. 95 */ 96 public native void setBufferSize(long bufsize); 97 98 /** Maximum amount of memory to use for the index pass-2 file 99 inversion in megabytes. 100 */ 101 public native void setInversionMemLimit(int limit); 61 102 103 /** If true, treat SGML tags as non-words when building the 104 inverted file. 105 */ 106 public native void ignoreSGMLTags(boolean ignore); 107 108 /** if mg_passes fails, the document that caused teh failure will be 109 output to teh trace file or STDERR. 110 */ 111 public native void dumpFailedDocument(boolean dump); 112 113 /** output statistics on the compression performance to a file 114 called *.compression.stats. frequency specifies the interval 115 (in kilobytes of source text) between outputting each line of 116 statistics. 117 */ 118 public native void outputCompStats(int frequency); 119 120 /** activate tracing, a line will be output every tracepos input bytes */ 121 public native void enableTracing(int tracepos); 62 122 /** process a Greenstone document, which may consist of many MG documents (seeparated by ^B */ 63 123 public boolean processDocument(String docs_text) { … … 73 133 return true; 74 134 } 75 /** process a MG document */76 public native boolean processMGDocument(byte[] text);77 135 78 136 /** finalise the pass through the documents */ 79 137 public native boolean finish(); 138 139 140 /** initialises field and method IDs for java side to enable access on C side */ 141 private static native void initIDs(); 142 143 /** initialises any C side stuff */ 144 private native boolean initCSide(); 145 146 private native void addPass(char pass_type, char pass_num); 147 148 /** process a MG document */ 149 private native boolean processMGDocument(byte[] text); 150 80 151 } -
trunk/indexers/mg/jni/MGPassesWrapperImpl.c
r7452 r7460 38 38 } 39 39 40 40 /* add a pass type T1, T2, I1, I2, S */ 41 41 JNIEXPORT void JNICALL 42 42 Java_org_greenstone_mg_MGPassesWrapper_addPass(JNIEnv *j_env, … … 54 54 /* Set the filename */ 55 55 JNIEXPORT void JNICALL 56 Java_org_greenstone_mg_MGPassesWrapper_setFileName(JNIEnv *j_env, jobject j_obj, 57 jstring j_filename) 56 Java_org_greenstone_mg_MGPassesWrapper_setFileName(JNIEnv *j_env, 57 jobject j_obj, 58 jstring j_filename) 58 59 { 59 60 /* Get the filename as a C string */ … … 79 80 80 81 set_basepath(basepath); 81 82 82 83 83 /* Release the string */ … … 86 86 } 87 87 88 88 /* set the level for the inverted file */ 89 89 JNIEXPORT void JNICALL 90 90 Java_org_greenstone_mg_MGPassesWrapper_setInvfLevel(JNIEnv *j_env, … … 98 98 } 99 99 100 /* */100 /* set the stemmer and stem method */ 101 101 JNIEXPORT void JNICALL 102 102 Java_org_greenstone_mg_MGPassesWrapper_setStemOptions(JNIEnv *j_env, … … 111 111 assert(stemmer != NULL); 112 112 set_stem_options(stemmer, method); 113 114 /* Release the string */ 115 (*j_env)->ReleaseStringUTFChars(j_env, j_stemmer, stemmer); 113 116 } 114 117 118 /** Specify the size of the document buffer in kilobytes. 119 If any document is larger than bufsize, the program 120 will abort with an error message. 121 */ 122 JNIEXPORT void JNICALL 123 Java_org_greenstone_mg_MGPassesWrapper_setBufferSize(JNIEnv *j_env, 124 jobject j_obj, 125 jlong j_bufsize){ 126 long buffer = j_bufsize; 127 set_buffer_size(buffer); 128 } 129 130 /** Maximum amount of memory to use for the index pass-2 file 131 inversion in megabytes. 132 */ 133 JNIEXPORT void JNICALL 134 Java_org_greenstone_mg_MGPassesWrapper_setInversionMemLimit(JNIEnv *j_env, 135 jobject j_obj, 136 jint j_limit) { 137 int limit = j_limit; 138 set_inversion_limit(limit); 139 } 140 141 /** If true, treat SGML tags as non-words when building the 142 inverted file. 143 */ 144 JNIEXPORT void JNICALL 145 Java_org_greenstone_mg_MGPassesWrapper_ignoreSGMLTags(JNIEnv *j_env, 146 jobject j_obj, 147 jboolean j_ignore){ 148 int ignore = j_ignore; 149 ignore_sgml_tags(ignore); 150 } 151 152 /** if mg_passes fails, the document that caused the failure will be 153 output to the trace file or STDERR. 154 */ 155 JNIEXPORT void JNICALL 156 Java_org_greenstone_mg_MGPassesWrapper_dumpFailedDocument(JNIEnv *j_env, 157 jobject j_obj, 158 jboolean j_dump) { 159 160 } 161 162 /** output statistics on the compression performance to a file 163 called *.compression.stats. frequency specifies the interval 164 (in kilobytes of source text) between outputting each line of 165 statistics. 166 */ 167 JNIEXPORT void JNICALL 168 Java_org_greenstone_mg_MGPassesWrapper_outputCompStats(JNIEnv *j_env, 169 jobject j_obj, 170 jint j_frequency){ 171 int comp_stat_point = j_frequency; 172 set_comp_stat_point(comp_stat_point); 173 174 } 175 /** activate tracing, a line will be output every tracepos input bytes */ 176 JNIEXPORT void JNICALL 177 Java_org_greenstone_mg_MGPassesWrapper_enableTracing(JNIEnv *j_env, 178 jobject j_obj, 179 jint j_tracepos){ 180 int tracepos = j_tracepos; 181 set_trace_point(tracepos); 182 } 183 184 /** specify the name of the trace file */ 185 JNIEXPORT void JNICALL 186 Java_org_greenstone_mg_MGPassesWrapper_setTraceFile(JNIEnv *j_env, 187 jobject j_obj, 188 jstring j_tracefile){ 189 190 const char* tracefile = (*j_env)->GetStringUTFChars(j_env, j_tracefile, NULL); 191 assert(tracefile != NULL); 192 set_trace_file(tracefile); 193 /* Release the string */ 194 (*j_env)->ReleaseStringUTFChars(j_env, j_tracefile, tracefile); 195 } 196 197 /* initialise the pass through the documents. must be called after all 198 the set methods 199 */ 115 200 JNIEXPORT jboolean JNICALL 116 201 Java_org_greenstone_mg_MGPassesWrapper_init(JNIEnv *j_env, … … 121 206 } 122 207 123 JNIEXPORT jboolean JNICALL 124 Java_org_greenstone_mg_MGPassesWrapper_finish(JNIEnv *j_env, 125 jobject j_obj) { 126 127 finalise_driver(); 128 return 1; 129 } 130 208 209 /* process one document */ 131 210 JNIEXPORT jboolean JNICALL 132 211 Java_org_greenstone_mg_MGPassesWrapper_processMGDocument(JNIEnv *j_env, … … 142 221 } 143 222 144 145 223 /* finalise the pass through the documents */ 224 JNIEXPORT jboolean JNICALL 225 Java_org_greenstone_mg_MGPassesWrapper_finish(JNIEnv *j_env, 226 jobject j_obj) { 227 228 finalise_driver(); 229 return 1; 230 } 231 232 -
trunk/indexers/mg/src/text/mg_passes_4jni.c
r7455 r7460 1 1 /************************************************************************** 2 2 * 3 * mg_passes .c -- Driver for the various passes3 * mg_passes_4jni.c -- Driver for the various passes 4 4 * Copyright (C) 1994 Neil Sharman 5 5 * … … 35 35 #include "stemmer.h" 36 36 37 38 37 #include "mg_files.h" 39 38 #include "mg.h" … … 113 112 }; 114 113 115 static char *usage_str = "\nUSAGE:\n" 116 " %s [-h] [-G] [-D] [-1|-2|-3] [-T1] [-T2] [-I1] [-I2] [-N1]\n" 117 " %*s [-N2] [-W] [-S] [-b buffer-size] [-d dictionary-directory]\n" 118 " %*s [-t trace-point Mb] [-m invf-memory] [-c chunk-limit]\n" 119 " %*s [-n trace-name] [-C comp-stat-size] [-s stem_method]\n" 120 " %*s [-a stemmer] [-M max-numeric] -f doc-collection-name\n"; 121 122 123 static void 124 usage (char *err) 125 { 126 if (err) 127 Message (err); 128 fprintf (stderr, usage_str, msg_prefix, strlen (msg_prefix), "", 129 strlen (msg_prefix), "",strlen (msg_prefix), "", 130 strlen (msg_prefix),""); 131 exit (1); 132 } 133 134 135 136 137 #if 0 138 static char * 139 str_comma (unsigned long u) 140 { 141 static char buf[20]; 142 unsigned long a, b, c, d; 143 a = u / 1000000000; 144 u -= a * 1000000000; 145 b = u / 1000000; 146 u -= b * 1000000; 147 c = u / 1000; 148 u -= c * 1000; 149 d = u; 150 151 if (a) 152 sprintf (buf, "%u,%03u,%03u,%03u", a, b, c, d); 153 else if (b) 154 sprintf (buf, "%u,%03u,%03u", b, c, d); 155 else if (c) 156 sprintf (buf, "%u,%03u", c, d); 157 else 158 sprintf (buf, "%u", d); 159 return (buf); 160 } 161 #endif 162 163 164 165 /* 166 int 167 open_next_file (int in_fd) 168 { 169 if (in_fd > 0) 170 close (in_fd); 171 if (num_files == 0) 172 return (-1); 173 if ((in_fd = open (files[0], O_RDONLY)) == -1) 174 FatalError (1, "Cannot open %s", files[0]); 175 files++; 176 num_files--; 177 return (in_fd); 178 } 179 */ 180 114 115 /* clear all the settings from one mg_passes run to the next */ 181 116 void clear_variables() { 182 117 … … 207 142 208 143 } 209 void set_invf_level(char level) { 210 211 switch (level) { 212 case '1': 213 InvfLevel = 1; 214 break; 215 case '2': 216 InvfLevel = 2; 217 break; 218 case '3': 219 InvfLevel = 3; 220 break; 221 } 222 223 } 224 void set_inversion_limit(int limit) { 225 invf_buffer_size = limit * 1024 * 1024; 226 } 227 228 void ignore_sgml_tags(int ignore) { 229 if (ignore) { 230 SkipSGML = 1; 231 } else { 232 SkipSGML = 0; 233 } 234 } 235 236 void set_buffer_size(long size) { 237 buf_size = size * 1024; 238 if (buf_size < MIN_BUF) { 239 buf_size = MIN_BUF; 240 } 241 } 242 243 void set_stem_options(char * stemmer, int method) { 244 stemmer_num = stemmernumber (stemmer); 245 stem_method = method & STEMMER_MASK; 246 247 } 248 249 void set_filename(char * filen) { 250 int len = strlen(filen); 251 if (filename) { 252 Xfree (filename); 253 filename = NULL; 254 } 255 filename = Xstrdup (filen); 256 // put this here for now 257 Dump=1; 258 trace = 512; 259 if (!trace_name) 260 trace_name = make_name (filename, TRACE_SUFFIX, NULL); 261 if (!(Trace = fopen (trace_name, "a"))) 262 Message ("Unable to open \"%s\". No tracing will be done.", trace_name); 263 else 264 setbuf (Trace, NULL); 265 266 } 267 268 144 145 /* ################################################## */ 146 /* the following are methods to set all the variables that used to be 147 set by command line args */ 148 149 /* -S, -T1, -T2, -I1, -I2, args to mg_passes */ 269 150 void add_pass (char pass_type, char pass_num) { 270 151 … … 291 172 break; 292 173 } 293 294 } 174 } 175 176 /* -D arg to mg_passes */ 177 void dump_failed_document(int dump) { 178 Dump = dump; 179 } 180 181 /* -G arg to mg_passes */ 182 void ignore_sgml_tags(int ignore) { 183 if (ignore) { 184 SkipSGML = 1; 185 } else { 186 SkipSGML = 0; 187 } 188 } 189 190 /* -b arg to mg_passes */ 191 void set_buffer_size(long size) { 192 buf_size = size * 1024; 193 if (buf_size < MIN_BUF) { 194 buf_size = MIN_BUF; 195 } 196 } 197 198 /* -c arg to mg_passes */ 199 void set_chunk_limit(long chunk_limit) { 200 ChunkLimit = chunk_limit; 201 } 202 203 /* -C arg to mg_passes */ 204 void set_comp_stat_point(int stat_point) { 205 comp_stat_point = stat_point * 1024; 206 } 207 208 /* -f arg to mg_passes */ 209 void set_filename(char * filen) { 210 int len = strlen(filen); 211 if (filename) { 212 Xfree (filename); 213 filename = NULL; 214 } 215 filename = Xstrdup (filen); 216 } 217 218 /* -m arg to mg_passes */ 219 void set_inversion_limit(int limit) { 220 invf_buffer_size = limit * 1024 * 1024; 221 } 222 223 /* -1, -2, -3 args to mg_passes */ 224 void set_invf_level(char level) { 225 switch (level) { 226 case '1': 227 InvfLevel = 1; 228 break; 229 case '2': 230 InvfLevel = 2; 231 break; 232 case '3': 233 InvfLevel = 3; 234 break; 235 } 236 } 237 238 /* -W arg to mg_passes */ 239 void set_make_weights(int make_w) { 240 MakeWeights = make_w; 241 } 242 243 /* -M arg to mg_passes */ 244 void set_max_numeric(int max_numeric) { 245 SetEnv ("maxnumeric", max_numeric, NULL); 246 } 247 248 /* -a, -s args to mg_passes */ 249 void set_stem_options(char * stemmer, int method) { 250 stemmer_num = stemmernumber (stemmer); 251 stem_method = method & STEMMER_MASK; 252 } 253 254 /* -t arg to mg_passes */ 255 void set_trace_point(int tracepos) { 256 trace = (unsigned long) (tracepos * 1024 * 1024); 257 } 258 259 /* -n arg to mg_passes */ 260 void set_trace_file(char * filen) { 261 int len = strlen(filen); 262 if (trace_name) { 263 Xfree (trace_name); 264 trace_name = NULL; 265 } 266 trace_name = Xstrdup (filen); 267 } 268 269 /* ############################################### */ 270 /* The old driver method has been split into 3: 271 init_driver, process_document (called numdocs times), 272 finalise_driver. 273 The above set vars methods should all be called before init_driver. 274 */ 275 276 295 277 ProgTime StartTime, InitTime, ProcTime, DoneTime; 296 278 … … 298 280 init_driver () 299 281 { 282 if (!filename || *filename == '\0') 283 FatalError (1, "A document collection name must be specified."); 284 285 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2)) 286 FatalError (1, "I1 and I2 cannot be done simultaneously."); 287 288 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2)) 289 FatalError (1, "T1 and T2 cannot be done simultaneously."); 290 291 if (!Passes) 292 FatalError (1, "S, T1, T2, I1 or I2 must be specified."); 293 294 if (trace) 295 { 296 if (!trace_name) 297 trace_name = make_name (filename, TRACE_SUFFIX, NULL); 298 if (!(Trace = fopen (trace_name, "a"))) 299 Message ("Unable to open \"%s\". No tracing will be done.", trace_name); 300 else 301 setbuf (Trace, NULL); 302 } 303 else 304 Trace = NULL; 305 306 if (comp_stat_point) 307 { 308 char *name = make_name (filename, COMPRESSION_STATS_SUFFIX, NULL); 309 if (!(Comp_Stats = fopen (name, "wb"))) /* [RPAP - Feb 97: WIN32 Port] */ 310 Message ("Unable to open \"%s\". No comp. stats. will be generated.", 311 name); 312 } 313 314 if (Trace) 315 { 316 int i; 317 fprintf (Trace, "\n\n\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n"); 318 /* print out the args to mg_passes */ 319 fprintf (Trace, "\n\n"); 320 } 321 300 322 int pass; 301 323 … … 338 360 bytes_processed += len; 339 361 340 printf("process doc, len=%d\n",len);341 362 #ifndef QUIET 342 363 if (!len) … … 501 522 (double) bytes_processed / (ProcTime.CPUTime - InitTime.CPUTime) / 1024); 502 523 //free (buffer); 503 } 504 505 506 507 int main (int argc, char **argv) 508 { 509 int ch, in_fd; 510 511 msg_prefix = argv[0]; 512 513 opterr = 0; 514 while ((ch = getopt (argc, argv, "hC:WGSD123f:d:b:T:I:t:m:N:c:n:s:a:M:")) != -1) 515 { 516 switch (ch) 517 { 518 case 'G': 519 SkipSGML = 1; 520 break; 521 case 'S': 522 Passes |= SPECIAL; 523 break; 524 case '1': 525 InvfLevel = 1; 526 break; 527 case '2': 528 InvfLevel = 2; 529 break; 530 case '3': 531 InvfLevel = 3; 532 break; 533 case 'f': 534 filename = optarg; 535 break; 536 case 'n': 537 trace_name = optarg; 538 break; 539 case 'D': 540 Dump = 1; 541 break; 542 case 'W': 543 MakeWeights = 1; 544 break; 545 case 'd': 546 set_basepath (optarg); 547 break; 548 case 'a': 549 stemmer_num = stemmernumber (optarg); 550 break; 551 case 's': 552 stem_method = atoi (optarg) & STEMMER_MASK; 553 break; 554 case 'b': 555 buf_size = atoi (optarg) * 1024; 556 break; 557 case 'C': 558 comp_stat_point = atoi (optarg) * 1024; 559 break; 560 case 'c': 561 ChunkLimit = atoi (optarg); 562 break; 563 case 'm': 564 invf_buffer_size = (int) (atof (optarg) * 1024 * 1024); 565 break; 566 case 'I': 567 case 'N': /* N kept for compatability */ 568 if (*optarg == '1') 569 Passes |= IVF_PASS_1; 570 else if (*optarg == '2') 571 Passes |= IVF_PASS_2; 572 else 573 usage ("Invalid pass number"); 574 break; 575 case 'T': 576 if (*optarg == '1') 577 Passes |= TEXT_PASS_1; 578 else if (*optarg == '2') 579 Passes |= TEXT_PASS_2; 580 else 581 usage ("Invalid pass number"); 582 break; 583 case 't': 584 trace = (unsigned long) (atof (optarg) * 1024 * 1024); 585 break; 586 case 'M': 587 SetEnv ("maxnumeric", optarg, NULL); 588 break; 589 case 'h': 590 case '?': 591 usage (NULL); 592 } 593 } 594 595 if (!filename || *filename == '\0') 596 FatalError (1, "A document collection name must be specified."); 597 598 if (buf_size < MIN_BUF) 599 FatalError (1, "The buffer size must exceed 1024 bytes."); 600 601 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2)) 602 FatalError (1, "I1 and I2 cannot be done simultaneously."); 603 604 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2)) 605 FatalError (1, "T1 and T2 cannot be done simultaneously."); 606 607 if (!Passes) 608 FatalError (1, "S, T1, T2, I1 or I2 must be specified."); 609 610 if (optind < argc) 611 { 612 if ((in_fd = open (argv[optind], O_RDONLY)) == -1) 613 FatalError (1, "Cannot open %s", argv[optind]); 614 files = &argv[optind + 1]; 615 num_files = argc - (optind + 1); 616 } 617 else 618 in_fd = 0; /* stdin */ 619 620 621 if (trace) 622 { 623 if (!trace_name) 624 trace_name = make_name (filename, TRACE_SUFFIX, NULL); 625 if (!(Trace = fopen (trace_name, "a"))) 626 Message ("Unable to open \"%s\". No tracing will be done.", trace_name); 627 else 628 setbuf (Trace, NULL); 629 } 630 else 631 Trace = NULL; 632 633 if (comp_stat_point) 634 { 635 char *name = make_name (filename, COMPRESSION_STATS_SUFFIX, NULL); 636 if (!(Comp_Stats = fopen (name, "wb"))) /* [RPAP - Feb 97: WIN32 Port] */ 637 Message ("Unable to open \"%s\". No comp. stats. will be generated.", 638 name); 639 } 640 641 642 if (Trace) 643 { 644 int i; 645 fprintf (Trace, "\n\n\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n"); 646 for (i = 0; i < argc; i++) 647 fprintf (Trace, "%s ", argv[i]); 648 fprintf (Trace, "\n\n"); 649 } 650 651 init_driver (); 652 /* here we have to do something to process docs from stdin */ 653 finalise_driver(); 524 654 525 if (Trace) 655 526 fclose (Trace); … … 658 529 fclose (Comp_Stats); 659 530 660 return 0; 661 } 531 } 532
Note:
See TracChangeset
for help on using the changeset viewer.