source: trunk/indexers/mg/src/text/mg_text_merge.c@ 3745

Last change on this file since 3745 was 3745, checked in by mdewsnip, 21 years ago

Addition of MG package for search and retrieval

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.2 KB
Line 
1/**************************************************************************
2 *
3 * mg_text_merge.c --- merge *.text, *.text.idx files
4 * part of the mgmerge utility
5 * Copyright (C) 1995 Shane Hudson ([email protected])
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * $Id: mg_text_merge.c 3745 2003-02-20 21:20:24Z mdewsnip $
22 * Last edited: November 11 1994
23 *
24 **************************************************************************/
25
26#include "sysfuncs.h"
27
28#include "messages.h"
29#include "timing.h"
30#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
31
32#include "locallib.h"
33#include "mg.h"
34#include "mg_merge.h"
35#include "mg_files.h"
36#include "text.h"
37
38
39/**** GLOBALS ****/
40FILE *text[3], *idx[3];
41
42typedef char FileName[256];
43FileName old_name, new_name, merge_name;
44
45long magicsize; /* == where the header in a file begins */
46
47compressed_text_header cth[3];
48
49/*=======================================================================
50 * init_merge_text(): open files, set up global variables, etc
51 *=======================================================================*/
52int
53init_merge_text ()
54{
55
56 /* open .text files */
57 text[OLD] = open_file (old_name, TEXT_SUFFIX, "r+b",
58 MAGIC_TEXT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
59 magicsize = ftell (text[OLD]);
60 fread (&cth[OLD], sizeof (cth[OLD]), 1, text[OLD]);
61
62 text[NEW] = open_file (new_name, TEXT_SUFFIX, "rb+",
63 MAGIC_TEXT, MG_ABORT);
64 fread (&cth[NEW], sizeof (cth[NEW]), 1, text[NEW]);
65
66 /* open .text.idx files */
67 idx[OLD] = open_file (old_name, TEXT_IDX_SUFFIX, "rb+",
68 MAGIC_TEXI, MG_ABORT);
69 fread (&cth[OLD], sizeof (cth[OLD]), 1, idx[OLD]);
70
71 /* [RPAP - Jan 97: Endian Ordering] */
72 NTOHUL(cth[OLD].num_of_docs);
73 NTOHD(cth[OLD].num_of_bytes); /* [RJM 07/97: 4G limit] */
74 NTOHUL(cth[OLD].num_of_words);
75 NTOHUL(cth[OLD].length_of_longest_doc);
76 NTOHD(cth[OLD].ratio);
77
78 idx[NEW] = open_file (new_name, TEXT_IDX_SUFFIX, "rb+",
79 MAGIC_TEXI, MG_ABORT);
80 fread (&cth[NEW], sizeof (cth[NEW]), 1, idx[NEW]);
81
82 /* [RPAP - Jan 97: Endian Ordering] */
83 NTOHUL(cth[NEW].num_of_docs);
84 NTOHD(cth[NEW].num_of_bytes); /* [RJM 07/97: 4G limit] */
85 NTOHUL(cth[NEW].num_of_words);
86 NTOHUL(cth[NEW].length_of_longest_doc);
87 NTOHD(cth[NEW].ratio);
88
89 idx[MERGE] = create_file (merge_name, TEXT_IDX_SUFFIX, "wb",
90 MAGIC_TEXI, MG_ABORT);
91 return OK;
92}
93
94
95/*=======================================================================
96 * process_merge_text(): merge the files
97 *=======================================================================*/
98int
99process_merge_text (void)
100{
101 int i;
102 u_long data, offset;
103 byte c;
104
105 /* update and write merged header to .text and .text.idx files */
106 /* they have the exact same header */
107 cth[MERGE].num_of_docs = cth[OLD].num_of_docs
108 + cth[NEW].num_of_docs;
109 cth[MERGE].num_of_bytes = cth[OLD].num_of_bytes
110 + cth[NEW].num_of_bytes;
111 cth[MERGE].num_of_words = cth[OLD].num_of_words;
112 cth[MERGE].length_of_longest_doc =
113 (cth[OLD].length_of_longest_doc > cth[NEW].length_of_longest_doc
114 ? cth[OLD].length_of_longest_doc
115 : cth[NEW].length_of_longest_doc);
116 cth[MERGE].ratio = ((cth[OLD].num_of_bytes * cth[OLD].ratio) +
117 (cth[NEW].num_of_bytes * cth[NEW].ratio))
118 / cth[MERGE].num_of_bytes;
119
120 /* [RPAP - Jan 97: Endian Ordering] */
121 HTONUL(cth[MERGE].num_of_docs);
122 HTOND(cth[MERGE].num_of_bytes); /* [RJM 07/97: 4G limit] */
123 HTONUL(cth[MERGE].num_of_words);
124 HTONUL(cth[MERGE].length_of_longest_doc);
125 HTOND(cth[MERGE].ratio);
126
127 fwrite (&cth[MERGE], sizeof (cth[MERGE]), 1, idx[MERGE]);
128 fseek (text[OLD], magicsize, 0);
129 fwrite (&cth[MERGE], sizeof (cth[MERGE]), 1, text[OLD]);
130
131 /*
132 * Update *.text.idx: need to know where each new doc starts
133 * in the appended .text file
134 */
135 for (i = 0; i < cth[OLD].num_of_docs; i++)
136 {
137 fread (&data, sizeof (u_long), 1, idx[OLD]);
138 fwrite (&data, sizeof (u_long), 1, idx[MERGE]);
139 }
140
141 /* offset is the amount to add to each entry from idx[NEW] */
142 fread (&offset, sizeof (u_long), 1, idx[OLD]);
143 NTOHUL(offset); /* [RPAP - Jan 97: Endian Ordering] */
144 offset -= (4 + sizeof (cth[OLD])); /* 4 for the magic number */
145
146 for (i = 0; i < cth[NEW].num_of_docs; i++)
147 {
148 fread (&data, sizeof (u_long), 1, idx[NEW]);
149 NTOHUL(data); /* [RPAP - Jan 97: Endian Ordering] */
150 data += offset;
151 HTONUL(data); /* [RPAP - Jan 97: Endian Ordering] */
152 fwrite (&data, sizeof (u_long), 1, idx[MERGE]);
153 }
154 /* write last u_long in idx[MERGE] (= length of file) */
155 fread (&data, sizeof (u_long), 1, idx[NEW]);
156 NTOHUL(data); /* [RPAP - Jan 97: Endian Ordering] */
157 data += offset;
158 HTONUL(data); /* [RPAP - Jan 97: Endian Ordering] */
159 fwrite (&data, sizeof (u_long), 1, idx[MERGE]);
160
161/******* update .text *******/
162 /* simply cat's the files together, except for the headers
163 * and magic numbers, of course
164 */
165 fseek (text[OLD], 0L, 2);
166 while (!feof (text[NEW]))
167 {
168 fread (&c, sizeof (c), 1, text[NEW]);
169 if (!feof (text[NEW]))
170 fwrite (&c, sizeof (c), 1, text[OLD]);
171 }
172
173 return OK;
174}
175
176
177
178/*=======================================================================
179 * done_merge_text(): close files.
180 *=======================================================================*/
181int
182done_merge_text (void)
183{
184 fclose (idx[MERGE]);
185 fclose (text[OLD]);
186 fclose (idx[OLD]);
187 fclose (text[NEW]);
188 fclose (idx[NEW]);
189
190 fprintf (stderr, "mg_text_merge: %ld documents added to %s\n",
191 cth[NEW].num_of_docs, merge_name);
192
193 return OK;
194}
195
196
/*=======================================================================
 * usage(): print the command-line synopsis on stderr and exit with
 * status 1.  Does not return.
 *
 * The synopsis lists every option main() passes to getopt(): -h, -d
 * (handed to set_basepath()) and the mandatory -f.
 *=======================================================================*/
void
usage (char *progname)
{
  fprintf (stderr, "usage: %s [-h] [-d base_path] -f collection_name\n",
           progname);
  exit (1);
}
206
207
208/*=======================================================================
209 * main()
210 *=======================================================================*/
211int main (int argc, char *argv[])
212{
213 char *progname;
214 ProgTime start;
215 int ch; /* for command line processing */
216
217 progname = argv[0];
218 msg_prefix = argv[0];
219 merge_name[0] = '\0';
220
221 while ((ch = getopt (argc, argv, "f:d:h")) != -1)
222 switch (ch)
223 {
224 case 'f':
225 strcpy (merge_name, optarg);
226 break;
227 case 'd':
228 set_basepath (optarg);
229 break;
230 case 'h':
231 case '?':
232 default:
233 usage (progname);
234 }
235
236 if (merge_name[0] == '\0')
237 usage (progname);
238 strcpy (old_name, merge_name);
239 strcat (old_name, ".old");
240 strcpy (new_name, merge_name);
241 strcat (new_name, ".new");
242
243 GetTime (&start);
244 init_merge_text ();
245 process_merge_text ();
246 done_merge_text ();
247 Message ("%s\n", ElapsedTime (&start, NULL));
248 return 0;
249}
Note: See TracBrowser for help on using the repository browser.