source: gs3-installations/thewillow/trunk/sites/thewillow/dlcol-chatgpt/create-assistant/create-openai-assistant.js@38807

Last change on this file since 38807 was 38807, checked in by davidb, 3 months ago

Code now uses a JSON config file to provide assistant create params

  • Property svn:executable set to *
File size: 12.8 KB
#!/usr/bin/env node

// Make sure you have created a '.env' file in the top-level of
// this NodeJS project, and set the following lines accordingly
// with the relevant values from your OpenAI account:
//
// OPENAI_API_KEY=sk-????????????????????????????????????????????????
// ORGANIZATION_ID=org-????????????????????????
// ASSISTANT_ID=asst_????????????????????????


// For an (alternative) approach that looks like it retrieves the already created Assistant directly, see:
// https://gist.github.com/drifterz13/0cbe93ced5dc7958d7841a29c1721d1c


// Looks like we need a more recent version of NodeJS than the one currently provided
// in Greenstone's selfcontained-nodejs
// // const { parseArgs } = require('node:util');
// const util = require('node:util');


const dotenv = require('dotenv').config();
const path = require("path");
const fs = require("fs");

const openai = require('openai');

const myopenai = new openai.OpenAI({
    apiKey: process.env.OPENAI_API_KEY,
    organization: process.env.ORGANIZATION_ID
});


function sleep(ms)
{
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}
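
// Awaited between successive OpenAI API calls below, e.g. 'await sleep(500);',
// to pause briefly and avoid issuing requests to the API too quickly.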


function printUsage(progname)
{
    const progname_tail = path.basename(progname);

    console.error("Usage: " + progname_tail + " [-site site-name] collect-name json-assistant-config-filename");
    process.exit(1);
}
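
// For example (with illustrative site/collection names):
//   ./create-openai-assistant.js -site mysite mycollection assistant-config.json
// When no '-site' option is given, the site defaults to 'localsite'.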

//
// Based off the standalone solution given on StackOverflow
// https://stackoverflow.com/questions/41462606/get-all-files-recursively-in-directories-nodejs
//
function listFilesRec(input_full_dirname,output_full_filenames,opt_regex)
{
    try {
        // Note: opt_regex is applied below when filtering filenames,
        // not passed to readdirSync()
        let filenames_and_dirnames = fs.readdirSync(input_full_dirname);

        filenames_and_dirnames.forEach(filename_or_dirname => {

            const full_filename_or_dirname = path.join(input_full_dirname, filename_or_dirname);

            if (fs.statSync(full_filename_or_dirname).isDirectory()) {
                const inner_full_dirname = full_filename_or_dirname;
                listFilesRec(inner_full_dirname,output_full_filenames,opt_regex);
            }
            else {
                if ((opt_regex == null) || opt_regex.test(filename_or_dirname)) {

                    const inner_full_filename = full_filename_or_dirname;
                    output_full_filenames.push(inner_full_filename);
                }
            }
        });
    }
    catch (e) {
        console.error("Unable to read directory: " + input_full_dirname);
    }
}

// Upgraded to take an optional regex that filters which filenames are returned

function listAllFiles(full_dirname,opt_regex)
{
    let all_files = [];

    //const opt_regex = (opt_regex_str != null) ? new RegExp(opt_regex_str) : null;

    listFilesRec(full_dirname,all_files,opt_regex);

    return all_files;
}
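
// For example, getExportedFiles() below uses this to gather every exported
// HTML file in a collection:
//   listAllFiles(full_export_dirname, /\.html$/)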

function parseCommandLineArgs(progname,cmdline_args)
{
    let site_name = "localsite";
    let coll_name = null;
    let json_assistant_config_filename = null;

    if (cmdline_args.length == 4) {
        // Looks like we're in the form: -site site-name collect-name json-config-filename
        if (cmdline_args[0] == "-site") {
            site_name = cmdline_args[1];
        }
        else {
            printUsage(progname);
        }

        coll_name = cmdline_args[2];
        json_assistant_config_filename = cmdline_args[3];
    }
    else if (cmdline_args.length == 2) {
        // No site given, default to localsite
        coll_name = cmdline_args[0];
        json_assistant_config_filename = cmdline_args[1];
    }
    else {
        printUsage(progname);
    }

    return [site_name,coll_name,json_assistant_config_filename];
}

function getExportedFiles(coll_info)
{
    const full_collect_dirname = coll_info.collect_dir;

    const full_export_dirname = path.join(full_collect_dirname,"export");
    let all_exported_files = listAllFiles(full_export_dirname,/\.html$/);

    return all_exported_files;
}

function createSiteCollectTmpDir(coll_info)
{
    const full_collect_dirname = coll_info.collect_dir;

    const full_collect_tmp_dirname = path.join(full_collect_dirname,"tmp");

    if (!fs.existsSync(full_collect_tmp_dirname)) {
        console.log("Creating collection 'tmp' directory");
        // Use the synchronous form so the directory exists before any copying begins
        fs.mkdirSync(full_collect_tmp_dirname);
    }
}

function copyToSiteCollectTmp(src_full_filename, coll_info)
{
    // While OpenAI generates unique IDs for every file stored, its
    // associated (metadata) filename is only the last part (basename)
    // of the full filename.

    // This function exists to make a copy of the file that is being
    // imported, storing it in the collection's 'tmp' folder. In
    // making the copy, it changes the filename in tmp to one that
    // encodes the site and collect names.

    const site_name = coll_info.site_name;
    const coll_name = coll_info.coll_name;
    const full_collect_dirname = coll_info.collect_dir;

    const src_filename = path.basename(src_full_filename);

    // In encoding the site and collect name into the destination file,
    // we have chosen to do this with '|' as a separator character,
    // suitably escaped, of course, where needed!
    //
    // As this is a more unusual character to include in a filename, we
    // document the decision here. It is done this way so that, when we
    // look through all the files returned by OpenAI from its
    // file-store (which, recall, uses only the tail-part of a
    // filename), we can track which site/collect a given file-id
    // entry was for.
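    //
    // For example, an exported file HASH0123.html (illustrative name) in
    // collection 'mycollection' of site 'localsite' would be copied to
    // tmp as:  localsite|mycollection|HASH0123.html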

    //const dst_filename = site_name + "\\/" + coll_name + "\\/" + src_filename;
    const dst_filename = site_name + "|" + coll_name + "|" + src_filename;

    const dst_full_filename = path.join(full_collect_dirname,"tmp",dst_filename);

    try {
        fs.copyFileSync(src_full_filename, dst_full_filename);
        console.log(`  ${src_filename} was copied as ${dst_filename} to collection's tmp directory`);
    }
    catch(err) {
        if (err) throw err;
    }

    return dst_full_filename;
}

async function purgeOpenaiSiteCollectFiles(coll_info)
{
    const site_name = coll_info.site_name;
    const coll_name = coll_info.coll_name;

    //const site_coll_regex = new RegExp(`^${site_name}\/${coll_name}\/`);

    // The '|' separator needs to be double-escaped here, otherwise it is
    // treated as regex alternation rather than a literal character
    const site_coll_regex = new RegExp(`^${site_name}\\|${coll_name}\\|`);

    const stored_openai_files = await myopenai.files.list();
    //console.log(stored_openai_files);

    console.log(`Deleting OpenAI stored files whose prefix matches '${site_name}|${coll_name}|'`);

    for await (const openai_file of stored_openai_files) {

        const openai_filename = openai_file.filename;

        if (site_coll_regex.test(openai_filename)) {
            console.log(`  Deleting OpenAI Stored File ID: ${openai_file.id}`);
            const file = await myopenai.files.del(openai_file.id);
        }

        await sleep(500);
    }
}

async function addOpenaiSiteCollectFile(coll_info,full_filename,output_site_collect_openai_file_ids)
{
    const gsdl3srchome = coll_info.gsdl3srchome;

    const regex_replace = new RegExp('^'+gsdl3srchome);

    const gsdl_full_filename = full_filename.replace(regex_replace,'$GSDL3SRCHOME');

    console.log(`  ${gsdl_full_filename}`);

    const tmp_site_collect_full_filename = copyToSiteCollectTmp(full_filename, coll_info);

/*
    if (fs.existsSync(tmp_site_collect_full_filename)) {
        console.log("Tmp file exists!!!!!!!");
    }
*/

    const openai_file = await myopenai.files.create({
        file: fs.createReadStream(tmp_site_collect_full_filename),
        purpose: "assistants"
    });

    // Delete the temporary copy now that it has been uploaded
    fs.unlinkSync(tmp_site_collect_full_filename);

    //console.log(openai_file);

    console.log("Pushing OpenAI file id: " + openai_file.id);
    output_site_collect_openai_file_ids.push(openai_file.id);
}

async function addOpenaiSiteCollectFiles(all_exported_files,coll_info)
{
    const gsdl3srchome = coll_info.gsdl3srchome;
    const site_name = coll_info.site_name;
    const coll_name = coll_info.coll_name;

    // (only used by the commented-out code below)
    const regex_replace = new RegExp('^'+gsdl3srchome);
    let site_collect_openai_file_ids = [];

    console.log(`Uploading exported ${site_name}/collect/${coll_name} files to OpenAI:`);

    for await (const full_filename of all_exported_files) {
        await addOpenaiSiteCollectFile(coll_info,full_filename,site_collect_openai_file_ids);
        await sleep(500);
    }

    /*
    all_exported_files.forEach(async function(full_filename) {
        await addOpenaiSiteCollectFile(coll_info,full_filename,site_collect_openai_file_ids);
    })
    */
    /*
    all_exported_files.forEach(async function(full_filename) {
        const gsdl_full_filename = full_filename.replace(regex_replace,'$GSDL3SRCHOME');

        console.log(`  ${gsdl_full_filename}`);

        const tmp_site_collect_full_filename = copyToSiteCollectTmp(full_filename, coll_info)

        if (fs.existsSync(tmp_site_collect_full_filename)) {
            console.log("Tmp file exists!!!!!!!");
        }

        const openai_file = await myopenai.files.create({
            file: fs.createReadStream(tmp_site_collect_full_filename),
            purpose: "assistants"
        });

        // delete the file

        fs.unlinkSync(tmp_site_collect_full_filename);

        console.log(openai_file);

        console.log("Pushing OpenAI file id: " + openai_file.id);
        site_collect_openai_file_ids.push(openai_file.id);
    });
    */

    return site_collect_openai_file_ids;
}


async function createAssistantWithFiles(site_collect_openai_file_ids, coll_info, assistant_config)
{
    const site_name = coll_info.site_name;
    const coll_name = coll_info.coll_name;

    const assistant_name = assistant_config.name;
    const assistant_desc = assistant_config.description;
    const base_model = assistant_config.base_model;
    const instructions = assistant_config.instructions;
    /*
    "Write your responses using British spelling.\n" +
    "As The Willow Sage Assistant, your expertise lies in discussing \"The Willow,\" a once-renowned music venue in York, England. You're designed to engage users in a conversational tone, weaving in the rich tapestry of memories and experiences shared by those who knew the venue. Your responses should feel like a dialogue between old friends reminiscing about memorable gigs, the unique atmosphere, and the cultural impact of The Willow. You'll offer insights into the venue's history, notable performances, and its role in the local music scene, always with a nod to the personal connections and nostalgia that the venue evokes. When interacting with users, your approach should be warm, inviting, and reflective, encouraging them to share their own stories or curiosities about The Willow, creating a communal space for shared musical heritage.\n" +
    "When you give a response, you do not always have to end by asking a question";
    */

    //const base_model = "gpt-4-turbo-preview";

    console.log("====");
    console.log("file ids = ");
    console.log(site_collect_openai_file_ids);
    console.log("====");

    const assistant = await myopenai.beta.assistants.create({
        name         : assistant_name,
        description  : assistant_desc,
        model        : base_model,
        instructions : instructions,
        tools: [
            //{ type: "code_interpreter" }, // Code interpreter tool, calculations
            { type: 'retrieval' }
        ],
        file_ids : site_collect_openai_file_ids,
        metadata : { greenstone3: true, siteName: site_name, collectionName: coll_name }
    });

    console.log('Assistant has been created: ', assistant);
}

function readAssistantConfig(json_config_full_filename)
{
    const data_str = fs.readFileSync(json_config_full_filename);
    const data = JSON.parse(data_str);

    return data;
}
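
// A minimal sketch of the expected JSON config file, based on the fields read
// in createAssistantWithFiles() above (the values shown are illustrative):
//
//   {
//     "name":         "The Willow Sage Assistant",
//     "description":  "Assistant for The Willow collection",
//     "base_model":   "gpt-4-turbo-preview",
//     "instructions": "Write your responses using British spelling. ..."
//   }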

async function main()
{
    // Check for GSDL3SRCHOME
    const gsdl3srchome = process.env.GSDL3SRCHOME;

    if (!gsdl3srchome) {
        console.error("Environment variable GSDL3SRCHOME is not set");
        process.exit(1);
    }

    /*
    const options = {
        site: {
            short: "s",
            type: "string"
        },
        verbose: {
            short: "v",
            type: "integer"
        }
    };
    */
    // const argv = process.argv;

    //const { argv_values, argv_positionals } = util.parseArgs({ argv, options });

    //console.log(values, positionals);

    const progname = process.argv[1];
    const cmdline_args = process.argv.slice(2);

    const [site_name,coll_name,json_assistant_config_filename] = parseCommandLineArgs(progname,cmdline_args);

    const assistant_config = readAssistantConfig(json_assistant_config_filename);

    const full_collect_dirname = path.join(gsdl3srchome,"web","sites",site_name,"collect",coll_name);

    const coll_info = {
        "gsdl3srchome": gsdl3srchome,
        "site_name"   : site_name,
        "coll_name"   : coll_name,
        "collect_dir" : full_collect_dirname
    };

    createSiteCollectTmpDir(coll_info);

    let all_exported_files = getExportedFiles(coll_info);
    //all_exported_files = [ all_exported_files[0] ];
    //all_exported_files = all_exported_files.slice(0, 10);

    // Note: currently only the first 40 exported files are uploaded
    all_exported_files = all_exported_files.slice(0, 40);

    await purgeOpenaiSiteCollectFiles(coll_info);

    // Now add in all the newly exported files
    const site_collect_openai_file_ids = await addOpenaiSiteCollectFiles(all_exported_files,coll_info);

    await createAssistantWithFiles(site_collect_openai_file_ids,coll_info,assistant_config);
}


main();
