source: gs3-installations/thewillow/trunk/sites/thewillow/dlcol-chatgpt/create-assistant/main.js@ 38783

Last change on this file since 38783 was 38783, checked in by davidb, 3 months ago

Additional functionality introduced to offset the fact that the OpenAI file-store is quite a flat structure, whereas for our needs the site and collection (normally handled through directories/sub-directories) need to be represented/tracked

  • Property svn:executable set to *
File size: 7.2 KB
Line 
1#!/usr/bin/env node
2
// Make sure you have created a '.env' file in the top-level of
// this NodeJS project, and set the following lines accordingly
// with the relevant values from your OpenAI account:
//
//   OPENAI_API_KEY=sk-????????????????????????????????????????????????
//   ORGANIZATION_ID=org-????????????????????????
//   ASSISTANT_ID=asst_????????????????????????
10
11
// For an (alternative) approach that looks like it retrieves the already created Assistant directly, see:
//   https://gist.github.com/drifterz13/0cbe93ced5dc7958d7841a29c1721d1c
14
15
16
17const dotenv = require('dotenv').config();
18const path = require("path");
19const fs = require("fs");
20
21const openai = require('openai');
22
// Shared OpenAI client used by every function in this script that talks
// to the OpenAI API.  The credentials are read from the process
// environment (populated from '.env' by the dotenv call above).
const myopenai_options = {
    apiKey:       process.env.OPENAI_API_KEY,
    organization: process.env.ORGANIZATION_ID
};

const myopenai = new openai.OpenAI(myopenai_options);
27
28
/**
 * Write the expected command-line syntax to stderr, then terminate the
 * process with exit status 1.
 *
 * @param {string} progname  Text shown as the program name in the usage line.
 */
function printUsage(progname)
{
    const usage_message = `Usage: ${progname} [-site site-name] collect-name`;

    console.error(usage_message);
    process.exit(1);
}
34
35// Based off the standalone solution given on StackOverflow
36// https://stackoverflow.com/questions/41462606/get-all-files-recursively-in-directories-nodejs
37
38
/**
 * Recursively walk a directory, appending the full path of every
 * non-directory entry whose name passes the optional filter to the
 * caller-supplied accumulator array.
 *
 * The walk is best-effort: a directory that cannot be read, or an entry
 * that cannot be stat'ed, is reported on stderr and skipped; the rest of
 * the tree is still visited.
 *
 * @param {string}   input_full_dirname     Directory to scan.
 * @param {string[]} output_full_filenames  Accumulator; matching full filenames are pushed onto it.
 * @param {?RegExp}  opt_regex              When non-null, tested against each file's
 *                                          basename (not its full path); only matches are kept.
 */
function listFilesRec(input_full_dirname,output_full_filenames,opt_regex)
{
    let entry_names;

    try {
        // The second argument of readdirSync() is an options object /
        // encoding, not a filter, so the regex is deliberately NOT passed
        // here -- the filtering is done on each entry below.
        entry_names = fs.readdirSync(input_full_dirname);
    }
    catch (e) {
        console.error("Unable to read directory: " + input_full_dirname + " (" + e.message + ")");
        return;
    }

    for (const entry_name of entry_names) {
        const full_entry_name = path.join(input_full_dirname, entry_name);

        let is_directory;
        try {
            is_directory = fs.statSync(full_entry_name).isDirectory();
        }
        catch (e) {
            // e.g. a dangling symlink: skip this one entry rather than
            // abandoning everything else in the directory
            console.error("Unable to stat: " + full_entry_name + " (" + e.message + ")");
            continue;
        }

        if (is_directory) {
            listFilesRec(full_entry_name,output_full_filenames,opt_regex);
        }
        else if ((opt_regex == null) || opt_regex.test(entry_name)) {
            output_full_filenames.push(full_entry_name);
        }
    }
}
65
66// upgrade to include a regex to filter dirs/filenames
67
/**
 * Collect the full filenames of all files found beneath a directory.
 *
 * Convenience front-end to listFilesRec(): it supplies a fresh
 * accumulator array and hands it back once the recursive walk is done.
 *
 * @param {string}  full_dirname  Directory at the top of the walk.
 * @param {?RegExp} opt_regex     Optional filter, passed through to listFilesRec() unchanged.
 * @returns {string[]} The filenames gathered by listFilesRec().
 */
function listAllFiles(full_dirname,opt_regex)
{
    const gathered_full_filenames = [];

    listFilesRec(full_dirname,gathered_full_filenames,opt_regex);

    return gathered_full_filenames;
}
79
/**
 * Work out the site and collection names from the command-line arguments.
 *
 * Two forms are accepted:
 *   collect-name                  (site defaults to "localsite")
 *   -site site-name collect-name
 *
 * Anything else -- including a collection name that looks like an option
 * (starts with '-'), e.g. a stray "-h" or a lone "-site" -- is treated as
 * a usage error and handed to printUsage(), rather than being silently
 * accepted as a collection name.
 *
 * @param {string}   progname      Program name, passed on to printUsage().
 * @param {string[]} cmdline_args  Arguments following the script name.
 * @returns {Array} Two-element array: [site_name, coll_name].
 */
function parseCommandLineArgs(progname,cmdline_args)
{
    const default_site_name = "localsite";

    let site_name = default_site_name;
    let coll_name = null;

    if (cmdline_args.length === 1) {
        // no site given, default to localsite
        coll_name = cmdline_args[0];
    }
    else if ((cmdline_args.length === 3) && (cmdline_args[0] === "-site")) {
        // of the form: -site xxx col
        site_name = cmdline_args[1];
        coll_name = cmdline_args[2];
    }

    const looks_like_option = (typeof coll_name === "string") && coll_name.startsWith("-");

    if ((coll_name === null) || looks_like_option) {
        printUsage(progname);
    }

    return [site_name,coll_name];
}
107
/**
 * List the '.html' files found beneath the collection's 'export' directory.
 *
 * @param {Object} coll_info              Collection description.
 * @param {string} coll_info.collect_dir  Full path of the collection directory.
 * @returns {string[]} Full filenames of the files ending in '.html', as
 *                     returned by listAllFiles().
 */
function getExportedFiles(coll_info)
{
    const export_full_dirname = path.join(coll_info.collect_dir,"export");

    return listAllFiles(export_full_dirname,/\.html$/);
}
117
/**
 * Ensure the collection has a 'tmp' sub-directory, creating it if absent.
 *
 * The directory is created synchronously so that it is guaranteed to
 * exist by the time this function returns.  Only the 'tmp' directory
 * itself is created: if the collection directory does not exist, the
 * underlying fs.mkdirSync() error is allowed to propagate.
 *
 * @param {Object} coll_info              Collection description.
 * @param {string} coll_info.collect_dir  Full path of the collection directory.
 */
function createSiteCollectTmpDir(coll_info)
{
    const full_collect_dirname = coll_info.collect_dir;

    const full_collect_tmp_dirname = path.join(full_collect_dirname,"tmp");

    if (!fs.existsSync(full_collect_tmp_dirname)) {
        console.log("Creating collection 'tmp' directory");
        // fs.mkdir() needs a callback (and is asynchronous); the
        // synchronous form is what is wanted here
        fs.mkdirSync(full_collect_tmp_dirname);
    }
}
129
/**
 * Copy a file into the collection's 'tmp' directory under a name that
 * encodes the site and collection it belongs to.
 *
 * While OpenAI generates unique IDs for every file stored, its
 * associated (metadata) filename is only the last part (basename) of
 * the full filename.  Encoding the site and collection names into the
 * basename of the copy lets us later work out, from the flat list of
 * files returned by OpenAI's file-store, which site/collection a given
 * file-id entry was for.
 *
 * The name used is:  <site_name>|<coll_name>|<original basename>
 *
 * '|' is a more unusual character to include in a filename, which is
 * why it was chosen as the separator; this decision is documented here
 * because any code matching these names must use the same separator.
 *
 * The copy is performed synchronously, so the returned file is complete
 * and ready to be opened as soon as this function returns.  If the copy
 * fails, the fs.copyFileSync() error propagates to the caller.
 *
 * @param {string} src_full_filename       Full path of the file to copy.
 * @param {Object} coll_info               Collection description.
 * @param {string} coll_info.site_name     Site name encoded into the copy's filename.
 * @param {string} coll_info.coll_name     Collection name encoded into the copy's filename.
 * @param {string} coll_info.collect_dir   Full path of the collection directory (holds 'tmp').
 * @returns {string} Full path of the copy inside the collection's 'tmp' directory.
 */
function copyToSiteCollectTmp(src_full_filename, coll_info)
{
    const site_name = coll_info.site_name;
    const coll_name = coll_info.coll_name;
    const full_collect_dirname = coll_info.collect_dir;

    const src_filename = path.basename(src_full_filename);

    const dst_filename = site_name + "|" + coll_name + "|" + src_filename;

    const dst_full_filename = path.join(full_collect_dirname,"tmp",dst_filename);

    // Synchronous on purpose: callers open the returned file straight away
    fs.copyFileSync(src_full_filename, dst_full_filename);
    console.log(`${src_filename} was copied as ${dst_filename} to collection's tmp directory`);

    return dst_full_filename;
}
170
/**
 * Delete from the OpenAI file-store every file whose stored filename
 * starts with "<site_name>|<coll_name>|", i.e. the files previously
 * uploaded for this site/collection.
 *
 * The match is a literal prefix comparison, NOT a regular expression:
 * '|' is the regex alternation operator, so a pattern built as
 * `^${site_name}\|${coll_name}\|` inside a template literal (where the
 * backslashes are dropped) ends in an empty alternative that matches
 * every filename -- which would delete the files of ALL sites and
 * collections.  A prefix test also copes with site/collection names
 * that contain regex metacharacters.
 *
 * @param {Object} coll_info            Collection description.
 * @param {string} coll_info.site_name  Site name encoded at the start of the stored filename.
 * @param {string} coll_info.coll_name  Collection name encoded after the site name.
 * @returns {Promise<void>} Resolves once all matching files have been deleted.
 */
async function purgeOpenaiSiteCollectFiles(coll_info)
{
    const site_name = coll_info.site_name;
    const coll_name = coll_info.coll_name;

    const site_coll_prefix = `${site_name}|${coll_name}|`;

    const stored_openai_files = await myopenai.files.list();

    for await (const openai_file of stored_openai_files) {

        const openai_filename = openai_file.filename;

        if ((typeof openai_filename === "string") && openai_filename.startsWith(site_coll_prefix)) {
            console.log(` Deleting OpenAI Stored File ID: ${openai_file.id}`);
            await myopenai.files.del(openai_file.id);
        }
    }
}
193
194
195
/**
 * Script entry point: (re)upload a collection's exported HTML files to
 * the OpenAI file-store.
 *
 * Steps:
 *   1. Check GSDL3SRCHOME is set (exits with status 1 if not).
 *   2. Parse the command line into a site and collection name.
 *   3. Make sure the collection has a 'tmp' directory.
 *   4. Purge the files previously stored in OpenAI for this site/collection.
 *   5. Copy each exported '.html' file into 'tmp' under a site/collection
 *      encoded name, and upload that copy with purpose "assistants".
 *
 * Uploads are awaited one at a time, so that the function only resolves
 * once every upload has finished and any upload failure rejects the
 * returned promise (an async callback passed to forEach() is not awaited).
 *
 * @returns {Promise<Object[]>} The file objects returned by OpenAI, one
 *                              per uploaded file, in upload order.
 */
async function main()
{
    // check for GSDL3SRCHOME
    const gsdl3srchome = process.env.GSDL3SRCHOME;

    if (!gsdl3srchome) {
        console.error("Environment variable GSDL3SRCHOME is not set");
        process.exit(1);
    }

    const progname = process.argv[1];
    const cmdline_args = process.argv.slice(2);

    const [site_name,coll_name] = parseCommandLineArgs(progname,cmdline_args);

    const full_collect_dirname = path.join(gsdl3srchome,"web","sites",site_name,"collect",coll_name);

    const coll_info = {
        "gsdl3srchome": gsdl3srchome,
        "site_name"   : site_name,
        "coll_name"   : coll_name,
        "collect_dir" : full_collect_dirname
    };

    createSiteCollectTmpDir(coll_info);

    const all_exported_files = getExportedFiles(coll_info);

    await purgeOpenaiSiteCollectFiles(coll_info);

    const openai_files = [];

    console.log(`Uploading exported ${site_name}/collect/${coll_name} files to OpenAI:`);

    for (const full_filename of all_exported_files) {
        // For display only, abbreviate the leading GSDL3SRCHOME directory.
        // A literal prefix test is used (not a RegExp built from the path),
        // as a filesystem path can contain regex metacharacters.
        const gsdl_full_filename = full_filename.startsWith(gsdl3srchome)
              ? "$GSDL3SRCHOME" + full_filename.slice(gsdl3srchome.length)
              : full_filename;

        console.log(` ${gsdl_full_filename}`);

        const tmp_site_collect_full_filename = copyToSiteCollectTmp(full_filename, coll_info);

        const openai_file = await myopenai.files.create({
            file: fs.createReadStream(tmp_site_collect_full_filename),
            purpose: "assistants"
        });

        openai_files.push(openai_file);
    }

    return openai_files;
}
248
249
// Run the script.  main() is async, so attach a rejection handler: report
// the failure on stderr and exit with a non-zero status, rather than
// leaving the returned promise unhandled.
main().catch((err) => {
    console.error(err);
    process.exit(1);
});
251
Note: See TracBrowser for help on using the repository browser.