Changeset 38783


Ignore:
Timestamp:
2024-02-25T00:15:29+13:00 (2 months ago)
Author:
davidb
Message:

Additional functionality introduced to offset the fact that the OpenAI file-store is quite a flat structure, whereas for our needs the site and collection (normally handled through directories/sub-directories) need to be represented/tracked

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-installations/thewillow/trunk/sites/thewillow/dlcol-chatgpt/create-assistant/main.js

    r38781 r38783  
    3737
    3838
    39 function listFilesRec(input_full_dirname,output_full_filenames)
     39function listFilesRec(input_full_dirname,output_full_filenames,opt_regex)
    4040{
    4141    try {
    42     let filenames_and_dirnames = fs.readdirSync(input_full_dirname);
     42    let filenames_and_dirnames = fs.readdirSync(input_full_dirname,opt_regex);
    4343
    4444    filenames_and_dirnames.forEach(filename_or_dirname => {
    4545       
    4646        const full_filename_or_dirname = path.join(input_full_dirname, filename_or_dirname);
    47        
     47       
    4848        if (fs.statSync(full_filename_or_dirname).isDirectory()) {
    4949        const inner_full_dirname = full_filename_or_dirname;
    50         listFilesRec(inner_full_dirname,output_full_filenames);
     50        listFilesRec(inner_full_dirname,output_full_filenames,opt_regex);
    5151        }
    5252        else {
    53         const inner_full_filename = full_filename_or_dirname;
    54         output_full_filenames.push(inner_full_filename);
     53        if ((opt_regex == null) || opt_regex.test(filename_or_dirname)) {
     54
     55            const inner_full_filename = full_filename_or_dirname;
     56            output_full_filenames.push(inner_full_filename);
     57        }
    5558        }
    5659    });
     
    6366// upgrade to include a regex to filter dirs/filenames
    6467
    65 function listAllFiles(full_dirname)
     68function listAllFiles(full_dirname,opt_regex)
    6669{
    6770    let all_files  = [];
    6871
    69     listFilesRec(full_dirname,all_files);
    70 
     72   
     73    //const opt_regex = (opt_regex_str != null) ? new RegExp(opt_regex_str) : null;
     74
     75    listFilesRec(full_dirname,all_files,opt_regex);
     76   
    7177    return all_files;
    7278}
    7379
    74 
    75 
    76 
    77 // check for GSDL3SRCHOME
    78 const gsdl3srchome = process.env.GSDL3SRCHOME
    79 
    80 if (!gsdl3srchome) {
    81     console.error("Environment variable GSDL3SRCHOME is not set");
    82     process.exit(1);
    83 }
    84 
    85 const progname = process.argv[1];
    86 const cmdline_args = process.argv.slice(2);
    87 
    88 
    89 let site_name = "localsite";
    90 let coll_name  = null;
    91 
    92 if (cmdline_args.length == 3) {
    93     // looks like we're in the form, -site xxx col
    94     if (cmdline_args[0] == "-site") {
    95     site_name = cmdline_args[1];
     80function parseCommandLineArgs(progname,cmdline_args)
     81{
     82
     83    let site_name = "localsite";
     84    let coll_name  = null;
     85   
     86    if (cmdline_args.length == 3) {
     87    // looks like we're in the form, -site xxx col
     88    if (cmdline_args[0] == "-site") {
     89        site_name = cmdline_args[1];
     90    }
     91    else {
     92        printUsage(progname);
     93    }
     94   
     95    coll_name = cmdline_args[2];
     96    }
     97    else if (cmdline_args.length == 1) {
     98    // no site given, default to localsite
     99    coll_name = cmdline_args[0];
    96100    }
    97101    else {
     
    99103    }
    100104
    101     coll_name = cmdline_args[2];
    102 }
    103 else if (cmdline_args.length == 1) {
    104     // no site given, default to localsite
    105     coll_name = cmdline_args[0];
    106 }
    107 else {
    108     printUsage(progname);
    109 }
    110 
    111 const full_export_dirname = path.join(gsdl3srchome,"web","sites",site_name,"collect",coll_name,"export");
    112 
    113 
    114 let all_exported_files = listAllFiles(full_export_dirname);
    115 console.log(all_exported_files);
    116 
    117 
    118 
    119 
    120 /*
    121 const file = await openai.files.create({
    122   file: fs.createReadStream("mydata.csv"),
    123   purpose: "assistants",
    124 });
    125 */
     105    return [site_name,coll_name];
     106}
     107
     108function getExportedFiles(coll_info)
     109{
     110    const full_collect_dirname = coll_info.collect_dir;
     111   
     112    const full_export_dirname = path.join(full_collect_dirname,"export");
     113    let all_exported_files = listAllFiles(full_export_dirname,/\.html$/);
     114
     115    return all_exported_files
     116}
     117
     118function createSiteCollectTmpDir(coll_info)
     119{
     120    const full_collect_dirname = coll_info.collect_dir;
     121
     122    const full_collect_tmp_dirname = path.join(full_collect_dirname,"tmp");
     123
     124    if (!fs.existsSync(full_collect_tmp_dirname)) {
     125    console.log("Creating collection 'tmp' directory");
     126    fs.mkdir(full_collect_tmp_dirname);
     127    }
     128}
     129
     130function copyToSiteCollectTmp(src_full_filename, coll_info)
     131{
     132    // While OpenAI generates unique IDs for every file stored, its
     133    // associated (metadata) filename is only the last part (basename)
     134    // of the full filename
     135
     136    // This function exists make a copy of the file that is being
     137    // imported storing it in the collection's 'tmp' folder.  In
     138    // making the copy, it changes the filename in tmp to be one that
     139    // encode the site and collect names
     140
     141    const site_name = coll_info.site_name;
     142    const coll_name = coll_info.coll_name;
     143    const full_collect_dirname = coll_info.collect_dir;
     144   
     145    const src_filename = path.basename(src_full_filename);
     146
     147    // In encoding the site and collect name into the destination file,
     148    // we have chosen to do this with '|' as a separator character,
     149    // suitable escaped, of course, where needed!
     150    //
     151    // As this is a more unusual character to include in a file, we
     152    // document the discision here.  It is done so this way so when we
     153    // look through all the files returned by openai form its
     154    // file-store (which, recall, uses only the tail-part of a
     155    // filename) we can can track which site/collect a given file-id
     156    // entry was fore.
     157
     158    //const dst_filename = site_name + "\\/" + coll_name + "\\/" + src_filename;
     159    const dst_filename = site_name + "|" + coll_name + "|" + src_filename;
     160   
     161    const dst_full_filename = path.join(full_collect_dirname,"tmp",dst_filename);
     162   
     163    fs.copyFile(src_full_filename, dst_full_filename, (err) => {
     164    if (err) throw err;
     165    console.log(`${src_filename} was copied as ${dst_filename} to collection's tmp directory`);
     166    });
     167
     168    return dst_full_filename;
     169}
     170
     171async function purgeOpenaiSiteCollectFiles(coll_info)
     172{
     173   
     174    const site_name = coll_info.site_name;
     175    const coll_name = coll_info.coll_name;
     176
     177    //const site_coll_regex = new RegExp(`^${site_name}\/${coll_name}\/`);
     178    const site_coll_regex = new RegExp(`^${site_name}\|${coll_name}\|`);
     179   
     180    const stored_openai_files = await myopenai.files.list();
     181    console.log(stored_openai_files);
     182   
     183    for await (const openai_file of stored_openai_files) {
     184   
     185    const openai_filename = openai_file.filename;
     186   
     187    if (site_coll_regex.test(openai_filename)) {
     188        console.log(`  Deleting OpenAI Stored File ID: ${openai_file.id}`);
     189        const file = await myopenai.files.del(openai_file.id);
     190    }
     191    }
     192}   
     193
     194   
     195   
     196async function main()
     197{
     198    // check for GSDL3SRCHOME
     199    const gsdl3srchome = process.env.GSDL3SRCHOME
     200
     201    if (!gsdl3srchome) {
     202    console.error("Environment variable GSDL3SRCHOME is not set");
     203    process.exit(1);
     204    }
     205
     206    const progname = process.argv[1];
     207    const cmdline_args = process.argv.slice(2);
     208
     209    const [site_name,coll_name] = parseCommandLineArgs(progname,cmdline_args);
     210
     211    const full_collect_dirname = path.join(gsdl3srchome,"web","sites",site_name,"collect",coll_name);
     212   
     213    const coll_info = {
     214    "gsdl3srchome": gsdl3srchome,
     215    "site_name"   : site_name,
     216    "coll_name"   : coll_name,
     217    "collect_dir" : full_collect_dirname   
     218    };
     219   
     220    createSiteCollectTmpDir(coll_info);
     221   
     222    let all_exported_files = getExportedFiles(coll_info)
     223    //all_exported_files = [ all_exported_files[0] ];
     224
     225    await purgeOpenaiSiteCollectFiles(coll_info);
     226
     227    const regex_replace = new RegExp('^'+gsdl3srchome);
     228    let openai_files = [];
     229   
     230    console.log(`Uploading exported ${site_name}/collect/${coll_name} files to OpenAI:`)
     231    all_exported_files.forEach(async function(full_filename) {
     232    const gsdl_full_filename = full_filename.replace(regex_replace,'$GSDL3SRCHOME');
     233   
     234    console.log(`  ${gsdl_full_filename}`);
     235
     236    const tmp_site_collect_full_filename = copyToSiteCollectTmp(full_filename, coll_info)
     237   
     238   
     239    const openai_file = await myopenai.files.create({
     240        file: fs.createReadStream(tmp_site_collect_full_filename),
     241        purpose: "assistants"
     242    });
     243   
     244   
     245    openai_files.push(openai_file);
     246    });
     247}
     248
     249
     250main();
     251
Note: See TracChangeset for help on using the changeset viewer.