Changeset 37646


Ignore:
Timestamp:
2023-04-11T00:54:55+12:00 (13 months ago)
Author:
davidb
Message:

Further coding improvements

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-installations/thewillow/trunk/sites/thewillow/collect/community-contributions/prepare/xlsx-to-csv--thewillow-directorysheet.py

    r37645 r37646  
    88import gdown
    99import openpyxl
    10 import urllib.request
    11 
    12 
     10#import urllib.request
     11
     12# from urllib.request import urlopen
     13
     14import requests
    1315
    1416#import argparse
     
    2224downloads_outputdir = "downloads/"
    2325
     26#import ssl
     27#ssl._create_default_https_context = ssl._create_unverified_context
    2428
    2529
     
    122126        else:
    123127            col_pos = extra_heading_rec["colpos"]
    124             index_pos = col_pos - 1;
    125             worksheet.insert_cols(idx=index_pos)
     128            worksheet.insert_cols(col_pos,1)
    126129            worksheet.cell(row=1,column=col_pos).value = extra_heading
    127130
     
    157160
    158161
    159 def downloadURLDIY(url,ofilename):
    160     print("*** downloadURLDIY() -- currently untested")
    161 
    162     # Based on:
    163     #    https://stackoverflow.com/questions/7243750/download-file-from-web-in-python-3
    164    
    165 
    166     # Download the file from `url` and save it locally under `file_name`:
    167     with urllib.request.urlopen(url) as response, open(ofilename, 'wb') as out_file:
    168         data = response.read() # a `bytes` object
    169 
    170         # Assume we're working with raw bytes that are UTF-8
    171         # If not, then need to decode data
    172         # Something along the lines of ...
    173         # text = data.decode('utf-8') # a `str`; this step can't be used if data is binary
    174            
    175         out_file.write(data)
     162
     163def downloadURL(url):
     164    # Based on 'requests' example on
     165    #    https://www.codingem.com/python-download-file-from-url/
     166
     167    print("Downloading url:")
     168    print("  '" + url +"'")
     169
     170    ofile = url.rsplit('/', 1)[-1] # everything after the last '/'
     171    file_ext = os.path.splitext(ofile)[1]
     172
     173    # Look for a simple file extension on the URL; otherwise assume HTML
     174
     175    if file_ext == "":
     176        ofile += ".html"
     177       
     178    ofilename = os.path.join(downloads_outputdir,ofile)
     179
     180    # Do the actual downloading, and saving to file
     181    response = requests.get(url)
     182    open(ofilename, "wb").write(response.content)
     183
     184#    # The following works for HTML or binary data
     185#    with urlopen(url) as fin:
     186#        data = fin.read()
     187
     188#    with open(ofilename, 'wb') as fout:
     189#        fout.write(data)
     190   
     191   
     192#    # Download the file from `url` and save it locally under `ofilename`:
     193#    with urllib.request.urlopen(url) as response, open(ofilename, 'wb') as out_file:
     194#       data = response.read() # a `bytes` object
     195#        out_file.write(data)
     196
     197
     198    return ofilename
    176199
    177200
    178201def downloadGoogleDoc(url):
    179    
    180202
    181203    print("Downloading Google Doc url:")
     
    196218   
    197219    os.rename(downloaded_filename,dl_identifier_filename)
    198    
     220
     221    return dl_identifier_filename
    199222       
    200223   
     
    222245
    223246        if opt_hyperlink != None:
     247           
    224248            opt_hyperlink_colpos = HeadingsTransformation["OptHyperlink"]["colpos"]
    225249            doc_url = opt_hyperlink.target
    226250            worksheet.cell(row=row_i, column=opt_hyperlink_colpos, value=doc_url)
    227            
    228             downloaded_filename = downloadGoogleDoc(doc_url)
    229 
     251
     252            if (doc_url.startswith("https://drive.google.com") or doc_url.startswith("https://docs.google.com")):
     253                downloaded_filename = downloadGoogleDoc(doc_url)
     254
     255            else:
     256                # If the URL has no filename extension, downloadURL() assumes HTML and appends ".html" as the file extension
     257                downloaded_filename = downloadURL(doc_url)
     258               
    230259            orig_file = os.path.basename(downloaded_filename)
    231260            orig_filename_colpos = HeadingsTransformation["OrigFilename"]["colpos"]
    232261            worksheet.cell(row=row_i, column=orig_filename_colpos, value=orig_file)
    233262
    234             fileRenameToDLIdentifier(downloaded_filename,dl_identifier_str)
     263            renamed_filename = fileRenameToDLIdentifier(downloaded_filename,dl_identifier_str)
     264
     265            filename_colpos = HeadingsTransformation["Filename"]["colpos"]
     266            worksheet.cell(row=row_i, column=filename_colpos, value=renamed_filename)
     267
    235268           
    236269        dl_identifier += 1
Note: See TracChangeset for help on using the changeset viewer.