Changeset 37645


Ignore:
Timestamp:
2023-04-10T18:55:33+12:00 (13 months ago)
Author:
davidb
Message:

Python script refactored, and then extended to download docs via 'gdown' module

Location:
gs3-installations/thewillow/trunk/sites/thewillow/collect/community-contributions/prepare
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • gs3-installations/thewillow/trunk/sites/thewillow/collect/community-contributions/prepare/README.txt

    r37636 r37645  
    11
    22pip install openpyxl
     3pip install gdown
    34
    45
  • gs3-installations/thewillow/trunk/sites/thewillow/collect/community-contributions/prepare/xlsx-to-csv--thewillow-directorysheet.py

    r37643 r37645  
    33import os
    44import sys
     5import re
    56
    67import csv
     8import gdown
    79import openpyxl
    8 import re
    9 
    10 import os
    11 import re
    12 import sys
     10import urllib.request
     11
     12
    1313
    1414#import argparse
    1515
    16 xlsx_ifilename="Willow Archive Directory.xlsx"
    17 csv_ofilename ="thewillow-directorysheet.csv"
     16xlsx_ifilename = "Willow Archive Directory.xlsx"
     17csv_ofilename  = "thewillow-directorysheet.csv"
    1818
    1919sheet_name="Directory"
    2020
    21 OptHyperlinkColumn=7
    22 #OptHyperlinkColumn=8
    23 
    24 
    25 
    26 
    27 # From the Google-Drive Excel Spreadsheet:
     21# The '/' on the end is important for gdown
     22downloads_outputdir = "downloads/"
     23
     24
     25
     26# Headings 'as is' from Excel spreadsheet, and the Greenstone friendly versions they map to
    2827#
    29 # Ref. No., Title,          Creator,    Description,    Creation Date,  Orginial/ Copy, File,   Library Reference,  Copyright?
    30 #   Theme 1,    Theme 2,    Theme 3,    Object Type 1,  Object Type 2,  Object Type 3,  Notes
    31 
    32 GSFriendlyHeadings = [ "RefNum", "Title", "Creator", "Description", "CreationDate", "OriginalOrCopy", "FileNote", "LibraryRefNum", "InCopyright",
    33                        "Theme",  "Theme", "Theme",   "ObjectType",  "ObjectType",   "ObjectType",     "Notes",
    34                        # Additional column(s) derived from original column data
    35                        "OptHyperlink" ]
    36 
    37 
    38 # Worksheet column positions start at 1
    39 
    40 FileNoteCol     =  7
    41 OptHyperlinkCol = 17
    42 
     28HeadingsTransformation = {
     29    "Ref. No."          : { "gsheading": "RefNum"         },
     30    "Title"             : { "gsheading": "Title"          },
     31    "Creator"           : { "gsheading": "Creator"        },
     32    "Description"       : { "gsheading": "Description"    },
     33    "Creation Date"     : { "gsheading": "CreationDate"   },
     34    "Orginial/ Copy"    : { "gsheading": "OirginalOrCopy" },
     35    "File"              : { "gsheading": "FileNote"       },
     36    "Library Reference" : { "gsheading": "LibraryRefNum"  },
     37    "Copyright?"        : { "gsheading": "InCopyright"    },
     38    "Theme 1"           : { "gsheading": "Theme"          },
     39    "Theme 2"           : { "gsheading": "Theme"          },
     40    "Theme 3"           : { "gsheading": "Theme"          },
     41    "Object Type 1"     : { "gsheading": "ObjectType"     },
     42    "Object Type 2"     : { "gsheading": "ObjectType"     },
     43    "Object Type 3"     : { "gsheading": "ObjectType"     },
     44    "Notes"             : { "gsheading": "Notes"          }
     45}
     46
     47# Additional ones that the tranform process will add in
     48#
     49ExtraHeadings = {
     50    "Filename"     : { "colpos": 1 },
     51    "DLIdentifier" : { "colpos": 2 },
     52    "OrigFilename" : { "colpos": 3 },
     53    "OptHyperlink" : None
     54}
     55   
    4356
    4457
     
    91104
    92105
     106def initHeadingsTransformation(worksheet,num_cols):
     107
     108    # Add in extra headings
     109    #
     110    for extra_heading in ExtraHeadings:
     111        extra_heading_rec = ExtraHeadings[extra_heading]
     112
     113        added_colpos = None
     114       
     115        if extra_heading_rec is None:
     116            # append to end
     117            next_free_col = num_cols + 1
     118            worksheet.insert_cols(idx=next_free_col)
     119            worksheet.cell(row=1,column=next_free_col).value = extra_heading           
     120
     121            added_colpos = next_free_col
     122        else:
     123            col_pos = extra_heading_rec["colpos"]
     124            index_pos = col_pos - 1;
     125            worksheet.insert_cols(idx=index_pos)
     126            worksheet.cell(row=1,column=col_pos).value = extra_heading
     127
     128            added_colpos = col_pos
     129
     130        HeadingsTransformation[extra_heading] = { "gsheading": extra_heading, "colpos": added_colpos }
     131       
     132        num_cols += 1
     133
     134       
     135    # Now work out the index position of each heading
     136    #
     137    for i in range(0,num_cols):
     138        colpos = i + 1;
     139        col_heading = worksheet.cell(row=1,column=colpos).value
     140       
     141        if col_heading is not None:
     142            HeadingsTransformation[col_heading.strip()]["colpos"] = colpos
     143
     144    # Returned increased num_cols value
     145   
     146    return num_cols       
     147   
    93148def transformHeadings(worksheet,num_cols):
    94149
    95150    for i in range(0,num_cols):
    96         col = i+1;
    97         worksheet.cell(row=1,column=col, value=GSFriendlyHeadings[i])
    98 
    99 
    100 def transformWorksheet(worksheet, num_rows, num_cols):
    101 
    102     #
    103     # Transform the worksheet into a form sutiable for processing by
    104     # Greenstone's CSVPlugin
    105     #
    106 
    107 
    108 
    109     # Transform 1: add in Filename column as the first column
    110 
    111     #worksheet.insert_cols(idx=0)
    112     #worksheet.cell(row=1,column=1).value = "Filename"
    113 
    114 
    115     # Transform 2: add in new column that explicitly specifies the hyperlink as a value
    116 
    117     # (skip the first row with headers)
     151        colpos = i + 1;
     152        heading = worksheet.cell(row=1,column=colpos).value
     153       
     154        if heading is not None:
     155            gs_heading = HeadingsTransformation[heading.strip()]["gsheading"]           
     156            worksheet.cell(row=1,column=colpos, value=gs_heading)
     157
     158
     159def downloadURLDIY(url,ofilename):
     160    print("*** downloadURLDIY() -- currently untested")
     161
     162    # Based on:
     163    #    https://stackoverflow.com/questions/7243750/download-file-from-web-in-python-3
     164   
     165
     166    # Download the file from `url` and save it locally under `file_name`:
     167    with urllib.request.urlopen(url) as response, open(ofilename, 'wb') as out_file:
     168        data = response.read() # a `bytes` object
     169
     170        # Assume we're working with raw bytes that are UTF-8
     171        # If not, then need to decode data
     172        # Something along the lines of ...
     173        # text = data.decode('utf-8') # a `str`; this step can't be used if data is binary
     174           
     175        out_file.write(data)
     176
     177
     178def downloadGoogleDoc(url):
     179   
     180
     181    print("Downloading Google Doc url:")
     182    print("  " + url)
     183   
     184    download_filename = gdown.download(url, output=downloads_outputdir, fuzzy=True, quiet=True)
     185
     186    print("Downloaded filename:", download_filename)
     187
     188    return download_filename
     189
     190def fileRenameToDLIdentifier(downloaded_filename,dl_identifier_str):
     191   
     192    downloaded_dir  = os.path.dirname(downloaded_filename)
     193    file_ext = os.path.splitext(downloaded_filename)[1]
     194   
     195    dl_identifier_filename = os.path.join(downloaded_dir,dl_identifier_str+file_ext)
     196   
     197    os.rename(downloaded_filename,dl_identifier_filename)
     198   
     199       
     200   
     201def processWorksheetValues(worksheet, num_rows, num_cols):
     202   
     203    dl_identifier = 1
     204
     205    # Skip the first row with headers in the worksheet
     206    #
    118207    row_i = 2
    119208   
    120209    while row_i<=num_rows:
    121210       
    122         print("  Processing row: " + str(row_i))
    123        
    124         # print(worksheet.cell(row=row_i, column=7).hyperlink.target)
    125         cell = worksheet.cell(row=row_i, column=OptHyperlinkColumn)
    126         opt_hyperlink = cell.hyperlink
    127    
     211        # print("    row: " + str(row_i))
     212
     213        dl_identifier_str = "tw-contrib-%04d" % dl_identifier
     214       
     215        dlidentifier_colpos = HeadingsTransformation["DLIdentifier"]["colpos"]
     216        worksheet.cell(row=row_i, column=dlidentifier_colpos, value=dl_identifier_str)
     217               
     218        filenote_colpos = HeadingsTransformation["File"]["colpos"]
     219       
     220        filenote_cell = worksheet.cell(row=row_i, column=filenote_colpos)
     221        opt_hyperlink = filenote_cell.hyperlink
     222
    128223        if opt_hyperlink != None:
    129             print(opt_hyperlink.target)
    130         #   else:
    131         #       print(cell.value)
    132    
    133         row_i = row_i + 1
     224            opt_hyperlink_colpos = HeadingsTransformation["OptHyperlink"]["colpos"]
     225            doc_url = opt_hyperlink.target
     226            worksheet.cell(row=row_i, column=opt_hyperlink_colpos, value=doc_url)
     227           
     228            downloaded_filename = downloadGoogleDoc(doc_url)
     229
     230            orig_file = os.path.basename(downloaded_filename)
     231            orig_filename_colpos = HeadingsTransformation["OrigFilename"]["colpos"]
     232            worksheet.cell(row=row_i, column=orig_filename_colpos, value=orig_file)
     233
     234            fileRenameToDLIdentifier(downloaded_filename,dl_identifier_str)
     235           
     236        dl_identifier += 1
     237        row_i += 1
    134238
    135239
     
    196300    #****
    197301   
     302    print("")
    198303    print("Loading Excel spreadsheet:")
    199     print("    " + xlsx_ifilename)   
     304    print("  " + xlsx_ifilename)   
    200305    workbook  = openpyxl.load_workbook(xlsx_ifilename)
    201306    worksheet = workbook[sheet_name]
     
    208313    num_cols = trimmed_last_col # spreadsheet index positions start at (1,1) not (0,0)
    209314
     315    print("")
    210316    print("Trimmed worksheet row x col: " + str(num_rows) + " x " + str(num_cols))
    211    
     317
     318
     319    if not os.path.exists(downloads_outputdir):
     320        print("")
     321        print("Creating downloads output directory:")
     322        print("  " + downloads_outputdir)
     323        os.makedirs(downloads_outputdir)
     324   
     325    print("")
    212326    print("Transforming Excel worksheet to a Greenstone compatible form")
     327
     328    print("  Adding in extra Greenstone headings")
     329    num_cols = initHeadingsTransformation(worksheet,num_cols)
     330
     331    print("  Transforming Excel spreadsheet headings to Greenstone friendly ones")
    213332    transformHeadings(worksheet,num_cols)
    214     transformWorksheet(worksheet,num_rows,num_cols)
    215 
     333
     334    print("  Processing values")
     335
     336    processWorksheetValues(worksheet,num_rows,num_cols)
     337
     338    print("")
    216339    print("Saving the converted Excel spreadsheet to CSV:")
    217     print("    " + csv_ofilename)
     340    print("  " + csv_ofilename)
    218341
    219342    outputCSV(csv_ofilename,num_rows,num_cols)
Note: See TracChangeset for help on using the changeset viewer.