Context Navigation

← Previous Changeset
Next Changeset →

Changeset 37645

Timestamp:

2023-04-10T18:55:33+12:00 (13 months ago)

Author:

davidb

Message:

Python script refactored, and then extended to download docs via 'gdown' module

Location:

gs3-installations/thewillow/trunk/sites/thewillow/collect/community-contributions/prepare

Files:

: 2 edited

README.txt (modified) (1 diff)
xlsx-to-csv--thewillow-directorysheet.py (modified) (4 diffs)

Legend:

: Unmodified
: Added
: Removed

gs3-installations/thewillow/trunk/sites/thewillow/collect/community-contributions/prepare/README.txt

r37636	r37645
1	1
2	2	pip install openpyxl
	3	pip install gdown
3	4
4	5

gs3-installations/thewillow/trunk/sites/thewillow/collect/community-contributions/prepare/xlsx-to-csv--thewillow-directorysheet.py

-              r37643
+              r37645
 import os
 import sys
+import re
 import csv
+import gdown
 import openpyxl
+import re
+import os
+import re
+import sys
+import urllib.request
 #import argparse
 xlsx_ifilename="Willow Archive Directory.xlsx"
 csv_ofilename ="thewillow-directorysheet.csv"
+xlsx_ifilename = "Willow Archive Directory.xlsx"
+csv_ofilename  = "thewillow-directorysheet.csv"
 sheet_name="Directory"
+OptHyperlinkColumn=7
+#OptHyperlinkColumn=8
+# From the Google-Drive Excel Spreadsheet:
+# The '/' on the end is important for gdown
+downloads_outputdir = "downloads/"
+# Headings 'as is' from Excel spreadsheet, and the Greenstone friendly versions they map to
+#
+# Ref. No., Title,          Creator,    Description,    Creation Date,  Orginial/ Copy, File,   Library Reference,  Copyright?
+#   Theme 1,    Theme 2,    Theme 3,    Object Type 1,  Object Type 2,  Object Type 3,  Notes
+GSFriendlyHeadings = [ "RefNum", "Title", "Creator", "Description", "CreationDate", "OriginalOrCopy", "FileNote", "LibraryRefNum", "InCopyright",
+                       "Theme",  "Theme", "Theme",   "ObjectType",  "ObjectType",   "ObjectType",     "Notes",
+                       # Additional column(s) derived from original column data
+                       "OptHyperlink" ]
+# Worksheet column positions start at 1
+FileNoteCol     =  7
+OptHyperlinkCol = 17
+HeadingsTransformation = {
+    "Ref. No."          : { "gsheading": "RefNum"         },
+    "Title"             : { "gsheading": "Title"          },
+    "Creator"           : { "gsheading": "Creator"        },
+    "Description"       : { "gsheading": "Description"    },
+    "Creation Date"     : { "gsheading": "CreationDate"   },
+    "Orginial/ Copy"    : { "gsheading": "OirginalOrCopy" },
+    "File"              : { "gsheading": "FileNote"       },
+    "Library Reference" : { "gsheading": "LibraryRefNum"  },
+    "Copyright?"        : { "gsheading": "InCopyright"    },
+    "Theme 1"           : { "gsheading": "Theme"          },
+    "Theme 2"           : { "gsheading": "Theme"          },
+    "Theme 3"           : { "gsheading": "Theme"          },
+    "Object Type 1"     : { "gsheading": "ObjectType"     },
+    "Object Type 2"     : { "gsheading": "ObjectType"     },
+    "Object Type 3"     : { "gsheading": "ObjectType"     },
+    "Notes"             : { "gsheading": "Notes"          }
+}
+# Additional ones that the transform process will add in
+#
+ExtraHeadings = {
+    "Filename"     : { "colpos": 1 },
+    "DLIdentifier" : { "colpos": 2 },
+    "OrigFilename" : { "colpos": 3 },
+    "OptHyperlink" : None
+}
 …
+def initHeadingsTransformation(worksheet,num_cols):
+    # Add in extra headings
+    #
+    for extra_heading in ExtraHeadings:
+        extra_heading_rec = ExtraHeadings[extra_heading]
+        added_colpos = None
+        if extra_heading_rec is None:
+            # append to end
+            next_free_col = num_cols + 1
+            worksheet.insert_cols(idx=next_free_col)
+            worksheet.cell(row=1,column=next_free_col).value = extra_heading
+            added_colpos = next_free_col
+        else:
+            col_pos = extra_heading_rec["colpos"]
+            index_pos = col_pos - 1;
+            worksheet.insert_cols(idx=index_pos)
+            worksheet.cell(row=1,column=col_pos).value = extra_heading
+            added_colpos = col_pos
+        HeadingsTransformation[extra_heading] = { "gsheading": extra_heading, "colpos": added_colpos }
+        num_cols += 1
+    # Now work out the index position of each heading
+    #
+    for i in range(0,num_cols):
+        colpos = i + 1;
+        col_heading = worksheet.cell(row=1,column=colpos).value
+        if col_heading is not None:
+            HeadingsTransformation[col_heading.strip()]["colpos"] = colpos
+    # Returned increased num_cols value
+    return num_cols
 def transformHeadings(worksheet,num_cols):
     for i in range(0,num_cols):
+        col = i+1;
+        worksheet.cell(row=1,column=col, value=GSFriendlyHeadings[i])
+def transformWorksheet(worksheet, num_rows, num_cols):
+    #
+    # Transform the worksheet into a form suitable for processing by
+    # Greenstone's CSVPlugin
+    #
+    # Transform 1: add in Filename column as the first column
+    #worksheet.insert_cols(idx=0)
+    #worksheet.cell(row=1,column=1).value = "Filename"
+    # Transform 2: add in new column that explicitly specifies the hyperlink as a value
+    # (skip the first row with headers)
+        colpos = i + 1;
+        heading = worksheet.cell(row=1,column=colpos).value
+        if heading is not None:
+            gs_heading = HeadingsTransformation[heading.strip()]["gsheading"]
+            worksheet.cell(row=1,column=colpos, value=gs_heading)
+def downloadURLDIY(url,ofilename):
+    print("*** downloadURLDIY() -- currently untested")
+    # Based on:
+    #    https://stackoverflow.com/questions/7243750/download-file-from-web-in-python-3
+    # Download the file from `url` and save it locally under `file_name`:
+    with urllib.request.urlopen(url) as response, open(ofilename, 'wb') as out_file:
+        data = response.read() # a `bytes` object
+        # Assume we're working with raw bytes that are UTF-8
+        # If not, then need to decode data
+        # Something along the lines of ...
+        # text = data.decode('utf-8') # a `str`; this step can't be used if data is binary
+        out_file.write(data)
+def downloadGoogleDoc(url):
+    print("Downloading Google Doc url:")
+    print("  " + url)
+    download_filename = gdown.download(url, output=downloads_outputdir, fuzzy=True, quiet=True)
+    print("Downloaded filename:", download_filename)
+    return download_filename
+def fileRenameToDLIdentifier(downloaded_filename,dl_identifier_str):
+    downloaded_dir  = os.path.dirname(downloaded_filename)
+    file_ext = os.path.splitext(downloaded_filename)[1]
+    dl_identifier_filename = os.path.join(downloaded_dir,dl_identifier_str+file_ext)
+    os.rename(downloaded_filename,dl_identifier_filename)
+def processWorksheetValues(worksheet, num_rows, num_cols):
+    dl_identifier = 1
+    # Skip the first row with headers in the worksheet
+    #
     row_i = 2
     while row_i<=num_rows:
+        print("  Processing row: " + str(row_i))
+        # print(worksheet.cell(row=row_i, column=7).hyperlink.target)
+        cell = worksheet.cell(row=row_i, column=OptHyperlinkColumn)
+        opt_hyperlink = cell.hyperlink
+        # print("    row: " + str(row_i))
+        dl_identifier_str = "tw-contrib-%04d" % dl_identifier
+        dlidentifier_colpos = HeadingsTransformation["DLIdentifier"]["colpos"]
+        worksheet.cell(row=row_i, column=dlidentifier_colpos, value=dl_identifier_str)
+        filenote_colpos = HeadingsTransformation["File"]["colpos"]
+        filenote_cell = worksheet.cell(row=row_i, column=filenote_colpos)
+        opt_hyperlink = filenote_cell.hyperlink
         if opt_hyperlink != None:
+            print(opt_hyperlink.target)
+        #   else:
+        #       print(cell.value)
+        row_i = row_i + 1
+            opt_hyperlink_colpos = HeadingsTransformation["OptHyperlink"]["colpos"]
+            doc_url = opt_hyperlink.target
+            worksheet.cell(row=row_i, column=opt_hyperlink_colpos, value=doc_url)
+            downloaded_filename = downloadGoogleDoc(doc_url)
+            orig_file = os.path.basename(downloaded_filename)
+            orig_filename_colpos = HeadingsTransformation["OrigFilename"]["colpos"]
+            worksheet.cell(row=row_i, column=orig_filename_colpos, value=orig_file)
+            fileRenameToDLIdentifier(downloaded_filename,dl_identifier_str)
+        dl_identifier += 1
+        row_i += 1
 …
     #****
+    print("")
     print("Loading Excel spreadsheet:")
     print("    " + xlsx_ifilename)
+    print("  " + xlsx_ifilename)
     workbook  = openpyxl.load_workbook(xlsx_ifilename)
     worksheet = workbook[sheet_name]
 …
     num_cols = trimmed_last_col # spreadsheet index positions start at (1,1) not (0,0)
+    print("")
     print("Trimmed worksheet row x col: " + str(num_rows) + " x " + str(num_cols))
+    if not os.path.exists(downloads_outputdir):
+        print("")
+        print("Creating downloads output directory:")
+        print("  " + downloads_outputdir)
+        os.makedirs(downloads_outputdir)
+    print("")
     print("Transforming Excel worksheet to a Greenstone compatible form")
+    print("  Adding in extra Greenstone headings")
+    num_cols = initHeadingsTransformation(worksheet,num_cols)
+    print("  Transforming Excel spreadsheet headings to Greenstone friendly ones")
     transformHeadings(worksheet,num_cols)
+    transformWorksheet(worksheet,num_rows,num_cols)
+    print("  Processing values")
+    processWorksheetValues(worksheet,num_rows,num_cols)
+    print("")
     print("Saving the converted Excel spreadsheet to CSV:")
     print("    " + csv_ofilename)
+    print("  " + csv_ofilename)
     outputCSV(csv_ofilename,num_rows,num_cols)

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 37645

Legend:

gs3-installations/thewillow/trunk/sites/thewillow/collect/community-contributions/prepare/README.txt

gs3-installations/thewillow/trunk/sites/thewillow/collect/community-contributions/prepare/xlsx-to-csv--thewillow-directorysheet.py

Download in other formats: