Changeset 37645
- Timestamp:
- 2023-04-10T18:55:33+12:00 (13 months ago)
- Location:
- gs3-installations/thewillow/trunk/sites/thewillow/collect/community-contributions/prepare
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-installations/thewillow/trunk/sites/thewillow/collect/community-contributions/prepare/README.txt
r37636 r37645 1 1 2 2 pip install openpyxl 3 pip install gdown 3 4 4 5 -
gs3-installations/thewillow/trunk/sites/thewillow/collect/community-contributions/prepare/xlsx-to-csv--thewillow-directorysheet.py
r37643 r37645 3 3 import os 4 4 import sys 5 import re 5 6 6 7 import csv 8 import gdown 7 9 import openpyxl 8 import re 9 10 import os 11 import re 12 import sys 10 import urllib.request 11 12 13 13 14 14 #import argparse 15 15 16 xlsx_ifilename ="Willow Archive Directory.xlsx"17 csv_ofilename ="thewillow-directorysheet.csv"16 xlsx_ifilename = "Willow Archive Directory.xlsx" 17 csv_ofilename = "thewillow-directorysheet.csv" 18 18 19 19 sheet_name="Directory" 20 20 21 OptHyperlinkColumn=7 22 #OptHyperlinkColumn=8 23 24 25 26 27 # From the Google-Drive Excel Spreadsheet: 21 # The '/' on the end is important for gdown 22 downloads_outputdir = "downloads/" 23 24 25 26 # Headings 'as is' from Excel spreadsheet, and the Greenstone friendly versions they map to 28 27 # 29 # Ref. No., Title, Creator, Description, Creation Date, Orginial/ Copy, File, Library Reference, Copyright? 30 # Theme 1, Theme 2, Theme 3, Object Type 1, Object Type 2, Object Type 3, Notes 31 32 GSFriendlyHeadings = [ "RefNum", "Title", "Creator", "Description", "CreationDate", "OriginalOrCopy", "FileNote", "LibraryRefNum", "InCopyright", 33 "Theme", "Theme", "Theme", "ObjectType", "ObjectType", "ObjectType", "Notes", 34 # Additional column(s) derived from original column data 35 "OptHyperlink" ] 36 37 38 # Worksheet column positions start at 1 39 40 FileNoteCol = 7 41 OptHyperlinkCol = 17 42 28 HeadingsTransformation = { 29 "Ref. No." : { "gsheading": "RefNum" }, 30 "Title" : { "gsheading": "Title" }, 31 "Creator" : { "gsheading": "Creator" }, 32 "Description" : { "gsheading": "Description" }, 33 "Creation Date" : { "gsheading": "CreationDate" }, 34 "Orginial/ Copy" : { "gsheading": "OirginalOrCopy" }, 35 "File" : { "gsheading": "FileNote" }, 36 "Library Reference" : { "gsheading": "LibraryRefNum" }, 37 "Copyright?" : { "gsheading": "InCopyright" }, 38 "Theme 1" : { "gsheading": "Theme" }, 39 "Theme 2" : { "gsheading": "Theme" }, 40 "Theme 3" : { "gsheading": "Theme" }, 41 "Object Type 1" : { "gsheading": "ObjectType" }, 42 "Object Type 2" : { "gsheading": "ObjectType" }, 43 "Object Type 3" : { "gsheading": "ObjectType" }, 44 "Notes" : { "gsheading": "Notes" } 45 } 46 47 # Additional ones that the tranform process will add in 48 # 49 ExtraHeadings = { 50 "Filename" : { "colpos": 1 }, 51 "DLIdentifier" : { "colpos": 2 }, 52 "OrigFilename" : { "colpos": 3 }, 53 "OptHyperlink" : None 54 } 55 43 56 44 57 … … 91 104 92 105 106 def initHeadingsTransformation(worksheet,num_cols): 107 108 # Add in extra headings 109 # 110 for extra_heading in ExtraHeadings: 111 extra_heading_rec = ExtraHeadings[extra_heading] 112 113 added_colpos = None 114 115 if extra_heading_rec is None: 116 # append to end 117 next_free_col = num_cols + 1 118 worksheet.insert_cols(idx=next_free_col) 119 worksheet.cell(row=1,column=next_free_col).value = extra_heading 120 121 added_colpos = next_free_col 122 else: 123 col_pos = extra_heading_rec["colpos"] 124 index_pos = col_pos - 1; 125 worksheet.insert_cols(idx=index_pos) 126 worksheet.cell(row=1,column=col_pos).value = extra_heading 127 128 added_colpos = col_pos 129 130 HeadingsTransformation[extra_heading] = { "gsheading": extra_heading, "colpos": added_colpos } 131 132 num_cols += 1 133 134 135 # Now work out the index position of each heading 136 # 137 for i in range(0,num_cols): 138 colpos = i + 1; 139 col_heading = worksheet.cell(row=1,column=colpos).value 140 141 if col_heading is not None: 142 HeadingsTransformation[col_heading.strip()]["colpos"] = colpos 143 144 # Returned increased num_cols value 145 146 return num_cols 147 93 148 def transformHeadings(worksheet,num_cols): 94 149 95 150 for i in range(0,num_cols): 96 col = i+1; 97 worksheet.cell(row=1,column=col, value=GSFriendlyHeadings[i]) 98 99 100 def transformWorksheet(worksheet, num_rows, num_cols): 101 102 # 103 # Transform the worksheet into a form sutiable for processing by 104 # Greenstone's CSVPlugin 105 # 106 107 108 109 # Transform 1: add in Filename column as the first column 110 111 #worksheet.insert_cols(idx=0) 112 #worksheet.cell(row=1,column=1).value = "Filename" 113 114 115 # Transform 2: add in new column that explicitly specifies the hyperlink as a value 116 117 # (skip the first row with headers) 151 colpos = i + 1; 152 heading = worksheet.cell(row=1,column=colpos).value 153 154 if heading is not None: 155 gs_heading = HeadingsTransformation[heading.strip()]["gsheading"] 156 worksheet.cell(row=1,column=colpos, value=gs_heading) 157 158 159 def downloadURLDIY(url,ofilename): 160 print("*** downloadURLDIY() -- currently untested") 161 162 # Based on: 163 # https://stackoverflow.com/questions/7243750/download-file-from-web-in-python-3 164 165 166 # Download the file from `url` and save it locally under `file_name`: 167 with urllib.request.urlopen(url) as response, open(ofilename, 'wb') as out_file: 168 data = response.read() # a `bytes` object 169 170 # Assume we're working with raw bytes that are UTF-8 171 # If not, then need to decode data 172 # Something along the lines of ... 173 # text = data.decode('utf-8') # a `str`; this step can't be used if data is binary 174 175 out_file.write(data) 176 177 178 def downloadGoogleDoc(url): 179 180 181 print("Downloading Google Doc url:") 182 print(" " + url) 183 184 download_filename = gdown.download(url, output=downloads_outputdir, fuzzy=True, quiet=True) 185 186 print("Downloaded filename:", download_filename) 187 188 return download_filename 189 190 def fileRenameToDLIdentifier(downloaded_filename,dl_identifier_str): 191 192 downloaded_dir = os.path.dirname(downloaded_filename) 193 file_ext = os.path.splitext(downloaded_filename)[1] 194 195 dl_identifier_filename = os.path.join(downloaded_dir,dl_identifier_str+file_ext) 196 197 os.rename(downloaded_filename,dl_identifier_filename) 198 199 200 201 def processWorksheetValues(worksheet, num_rows, num_cols): 202 203 dl_identifier = 1 204 205 # Skip the first row with headers in the worksheet 206 # 118 207 row_i = 2 119 208 120 209 while row_i<=num_rows: 121 210 122 print(" Processing row: " + str(row_i)) 123 124 # print(worksheet.cell(row=row_i, column=7).hyperlink.target) 125 cell = worksheet.cell(row=row_i, column=OptHyperlinkColumn) 126 opt_hyperlink = cell.hyperlink 127 211 # print(" row: " + str(row_i)) 212 213 dl_identifier_str = "tw-contrib-%04d" % dl_identifier 214 215 dlidentifier_colpos = HeadingsTransformation["DLIdentifier"]["colpos"] 216 worksheet.cell(row=row_i, column=dlidentifier_colpos, value=dl_identifier_str) 217 218 filenote_colpos = HeadingsTransformation["File"]["colpos"] 219 220 filenote_cell = worksheet.cell(row=row_i, column=filenote_colpos) 221 opt_hyperlink = filenote_cell.hyperlink 222 128 223 if opt_hyperlink != None: 129 print(opt_hyperlink.target) 130 # else: 131 # print(cell.value) 132 133 row_i = row_i + 1 224 opt_hyperlink_colpos = HeadingsTransformation["OptHyperlink"]["colpos"] 225 doc_url = opt_hyperlink.target 226 worksheet.cell(row=row_i, column=opt_hyperlink_colpos, value=doc_url) 227 228 downloaded_filename = downloadGoogleDoc(doc_url) 229 230 orig_file = os.path.basename(downloaded_filename) 231 orig_filename_colpos = HeadingsTransformation["OrigFilename"]["colpos"] 232 worksheet.cell(row=row_i, column=orig_filename_colpos, value=orig_file) 233 234 fileRenameToDLIdentifier(downloaded_filename,dl_identifier_str) 235 236 dl_identifier += 1 237 row_i += 1 134 238 135 239 … … 196 300 #**** 197 301 302 print("") 198 303 print("Loading Excel spreadsheet:") 199 print(" 304 print(" " + xlsx_ifilename) 200 305 workbook = openpyxl.load_workbook(xlsx_ifilename) 201 306 worksheet = workbook[sheet_name] … … 208 313 num_cols = trimmed_last_col # spreadsheet index positions start at (1,1) not (0,0) 209 314 315 print("") 210 316 print("Trimmed worksheet row x col: " + str(num_rows) + " x " + str(num_cols)) 211 317 318 319 if not os.path.exists(downloads_outputdir): 320 print("") 321 print("Creating downloads output directory:") 322 print(" " + downloads_outputdir) 323 os.makedirs(downloads_outputdir) 324 325 print("") 212 326 print("Transforming Excel worksheet to a Greenstone compatible form") 327 328 print(" Adding in extra Greenstone headings") 329 num_cols = initHeadingsTransformation(worksheet,num_cols) 330 331 print(" Transforming Excel spreadsheet headings to Greenstone friendly ones") 213 332 transformHeadings(worksheet,num_cols) 214 transformWorksheet(worksheet,num_rows,num_cols) 215 333 334 print(" Processing values") 335 336 processWorksheetValues(worksheet,num_rows,num_cols) 337 338 print("") 216 339 print("Saving the converted Excel spreadsheet to CSV:") 217 print(" 340 print(" " + csv_ofilename) 218 341 219 342 outputCSV(csv_ofilename,num_rows,num_cols)
Note:
See TracChangeset
for help on using the changeset viewer.