Changeset 37643 for gs3-installations/thewillow
- Timestamp:
- 2023-04-10T00:13:21+12:00 (14 months ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-installations/thewillow/trunk/sites/thewillow/collect/community-contributions/prepare/xlsx-to-csv--thewillow-directorysheet.py
r37636 r37643 14 14 #import argparse 15 15 16 17 16 xlsx_ifilename="Willow Archive Directory.xlsx" 18 17 csv_ofilename ="thewillow-directorysheet.csv" … … 20 19 sheet_name="Directory" 21 20 22 workbook = openpyxl.load_workbook(xlsx_ifilename) 23 worksheet = workbook[sheet_name] 24 25 26 #argc=len(sys.argv) 21 OptHyperlinkColumn=7 22 #OptHyperlinkColumn=8 23 24 25 26 27 # From the Google-Drive Excel Spreadsheet: 27 28 # 28 #if argc == 1: 29 # print("",file=sys.stderr) 30 # print("Usage: ", file=sys.stderr) 31 # print(" " + sys.argv[0] + " excel-spreadsheet.xslx [excel-spreadsheet.csv]",file=sys.stderr) 32 # print("",file=sys.stderr) 33 # sys.exit(1) 34 # 35 # 36 #if argc == 2: 37 # csv_ofilename= os.path.splitext(xlsx_ifilename)[0]+'.csv' 38 39 40 num_rows = worksheet.max_row; 41 42 # Tweak the data 43 44 # (skip the first row with headers) 45 row_i = 2 46 47 print("Converting Excel spreadsheet to CSV") 48 49 #while row_i<=num_rows: 50 # 51 # print(" Processing row: " + str(row_i)) 52 # 53 # row_i = row_i + 1 54 55 56 57 58 59 ob = csv.writer(open(csv_ofilename,'w', newline = "")) 60 61 for r in worksheet.rows: 62 row = [a.value for a in r] 63 ob.writerow(row) 64 65 66 67 68 69 70 71 #if __name__ == "__main__": 72 73 # parser = argparse.ArgumentParser() 74 # parser.add_argument('--sheetname', help="The name of the sheet within the Excel file to extractc data from") 75 # parser.add_argument('--votingtype', choices=["J","T", "JT"], help="Filter to only J=Jury, T=Tele cast votes, JT=Combined jury and tele votes") 76 # parser.add_argument('input-file.xlsx') 77 # parser.add_argument('output-file.json', nargs='?') 78 79 # args = parser.parse_args() 80 81 # sheetname = getattr(args,'sheetname'); 82 # voting_type = getattr(args,'votingtype'); 83 84 # excel_input_filename = getattr(args,'input-file.xlsx'); 85 # json_output_filename = getattr(args,'output-file.json'); 86 87 # if (json_output_filename == None): 88 # json_output_filename = os.path.splitext(excel_input_filename)[0]+'.json' 89 90 29 # Ref. No., Title, Creator, Description, Creation Date, Orginial/ Copy, File, Library Reference, Copyright? 30 # Theme 1, Theme 2, Theme 3, Object Type 1, Object Type 2, Object Type 3, Notes 31 32 GSFriendlyHeadings = [ "RefNum", "Title", "Creator", "Description", "CreationDate", "OriginalOrCopy", "FileNote", "LibraryRefNum", "InCopyright", 33 "Theme", "Theme", "Theme", "ObjectType", "ObjectType", "ObjectType", "Notes", 34 # Additional column(s) derived from original column data 35 "OptHyperlink" ] 36 37 38 # Worksheet column positions start at 1 39 40 FileNoteCol = 7 41 OptHyperlinkCol = 17 42 43 44 45 def trimLastRow(worksheet): 46 47 # Work backwards from worhseet.max_row, ignoring empty rows until 48 # a non-empty cell is encountered 49 50 last_row = worksheet.max_row 51 last_col = worksheet.max_column 52 53 54 for row in range(last_row,0,-1): 55 is_empty = True 56 for col in range(1, last_col+1): 57 cell = worksheet.cell(row,col) 58 if cell.value is not None: 59 is_empty = False 60 break 61 62 if not is_empty: break 63 64 trimmedLastRow = row 65 66 return trimmedLastRow 67 68 69 def trimLastColumn(worksheet): 70 71 # Work backwards from worhseet.max_column, ignoring empty columns until 72 # a non-empty cell is encountered 73 74 last_row = worksheet.max_row 75 last_col = worksheet.max_column 76 77 for col in range(last_col,-1,-1): 78 is_empty = True 79 for row in range(1, last_row+1): 80 cell = worksheet.cell(row,col) 81 if cell.value is not None: 82 is_empty = False 83 break 84 85 if not is_empty: break 86 87 trimmedLastCol = col 88 89 return trimmedLastCol 90 91 92 93 def transformHeadings(worksheet,num_cols): 94 95 for i in range(0,num_cols): 96 col = i+1; 97 worksheet.cell(row=1,column=col, value=GSFriendlyHeadings[i]) 98 99 100 def transformWorksheet(worksheet, num_rows, num_cols): 101 102 # 103 # Transform the worksheet into a form sutiable for processing by 104 # Greenstone's CSVPlugin 105 # 106 107 108 109 # Transform 1: add in Filename column as the first column 110 111 #worksheet.insert_cols(idx=0) 112 #worksheet.cell(row=1,column=1).value = "Filename" 113 114 115 # Transform 2: add in new column that explicitly specifies the hyperlink as a value 116 117 # (skip the first row with headers) 118 row_i = 2 119 120 while row_i<=num_rows: 121 122 print(" Processing row: " + str(row_i)) 123 124 # print(worksheet.cell(row=row_i, column=7).hyperlink.target) 125 cell = worksheet.cell(row=row_i, column=OptHyperlinkColumn) 126 opt_hyperlink = cell.hyperlink 127 128 if opt_hyperlink != None: 129 print(opt_hyperlink.target) 130 # else: 131 # print(cell.value) 132 133 row_i = row_i + 1 134 135 136 def outputCSV(csv_ofilename,num_rows,num_cols): 137 138 ob = csv.writer(open(csv_ofilename,'w', newline = "")) 139 140 for y in range(0,num_rows): 141 row_y = y+1 142 143 row = [] 144 for x in range(0,num_cols): 145 col_x = x+1 146 cell = worksheet.cell(row=row_y,column=col_x) 147 row.append(cell.value) 148 149 ob.writerow(row) 150 151 152 153 154 if __name__ == "__main__": 155 156 #---- 157 # DIY version of parsing command-line arguments 158 #---- 159 # 160 # argc=len(sys.argv) 161 # 162 # if argc == 1: 163 # print("",file=sys.stderr) 164 # print("Usage: ", file=sys.stderr) 165 # print(" " + sys.argv[0] + " excel-spreadsheet.xslx [excel-spreadsheet.csv]",file=sys.stderr) 166 # print("",file=sys.stderr) 167 # sys.exit(1) 168 # 169 # 170 # if argc == 2: 171 # csv_ofilename= os.path.splitext(xlsx_ifilename)[0]+'.csv' 172 173 #---- 174 # 'argparse' example for proessing command-line arguments 175 #---- 176 # parser = argparse.ArgumentParser() 177 # parser.add_argument('--sheetname', help="The name of the sheet within the Excel file to extractc data from") 178 # parser.add_argument('--votingtype', choices=["J","T", "JT"], help="Filter to only J=Jury, T=Tele cast votes, JT=Combined jury and tele votes") 179 # parser.add_argument('input-file.xlsx') 180 # parser.add_argument('output-file.json', nargs='?') 181 182 # args = parser.parse_args() 183 184 # sheetname = getattr(args,'sheetname'); 185 # voting_type = getattr(args,'votingtype'); 186 187 # excel_input_filename = getattr(args,'input-file.xlsx'); 188 # json_output_filename = getattr(args,'output-file.json'); 189 190 # if (json_output_filename == None): 191 # json_output_filename = os.path.splitext(excel_input_filename)[0]+'.json' 192 193 194 #**** 195 # Running with global, hardwired input and output filenames, and sheet-name 196 #**** 197 198 print("Loading Excel spreadsheet:") 199 print(" " + xlsx_ifilename) 200 workbook = openpyxl.load_workbook(xlsx_ifilename) 201 worksheet = workbook[sheet_name] 202 203 204 trimmed_last_row = trimLastRow(worksheet) 205 num_rows = trimmed_last_row # spreadsheet index positions start at (1,1) not (0,0) 206 207 trimmed_last_col = trimLastColumn(worksheet) 208 num_cols = trimmed_last_col # spreadsheet index positions start at (1,1) not (0,0) 209 210 print("Trimmed worksheet row x col: " + str(num_rows) + " x " + str(num_cols)) 211 212 print("Transforming Excel worksheet to a Greenstone compatible form") 213 transformHeadings(worksheet,num_cols) 214 transformWorksheet(worksheet,num_rows,num_cols) 215 216 print("Saving the converted Excel spreadsheet to CSV:") 217 print(" " + csv_ofilename) 218 219 outputCSV(csv_ofilename,num_rows,num_cols)
Note:
See TracChangeset
for help on using the changeset viewer.