Context Navigation

← Previous Change
Next Change →

Changeset 37643 for gs3-installations/thewillow

Timestamp:

2023-04-10T00:13:21+12:00 (14 months ago)

Author:

davidb

Message:

More developed version of script, that trims to area of spreadsheet where values are; changes headings to GS friendly ones

File:

: 1 edited

gs3-installations/thewillow/trunk/sites/thewillow/collect/community-contributions/prepare/xlsx-to-csv--thewillow-directorysheet.py (modified) (2 diffs)

Legend:

: Unmodified
: Added
: Removed

gs3-installations/thewillow/trunk/sites/thewillow/collect/community-contributions/prepare/xlsx-to-csv--thewillow-directorysheet.py

-              r37636
+              r37643
 #import argparse
 xlsx_ifilename="Willow Archive Directory.xlsx"
 csv_ofilename ="thewillow-directorysheet.csv"
 …
 sheet_name="Directory"
# Module-level conversion pass: load the hard-wired workbook and dump every
# row of the selected sheet (untrimmed, original headings) to the CSV file.
# NOTE(review): the __main__ section at the bottom of this file loads the same
# workbook again and rewrites the same CSV in trimmed form, so this pass looks
# like a leftover from the previous revision -- confirm it is still wanted.
# TODO: input/output filenames are hard-wired; a command-line interface
# ("script excel-spreadsheet.xlsx [excel-spreadsheet.csv]") was sketched in
# earlier drafts but is not implemented.
workbook = openpyxl.load_workbook(xlsx_ifilename)
worksheet = workbook[sheet_name]

# 1-based worksheet column whose cells may carry a hyperlink; read by
# transformWorksheet().
# NOTE(review): the original kept an alternative value of 8 commented out next
# to a "From the Google-Drive Excel Spreadsheet" remark -- presumably the
# column position in that variant of the sheet; confirm before changing.
OptHyperlinkColumn = 7

num_rows = worksheet.max_row

# First data row (row 1 holds the headings).  Not used by the loop below.
row_i = 2

print("Converting Excel spreadsheet to CSV")

# Use a context manager so the output file is flushed and closed once all rows
# are written.  newline="" leaves line-ending handling to the csv module.
with open(csv_ofilename, 'w', newline="") as csv_ofile:
    ob = csv.writer(csv_ofile)
    for r in worksheet.rows:
        row = [a.value for a in r]
        ob.writerow(row)
+#if __name__ == "__main__":
+#    parser = argparse.ArgumentParser()
+#    parser.add_argument('--sheetname', help="The name of the sheet within the Excel file to extract data from")
+#    parser.add_argument('--votingtype', choices=["J","T", "JT"], help="Filter to only J=Jury, T=Tele cast votes, JT=Combined jury and tele votes")
+#    parser.add_argument('input-file.xlsx')
+#    parser.add_argument('output-file.json', nargs='?')
+#    args = parser.parse_args()
+#    sheetname = getattr(args,'sheetname');
+#    voting_type = getattr(args,'votingtype');
+#    excel_input_filename = getattr(args,'input-file.xlsx');
+#    json_output_filename = getattr(args,'output-file.json');
+#    if (json_output_filename == None):
+#        json_output_filename = os.path.splitext(excel_input_filename)[0]+'.json'
# Greenstone-friendly replacements for the spreadsheet's heading row, listed
# in worksheet column order.  The trailing comment on each entry gives the
# original spreadsheet heading it replaces.
GSFriendlyHeadings = [
    "RefNum",          # Ref. No.
    "Title",           # Title
    "Creator",         # Creator
    "Description",     # Description
    "CreationDate",    # Creation Date
    "OriginalOrCopy",  # Original/ Copy
    "FileNote",        # File
    "LibraryRefNum",   # Library Reference
    "InCopyright",     # Copyright?
    "Theme",           # Theme 1
    "Theme",           # Theme 2
    "Theme",           # Theme 3
    "ObjectType",      # Object Type 1
    "ObjectType",      # Object Type 2
    "ObjectType",      # Object Type 3
    "Notes",           # Notes
    # Additional column(s) derived from original column data
    "OptHyperlink",
]

# Worksheet column positions start at 1
FileNoteCol = 7
OptHyperlinkCol = 17
def trimLastRow(worksheet):
    """Return the 1-based index of the last row that contains any data.

    Works backwards from ``worksheet.max_row``, skipping rows in which every
    cell value is ``None``, and stops at the first row that has a value.

    Args:
        worksheet: Object exposing ``max_row``, ``max_column`` and
            ``cell(row, col)`` whose result has a ``value`` attribute.

    Returns:
        The row index of the last non-empty row, or 1 when no cell in the
        scanned area holds a value.
    """
    last_col = worksheet.max_column
    for row in range(worksheet.max_row, 0, -1):
        # Only None counts as empty: 0 and "" are real values.
        if any(worksheet.cell(row, col).value is not None
               for col in range(1, last_col + 1)):
            return row
    # Nothing found: report row 1, which is what the scan bottoms out at.
    # This also covers a non-positive max_row, where the loop never runs.
    return 1
def trimLastColumn(worksheet):
    """Return the 1-based index of the last column that contains any data.

    Works backwards from ``worksheet.max_column``, skipping columns in which
    every cell value is ``None``, and stops at the first column that has a
    value.

    Args:
        worksheet: Object exposing ``max_row``, ``max_column`` and
            ``cell(row, col)`` whose result has a ``value`` attribute.

    Returns:
        The column index of the last non-empty column, or 1 when no cell in
        the scanned area holds a value.
    """
    last_row = worksheet.max_row
    # Stop at column 1: worksheet positions are 1-based, so column 0 must
    # never be requested (the scan previously ran down to 0 on an all-empty
    # sheet).
    for col in range(worksheet.max_column, 0, -1):
        # Only None counts as empty: 0 and "" are real values.
        if any(worksheet.cell(row, col).value is not None
               for row in range(1, last_row + 1)):
            return col
    # Nothing found: report column 1, mirroring trimLastRow().
    return 1
def transformHeadings(worksheet, num_cols, headings=None):
    """Overwrite the heading row (row 1) with Greenstone-friendly names.

    Args:
        worksheet: Object whose ``cell(row=, column=, value=)`` call sets the
            value of a cell.
        num_cols: Number of leading columns whose heading should be replaced.
        headings: Replacement headings in column order.  Defaults to the
            module-level ``GSFriendlyHeadings``.

    Columns beyond the end of ``headings`` keep their existing heading; the
    previous behaviour in that case was an IndexError.
    """
    if headings is None:
        headings = GSFriendlyHeadings
    # max() guards against a negative count turning into a "drop from the
    # end" slice.  Worksheet columns are 1-based, hence start=1.
    wanted = headings[:max(num_cols, 0)]
    for col, heading in enumerate(wanted, start=1):
        worksheet.cell(row=1, column=col, value=heading)
def transformWorksheet(worksheet, num_rows, num_cols, hyperlink_col=None):
    """Walk the data rows and report any hyperlink found in the link column.

    Intended as the step that reshapes the worksheet for Greenstone's
    CSVPlugin.  As written it only reports: for every row from 2 to
    ``num_rows`` it prints a progress line and, when the cell in the
    hyperlink column carries a hyperlink, prints that hyperlink's target.
    No cell values are written.

    Planned transforms that are not implemented here:
      1. add a "Filename" column as the first column;
      2. add a new column holding the hyperlink target as a plain value.

    Args:
        worksheet: Object whose ``cell(row=, column=)`` result has a
            ``hyperlink`` attribute (``None`` when the cell has no link).
        num_rows: Index of the last row to process (1-based, inclusive).
        num_cols: Number of columns in use; currently unused.
        hyperlink_col: 1-based column to inspect.  Defaults to the
            module-level ``OptHyperlinkColumn``.
    """
    if hyperlink_col is None:
        hyperlink_col = OptHyperlinkColumn

    # Row 1 holds the headings, so data starts at row 2.
    for row_i in range(2, num_rows + 1):
        print("  Processing row: " + str(row_i))

        opt_hyperlink = worksheet.cell(row=row_i, column=hyperlink_col).hyperlink
        if opt_hyperlink is not None:
            print(opt_hyperlink.target)
def outputCSV(csv_ofilename, num_rows, num_cols, sheet=None):
    """Write the top-left ``num_rows`` x ``num_cols`` area of a sheet as CSV.

    Args:
        csv_ofilename: Path of the CSV file to create or overwrite.
        num_rows: Number of rows to write, starting from row 1.
        num_cols: Number of columns to write, starting from column 1.
        sheet: Object whose ``cell(row=, column=)`` result has a ``value``
            attribute.  Defaults to the module-level ``worksheet``, which is
            what this function always read before the parameter existed.

    Cells whose value is ``None`` come out as empty fields (csv module
    behaviour).
    """
    if sheet is None:
        sheet = worksheet

    # The context manager makes sure the file is flushed and closed; it was
    # previously left open.  newline="" leaves line endings to the csv
    # module.  UTF-8 is requested explicitly so the result does not depend on
    # the platform's locale encoding.
    # NOTE(review): assumes the downstream CSV consumer reads UTF-8 -- confirm.
    with open(csv_ofilename, 'w', newline="", encoding="utf-8") as csv_ofile:
        writer = csv.writer(csv_ofile)
        # Spreadsheet positions are 1-based.
        for row_y in range(1, num_rows + 1):
            writer.writerow([sheet.cell(row=row_y, column=col_x).value
                             for col_x in range(1, num_cols + 1)])
if __name__ == "__main__":
    # Script entry point: load the workbook, trim it to the area that holds
    # values, swap in the Greenstone-friendly headings, then write the CSV.
    #
    # The input filename, output filename and sheet name all come from the
    # hard-wired module-level settings.
    # TODO: accept them on the command line instead.  Earlier drafts sketched
    # both a DIY sys.argv version
    # ("script excel-spreadsheet.xlsx [excel-spreadsheet.csv]") and an
    # argparse version; neither is implemented.

    print("Loading Excel spreadsheet:")
    print("    " + xlsx_ifilename)
    workbook = openpyxl.load_workbook(xlsx_ifilename)
    worksheet = workbook[sheet_name]

    # Spreadsheet index positions start at (1,1), not (0,0), so the index of
    # the last used row/column is also the row/column count.
    trimmed_last_row = trimLastRow(worksheet)
    num_rows = trimmed_last_row
    trimmed_last_col = trimLastColumn(worksheet)
    num_cols = trimmed_last_col
    print("Trimmed worksheet row x col: "
          + " x ".join([str(num_rows), str(num_cols)]))

    print("Transforming Excel worksheet to a Greenstone compatible form")
    transformHeadings(worksheet, num_cols)
    transformWorksheet(worksheet, num_rows, num_cols)

    print("Saving the converted Excel spreadsheet to CSV:")
    print("    " + csv_ofilename)
    outputCSV(csv_ofilename, num_rows, num_cols)

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 37643 for gs3-installations/thewillow

Legend:

gs3-installations/thewillow/trunk/sites/thewillow/collect/community-contributions/prepare/xlsx-to-csv--thewillow-directorysheet.py

Download in other formats: