Ignore:
Timestamp:
2023-04-10T00:13:21+12:00 (14 months ago)
Author:
davidb
Message:

More developed version of script, that trims to area of spreadsheet where values are; changes headings to GS friendly ones

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-installations/thewillow/trunk/sites/thewillow/collect/community-contributions/prepare/xlsx-to-csv--thewillow-directorysheet.py

    r37636 r37643  
    1414#import argparse
    1515
    16 
    1716xlsx_ifilename="Willow Archive Directory.xlsx"
    1817csv_ofilename ="thewillow-directorysheet.csv"
     
    2019sheet_name="Directory"
    2120
    22 workbook  = openpyxl.load_workbook(xlsx_ifilename)
    23 worksheet = workbook[sheet_name]
    24 
    25 
    26 #argc=len(sys.argv)
     21OptHyperlinkColumn=7
     22#OptHyperlinkColumn=8
     23
     24
     25
     26
     27# From the Google-Drive Excel Spreadsheet:
    2728#
    28 #if argc == 1:
    29 #    print("",file=sys.stderr)
    30 #    print("Usage: ", file=sys.stderr)
    31 #    print("    " + sys.argv[0] + " excel-spreadsheet.xslx [excel-spreadsheet.csv]",file=sys.stderr)
    32 #    print("",file=sys.stderr)
    33 #    sys.exit(1)
    34 #
    35 #
    36 #if argc == 2:
    37 #    csv_ofilename= os.path.splitext(xlsx_ifilename)[0]+'.csv'
    38 
    39 
    40 num_rows = worksheet.max_row;
    41 
    42 # Tweak the data
    43 
    44 # (skip the first row with headers)
    45 row_i = 2
    46 
    47 print("Converting Excel spreadsheet to CSV")
    48 
    49 #while row_i<=num_rows:
    50 #
    51 #    print("  Processing row: " + str(row_i))
    52 #   
    53 #    row_i = row_i + 1
    54 
    55          
    56        
    57    
    58 
    59 ob = csv.writer(open(csv_ofilename,'w', newline = ""))
    60    
    61 for r in worksheet.rows:
    62     row = [a.value for a in r]
    63     ob.writerow(row)
    64 
    65 
    66 
    67 
    68 
    69 
    70 
    71 #if __name__ == "__main__":
    72 
    73 #    parser = argparse.ArgumentParser()
    74 #    parser.add_argument('--sheetname', help="The name of the sheet within the Excel file to extractc data from")
    75 #    parser.add_argument('--votingtype', choices=["J","T", "JT"], help="Filter to only J=Jury, T=Tele cast votes, JT=Combined jury and tele votes")
    76 #    parser.add_argument('input-file.xlsx')
    77 #    parser.add_argument('output-file.json', nargs='?')
    78 
    79 #    args = parser.parse_args()
    80 
    81 #    sheetname = getattr(args,'sheetname');
    82 #    voting_type = getattr(args,'votingtype');
    83 
    84 #    excel_input_filename = getattr(args,'input-file.xlsx');
    85 #    json_output_filename = getattr(args,'output-file.json');
    86 
    87 #    if (json_output_filename == None):
    88 #        json_output_filename = os.path.splitext(excel_input_filename)[0]+'.json'
    89 
    90    
     29# Ref. No., Title,          Creator,    Description,    Creation Date,  Orginial/ Copy, File,   Library Reference,  Copyright?
     30#   Theme 1,    Theme 2,    Theme 3,    Object Type 1,  Object Type 2,  Object Type 3,  Notes
     31
     32GSFriendlyHeadings = [ "RefNum", "Title", "Creator", "Description", "CreationDate", "OriginalOrCopy", "FileNote", "LibraryRefNum", "InCopyright",
     33                       "Theme",  "Theme", "Theme",   "ObjectType",  "ObjectType",   "ObjectType",     "Notes",
     34                       # Additional column(s) derived from original column data
     35                       "OptHyperlink" ]
     36
     37
     38# Worksheet column positions start at 1
     39
     40FileNoteCol     =  7
     41OptHyperlinkCol = 17
     42
     43
     44
     45def trimLastRow(worksheet):
     46
     47    # Work backwards from worhseet.max_row, ignoring empty rows until
     48    # a non-empty cell is encountered
     49   
     50    last_row = worksheet.max_row
     51    last_col = worksheet.max_column
     52
     53   
     54    for row in range(last_row,0,-1):
     55        is_empty = True
     56        for col in range(1, last_col+1):
     57            cell = worksheet.cell(row,col)
     58            if cell.value is not None:
     59                is_empty = False
     60                break
     61           
     62        if not is_empty: break
     63       
     64    trimmedLastRow = row
     65   
     66    return trimmedLastRow
     67
     68
     69def trimLastColumn(worksheet):
     70
     71    # Work backwards from worhseet.max_column, ignoring empty columns until
     72    # a non-empty cell is encountered
     73   
     74    last_row = worksheet.max_row
     75    last_col = worksheet.max_column
     76   
     77    for col in range(last_col,-1,-1):
     78        is_empty = True
     79        for row in range(1, last_row+1):
     80            cell = worksheet.cell(row,col)
     81            if cell.value is not None:
     82                is_empty = False
     83                break
     84           
     85        if not is_empty: break
     86       
     87    trimmedLastCol = col
     88   
     89    return trimmedLastCol
     90
     91
     92
     93def transformHeadings(worksheet,num_cols):
     94
     95    for i in range(0,num_cols):
     96        col = i+1;
     97        worksheet.cell(row=1,column=col, value=GSFriendlyHeadings[i])
     98
     99
     100def transformWorksheet(worksheet, num_rows, num_cols):
     101
     102    #
     103    # Transform the worksheet into a form sutiable for processing by
     104    # Greenstone's CSVPlugin
     105    #
     106
     107
     108
     109    # Transform 1: add in Filename column as the first column
     110
     111    #worksheet.insert_cols(idx=0)
     112    #worksheet.cell(row=1,column=1).value = "Filename"
     113
     114
     115    # Transform 2: add in new column that explicitly specifies the hyperlink as a value
     116
     117    # (skip the first row with headers)
     118    row_i = 2
     119   
     120    while row_i<=num_rows:
     121       
     122        print("  Processing row: " + str(row_i))
     123       
     124        # print(worksheet.cell(row=row_i, column=7).hyperlink.target)
     125        cell = worksheet.cell(row=row_i, column=OptHyperlinkColumn)
     126        opt_hyperlink = cell.hyperlink
     127   
     128        if opt_hyperlink != None:
     129            print(opt_hyperlink.target)
     130        #   else:
     131        #       print(cell.value)
     132   
     133        row_i = row_i + 1
     134
     135
     136def outputCSV(csv_ofilename,num_rows,num_cols):
     137   
     138    ob = csv.writer(open(csv_ofilename,'w', newline = ""))
     139
     140    for y in range(0,num_rows):
     141        row_y = y+1
     142       
     143        row = []
     144        for x in range(0,num_cols):
     145            col_x = x+1
     146            cell = worksheet.cell(row=row_y,column=col_x)           
     147            row.append(cell.value)
     148           
     149        ob.writerow(row)
     150   
     151
     152
     153
     154if __name__ == "__main__":   
     155
     156    #----
     157    # DIY version of parsing command-line arguments
     158    #----
     159    #
     160    # argc=len(sys.argv)
     161    #
     162    # if argc == 1:
     163    #     print("",file=sys.stderr)
     164    #     print("Usage: ", file=sys.stderr)
     165    #     print("    " + sys.argv[0] + " excel-spreadsheet.xslx [excel-spreadsheet.csv]",file=sys.stderr)
     166    #     print("",file=sys.stderr)
     167    #     sys.exit(1)
     168    #
     169    #
     170    # if argc == 2:
     171    #     csv_ofilename= os.path.splitext(xlsx_ifilename)[0]+'.csv'
     172
     173    #----
     174    # 'argparse' example for proessing command-line arguments
     175    #----
     176    # parser = argparse.ArgumentParser()
     177    # parser.add_argument('--sheetname', help="The name of the sheet within the Excel file to extractc data from")
     178    # parser.add_argument('--votingtype', choices=["J","T", "JT"], help="Filter to only J=Jury, T=Tele cast votes, JT=Combined jury and tele votes")
     179    # parser.add_argument('input-file.xlsx')
     180    # parser.add_argument('output-file.json', nargs='?')
     181   
     182    # args = parser.parse_args()
     183   
     184    # sheetname = getattr(args,'sheetname');
     185    # voting_type = getattr(args,'votingtype');
     186   
     187    # excel_input_filename = getattr(args,'input-file.xlsx');
     188    # json_output_filename = getattr(args,'output-file.json');
     189   
     190    # if (json_output_filename == None):
     191    #     json_output_filename = os.path.splitext(excel_input_filename)[0]+'.json'
     192
     193
     194    #****
     195    # Running with global, hardwired input and output filenames, and sheet-name
     196    #****
     197   
     198    print("Loading Excel spreadsheet:")
     199    print("    " + xlsx_ifilename)   
     200    workbook  = openpyxl.load_workbook(xlsx_ifilename)
     201    worksheet = workbook[sheet_name]
     202
     203
     204    trimmed_last_row = trimLastRow(worksheet)
     205    num_rows = trimmed_last_row # spreadsheet index positions start at (1,1) not (0,0)
     206   
     207    trimmed_last_col = trimLastColumn(worksheet)
     208    num_cols = trimmed_last_col # spreadsheet index positions start at (1,1) not (0,0)
     209
     210    print("Trimmed worksheet row x col: " + str(num_rows) + " x " + str(num_cols))
     211   
     212    print("Transforming Excel worksheet to a Greenstone compatible form")
     213    transformHeadings(worksheet,num_cols)
     214    transformWorksheet(worksheet,num_rows,num_cols)
     215
     216    print("Saving the converted Excel spreadsheet to CSV:")
     217    print("    " + csv_ofilename)
     218
     219    outputCSV(csv_ofilename,num_rows,num_cols)
Note: See TracChangeset for help on using the changeset viewer.