Changeset 33191 for main

Show
Ignore:
Timestamp:
22.06.2019 10:33:10 (3 months ago)
Author:
davidb
Message:

Code tidy-up

Location:
main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare
Files:
3 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-csv.py

    r33188 r33191  
    44 
    55import xlsutil 
    6  
    7 ## worksheet_name = "Archaeological reports" 
    86 
    97letter_to_folder = { 
     
    4846 
    4947 
     48# **** Written, but never used 
     49# **** Col numbers refer to older version of spreadsheet 
     50# 
     51# There is now a mapping of header-name to index position which would be 
     52# a better way to do things 
     53 
    5054def fixup_cell(col_num,cell_str): 
    5155    # col == 1 => Author 
     
    7478 
    7579def excel_to_bound_pdf_csv(excel_filename): 
    76     #workbook = xlrd.open_workbook(excel_filename, formatting_info=True) 
    7780    workbook = xlrd.open_workbook(excel_filename) 
    78     ## worksheet = workbook.sheet_by_name(worksheet_name) 
    79     worksheet = workbook.sheet_by_index(0) 
     81 
     82    worksheet = workbook.sheet_by_index(xlsutil.worksheet_index_pos) 
     83    ## worksheet = workbook.sheet_by_name(xlsutil.worksheet_name) 
    8084 
    8185    excel_filename_split = os.path.split(excel_filename) 
     
    8892    unbound_filename  = os.path.join(excel_dirname,"UNBOUND "+excel_file_root+".csv") 
    8993 
    90     # print "Worksheet: " + worksheet_name 
    9194    csv_pdfbound_ofile = open(pdfbound_filename, 'wb') 
    92     #csv_unbound_ofile  = open('{}.csv'.format(excel_file_root), 'wb') 
    9395    csv_unbound_ofile  = open(unbound_filename, 'wb') 
    9496 
    9597    pdfbound_wr = csv.writer(csv_pdfbound_ofile, quoting=csv.QUOTE_ALL) 
    9698    unbound_wr  = csv.writer(csv_unbound_ofile, quoting=csv.QUOTE_ALL) 
    97  
    98 #    # 1. Skip lines until Header encountered (as defined by encountering "Report Identifier") 
    99 #    # 2. Write out Header line as Greenstone friendly Metadata terms  
    100 #    #      => remove sp spaces, change "Report Identifier" to hnz.Identifier 
    101 #    # 3. Process the rest of the file, checking if accompanying  
    102 #    #     PDF file present or not 
    103  
    10499 
    105100    # 1. For header-line, build up hashmap of header-names to column number 
     
    110105    #     specified in 'sanitized_headers' 
    111106 
    112  
    113107    num_rows = worksheet.nrows 
    114108 
    115     # 1. Skip lines until Header encountered 
    116 #    row_i = 0 
    117 #    found_header = False 
    118 #    while row_i<num_rows: 
    119 #        first_cell = worksheet.cell_value(row_i,0); 
    120 #        if first_cell == "Report Identifier": 
    121 #            found_header = True 
    122 #            break 
    123 #        print "Skipping row {} as not yet encountered 'Report Identifier' metadata label in column 0".format(row_i) 
    124 #        row_i = row_i + 1 
    125  
     109    # 1. Get header-line hashmap of header-names to column numbers 
    126110    header_names_mapping = xlsutil.getHeaderMappingToColNum(worksheet) 
    127111 
    128     # 2. Process Header into Greenstone friendly metadata terms      
    129112    if header_names_mapping is None: 
    130113        print "Failed to find metadata label 'Report Identifier' in column 0" 
    131114        exit() 
    132115    else: 
     116        # 2. Process Header into Greenstone friendly metadata terms      
     117 
    133118        entry_utf8_row = [] 
    134119        row_i = 0; 
     
    164149        pdfbound_wr.writerow(entry_utf8_row) 
    165150                       
    166         # 3. Process the rest of the file ... 
     151        # 3. Process the rest of the file (metadata values) ... 
    167152 
    168153        row_i = row_i + 1 
     
    179164                    col_j = header_names_mapping[header_name] 
    180165 
    181 #                    cell = worksheet.cell(row_i,col_j) 
    182  
    183 #                    cell_type  = worksheet.cell_type(row_i,col_j) 
    184166                    cell_value = worksheet.cell_value(row_i,col_j) 
    185167 
    186 #                    formatted_cell_value = format_excelval(workbook,cell_type,cell_value,False) 
    187168                    formatted_cell_value = xlsutil.format_if_int(cell_value) 
    188169 
    189 #                    if col_j == 0: 
    190170                    if header_name == "Report Identifier": 
    191171                        # Check to see if companion PDF file present 
  • main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-sanitized-csv.py

    r33188 r33191  
    55import xlsutil 
    66 
    7 ## worksheet_name = "Archaeological reports" 
    8  
    9  
    107def excel_to_sanitized_csv(excel_filename): 
    118    workbook = xlrd.open_workbook(excel_filename) 
    12     # worksheet = workbook.sheet_by_name(worksheet_name) 
    13     worksheet = workbook.sheet_by_index(0) 
     9 
     10    worksheet = workbook.sheet_by_index(xlsutil.worksheet_index_pos) 
     11    ## worksheet = workbook.sheet_by_name(xlsutil.worksheet_name) 
    1412 
    1513    excel_filename_split = os.path.split(excel_filename) 
     
    2119    sanitized_csv_filename  = os.path.join(excel_dirname,excel_file_root+".csv") 
    2220 
    23     # print "Worksheet: " + worksheet_name 
    2421    sanitized_csv_ofile = open(sanitized_csv_filename, 'wb') 
    2522 
     
    3128    num_rows = worksheet.nrows 
    3229 
    33 #    row_i = 0 
    34 #    num_header_cols = worksheet.row_len(row_i) 
    35  
    3630    # 1. get header-map 
    3731    header_names_mapping = xlsutil.getHeaderMappingToColNum(worksheet) 
    3832 
    39 #    found_header = False 
    40 #    for col_j in range(num_header_cols): 
    41 #        header_cell_value = worksheet.cell_value(row_i,col_j) 
    42 #        if header_cell_value == "Report Identifier": 
    43 #            found_header = True 
    44 # 
    45 #        header_names_mapping[header_cell_value] = col_j 
    46 # 
    47 #    if not(found_header): 
    4833    if header_names_mapping is None: 
    4934        print "Failed to find \"Report Identifier\" header in spreadsheet (Row 0 or Sheet 0)" 
     
    6550                cell_value = worksheet.cell_value(row_i,header_col_j) 
    6651 
    67  
    68  
    6952                cell_value_utf8 = unicode(cell_value).encode("utf-8") 
    7053 
  • main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xlsutil.py

    r33188 r33191  
    11import xlrd 
     2 
     3# Currently access worksheet by index num 
     4# Alternative is to do this by worksheet name 
     5worksheet_index_pos = 0  
     6## worksheet_name = "Archaeological reports" 
    27 
    38sanitized_headers = [ "Report Identifier", "Author", "Title", "Produced By", "Date", "HNZPT Region", "File No",