Changeset 33191


Ignore:
Timestamp:
2019-06-22T10:33:10+12:00 (5 years ago)
Author:
davidb
Message:

Code tidy-up

Location:
main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-csv.py

    r33188 r33191  
    44
    55import xlsutil
    6 
    7 ## worksheet_name = "Archaeological reports"
    86
    97letter_to_folder = {
     
    4846
    4947
     48# **** Written, but never used
     49# **** Col numbers refer to older version of spreadsheet
     50#
     51# There is now a mapping of header-name to index position which would be
     52# a better way to do things
     53
    5054def fixup_cell(col_num,cell_str):
    5155    # col == 1 => Author
     
    7478
    7579def excel_to_bound_pdf_csv(excel_filename):
    76     #workbook = xlrd.open_workbook(excel_filename, formatting_info=True)
    7780    workbook = xlrd.open_workbook(excel_filename)
    78     ## worksheet = workbook.sheet_by_name(worksheet_name)
    79     worksheet = workbook.sheet_by_index(0)
     81
     82    worksheet = workbook.sheet_by_index(xlsutil.worksheet_index_pos)
     83    ## worksheet = workbook.sheet_by_name(xlsutil.worksheet_name)
    8084
    8185    excel_filename_split = os.path.split(excel_filename)
     
    8892    unbound_filename  = os.path.join(excel_dirname,"UNBOUND "+excel_file_root+".csv")
    8993
    90     # print "Worksheet: " + worksheet_name
    9194    csv_pdfbound_ofile = open(pdfbound_filename, 'wb')
    92     #csv_unbound_ofile  = open('{}.csv'.format(excel_file_root), 'wb')
    9395    csv_unbound_ofile  = open(unbound_filename, 'wb')
    9496
    9597    pdfbound_wr = csv.writer(csv_pdfbound_ofile, quoting=csv.QUOTE_ALL)
    9698    unbound_wr  = csv.writer(csv_unbound_ofile, quoting=csv.QUOTE_ALL)
    97 
    98 #    # 1. Skip lines until Header encountered (as defined by encountering "Report Identifier")
    99 #    # 2. Write out Header line as Greenstone friendly Metadata terms
    100 #    #      => remove sp spaces, change "Report Identifier" to hnz.Identifier
    101 #    # 3. Process the rest of the file, checking if accompanying
    102 #    #     PDF file present or not
    103 
    10499
    105100    # 1. For header-line, build up hashmap of header-names to column number
     
    110105    #     specified in 'sanitized_headers'
    111106
    112 
    113107    num_rows = worksheet.nrows
    114108
    115     # 1. Skip lines until Header encountered
    116 #    row_i = 0
    117 #    found_header = False
    118 #    while row_i<num_rows:
    119 #        first_cell = worksheet.cell_value(row_i,0);
    120 #        if first_cell == "Report Identifier":
    121 #            found_header = True
    122 #            break
    123 #        print "Skipping row {} as not yet encountered 'Report Identifier' metadata label in column 0".format(row_i)
    124 #        row_i = row_i + 1
    125 
     109    # 1. Get header-line hashmap of header-names to column numbers
    126110    header_names_mapping = xlsutil.getHeaderMappingToColNum(worksheet)
    127111
    128     # 2. Process Header into Greenstone friendly metadata terms     
    129112    if header_names_mapping is None:
    130113        print "Failed to find metadata label 'Report Identifier' in column 0"
    131114        exit()
    132115    else:
     116        # 2. Process Header into Greenstone friendly metadata terms     
     117
    133118        entry_utf8_row = []
    134119        row_i = 0;
     
    164149        pdfbound_wr.writerow(entry_utf8_row)
    165150                     
    166         # 3. Process the rest of the file ...
     151        # 3. Process the rest of the file (metadata values) ...
    167152
    168153        row_i = row_i + 1
     
    179164                    col_j = header_names_mapping[header_name]
    180165
    181 #                    cell = worksheet.cell(row_i,col_j)
    182 
    183 #                    cell_type  = worksheet.cell_type(row_i,col_j)
    184166                    cell_value = worksheet.cell_value(row_i,col_j)
    185167
    186 #                    formatted_cell_value = format_excelval(workbook,cell_type,cell_value,False)
    187168                    formatted_cell_value = xlsutil.format_if_int(cell_value)
    188169
    189 #                    if col_j == 0:
    190170                    if header_name == "Report Identifier":
    191171                        # Check to see if companion PDF file present
  • main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-sanitized-csv.py

    r33188 r33191  
    55import xlsutil
    66
    7 ## worksheet_name = "Archaeological reports"
    8 
    9 
    107def excel_to_sanitized_csv(excel_filename):
    118    workbook = xlrd.open_workbook(excel_filename)
    12     # worksheet = workbook.sheet_by_name(worksheet_name)
    13     worksheet = workbook.sheet_by_index(0)
     9
     10    worksheet = workbook.sheet_by_index(xlsutil.worksheet_index_pos)
     11    ## worksheet = workbook.sheet_by_name(xlsutil.worksheet_name)
    1412
    1513    excel_filename_split = os.path.split(excel_filename)
     
    2119    sanitized_csv_filename  = os.path.join(excel_dirname,excel_file_root+".csv")
    2220
    23     # print "Worksheet: " + worksheet_name
    2421    sanitized_csv_ofile = open(sanitized_csv_filename, 'wb')
    2522
     
    3128    num_rows = worksheet.nrows
    3229
    33 #    row_i = 0
    34 #    num_header_cols = worksheet.row_len(row_i)
    35 
    3630    # 1. get header-map
    3731    header_names_mapping = xlsutil.getHeaderMappingToColNum(worksheet)
    3832
    39 #    found_header = False
    40 #    for col_j in range(num_header_cols):
    41 #        header_cell_value = worksheet.cell_value(row_i,col_j)
    42 #        if header_cell_value == "Report Identifier":
    43 #            found_header = True
    44 #
    45 #        header_names_mapping[header_cell_value] = col_j
    46 #
    47 #    if not(found_header):
    4833    if header_names_mapping is None:
    4934        print "Failed to find \"Report Identifier\" header in spreadsheet (Row 0 or Sheet 0)"
     
    6550                cell_value = worksheet.cell_value(row_i,header_col_j)
    6651
    67 
    68 
    6952                cell_value_utf8 = unicode(cell_value).encode("utf-8")
    7053
  • main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xlsutil.py

    r33188 r33191  
    11import xlrd
     2
     3# Currently access worksheet by index num
     4# Alternative is to do this by worksheet name
     5worksheet_index_pos = 0
     6## worksheet_name = "Archaeological reports"
    27
    38sanitized_headers = [ "Report Identifier", "Author", "Title", "Produced By", "Date", "HNZPT Region", "File No",
Note: See TracChangeset for help on using the changeset viewer.