import xlrd # Currently access worksheet by index num # Alternative is to do this by worksheet name worksheet_index_pos = 0 ## worksheet_name = "Archaeological reports" sanitized_headers = [ u"Report Identifier", u"Author", u"Title", u"Produced By", u"Date", u"HNZPT Region", u"File No", u"Authority No", u"Site No", u"Relevant TLA's", u"Record Type", u"Date Entered" ] def make_unicode(value): value_unicode = value if type(value) != unicode: value_str = value if type(value_str) != str: value_str = str(value) value_unicode = value_str.decode('utf-8') return value_unicode def cell_value_tidy_unicode(worksheet,row_i,col_j): value= worksheet.cell_value(row_i,col_j) value_unicode = make_unicode(value) value_unicode_stripped = value_unicode.strip() return value_unicode_stripped def format_if_int(cell_value): # Numbers from worksheet are represented as floating-point type # This causes a problem when it is an 'int' as it then gets # written out as a floating point number (with a '.0') # => test for float and when there is no value after # the decimal point, explicitly cast to 'int' if type(cell_value) is float: if cell_value == int(cell_value): cell_value = int(cell_value) return cell_value header_row_pos = 0 def getHeaderMappingToColNum(worksheet): # For header line, build up hashmap of header-names to column number # Assume header line is first row of spreadsheet header_names_mapping = {} num_header_cols = worksheet.row_len(header_row_pos) found_header = False for col_j in range(num_header_cols): header_cell_value = cell_value_tidy_unicode(worksheet,header_row_pos,col_j) if header_cell_value == u"Report Identifier": found_header = True if header_cell_value and header_cell_value.strip(): header_names_mapping[header_cell_value] = col_j if not found_header: header_names_mapping = None return header_names_mapping # The following can be used on an XSL file, but not currently an XSLX file # This is because, at the time of coding, xlrd does not support loading # in an XSLT workbook with formatting_info=True # https://code.activestate.com/recipes/546518-simple-conversion-of-excel-files-into-csv-and-yaml/download/1/ def format_excelval(book, type, value, wanttupledate): """ Clean up the incoming excel data """ ## Data Type Codes: ## EMPTY 0 ## TEXT 1 a Unicode string ## NUMBER 2 float ## DATE 3 float ## BOOLEAN 4 int; 1 means TRUE, 0 means FALSE ## ERROR 5 returnrow = [] if type == 2: # TEXT if value == int(value): value = int(value) elif type == 3: # NUMBER datetuple = xlrd.xldate_as_tuple(value, book.datemode) value = datetuple if wanttupledate else tupledate_to_isodate(datetuple) elif type == 5: # ERROR value = xlrd.error_text_from_code[value] return value