root/main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xlsutil.py @ 33249

Revision 33249, 3.3 KB (checked in by davidb, 14 months ago)

Further checking needed when converting cell-value to be in unicode format

Line 
1import xlrd
2
3# Currently access worksheet by index num
4# Alternative is to do this by worksheet name
5worksheet_index_pos = 0
6## worksheet_name = "Archaeological reports"
7
8sanitized_headers = [ u"Report Identifier", u"Author", u"Title", u"Produced By", u"Date", u"HNZPT Region", u"File No",
9                      u"Authority No", u"Site No", u"Relevant TLA's", u"Record Type", u"Date Entered" ]
10
11def make_unicode(value):
12    value_unicode = value
13
14    if type(value) != unicode:
15        value_str = value
16        if type(value) != str:
17
18            # numbers in the spreadsheet default to 'float'
19            # but if actually a whole number, want it be
20            # type cast to an 'int' otherwise a '.0' ends
21            # up on the end of the string version
22            if type(value) is float:
23                if value == int(value):
24                    value = int(value)
25
26            value_str = str(value)
27       
28        value_unicode =  value_str.decode('utf-8')
29
30    return value_unicode
31
32
33def cell_value_tidy_unicode(worksheet,row_i,col_j):
34    value= worksheet.cell_value(row_i,col_j)
35
36    value_unicode = make_unicode(value)
37    value_unicode_stripped = value_unicode.strip()
38
39    return value_unicode_stripped
40
41
42def format_if_int(cell_value):
43    # Numbers from worksheet are represented as floating-point type
44    # This causes a problem when it is an 'int' as it then gets
45    # written out as a floating point number (with a '.0')
46    # => test for float and when there is no value after
47    #    the decimal point, explicitly cast to 'int'
48    if type(cell_value) is float:
49        if cell_value == int(cell_value):
50            cell_value = int(cell_value)
51
52    return cell_value
53
54
55
56header_row_pos = 0
57
58def getHeaderMappingToColNum(worksheet):
59
60    # For header line, build up hashmap of header-names to column number
61    # Assume header line is first row of spreadsheet
62
63    header_names_mapping = {}
64
65    num_header_cols = worksheet.row_len(header_row_pos)
66
67    found_header = False
68    for col_j in range(num_header_cols):
69        header_cell_value = cell_value_tidy_unicode(worksheet,header_row_pos,col_j)
70        if header_cell_value == u"Report Identifier":
71            found_header = True
72
73        if header_cell_value and header_cell_value.strip():
74            header_names_mapping[header_cell_value] = col_j
75
76    if not found_header:
77        header_names_mapping = None
78
79    return header_names_mapping
80
81
82# The following can be used on an XSL file, but not currently an XSLX file
83# This is because, at the time of coding, xlrd does not support loading
84# in an XSLT workbook with formatting_info=True
85 
86# https://code.activestate.com/recipes/546518-simple-conversion-of-excel-files-into-csv-and-yaml/download/1/
87def format_excelval(book, type, value, wanttupledate):
88    """ Clean up the incoming excel data """
89    ##  Data Type Codes:
90    ##  EMPTY   0
91    ##  TEXT    1 a Unicode string
92    ##  NUMBER  2 float
93    ##  DATE    3 float
94    ##  BOOLEAN 4 int; 1 means TRUE, 0 means FALSE
95    ##  ERROR   5
96    returnrow = []
97    if   type == 2: # TEXT
98        if value == int(value): value = int(value)
99    elif type == 3: # NUMBER
100        datetuple = xlrd.xldate_as_tuple(value, book.datemode)
101        value = datetuple if wanttupledate else tupledate_to_isodate(datetuple)
102    elif type == 5: # ERROR
103        value = xlrd.error_text_from_code[value]
104    return value
105
Note: See TracBrowser for help on using the browser.