source: main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-sanitized-csv.py@ 33222

Last change on this file since 33222 was 33222, checked in by davidb, 5 years ago

More careful working with unicode/utf8

File size: 2.6 KB
Line 
1import xlrd
2import csv
3import os
4import sys
5
6import xlsutil
7
8def excel_to_sanitized_csv(excel_filename,sanitized_csv_filename):
9 workbook = xlrd.open_workbook(excel_filename)
10
11 worksheet = workbook.sheet_by_index(xlsutil.worksheet_index_pos)
12 ## worksheet = workbook.sheet_by_name(xlsutil.worksheet_name)
13
14 # excel_filename_split = os.path.split(excel_filename)
15 # excel_dirname = excel_filename_split[0]
16 # excel_file = excel_filename_split[1]
17 # excel_file_splitext = os.path.splitext(excel_file)
18 # excel_file_root = excel_file_splitext[0];
19
20 # sanitized_csv_filename = os.path.join("for-download",excel_file_root+".csv")
21
22 sanitized_csv_ofile = open(sanitized_csv_filename, 'wb')
23
24 sanitized_csv_wr = csv.writer(sanitized_csv_ofile, quoting=csv.QUOTE_ALL)
25
26 # 1. For header line, Build up hashmap of header-names to column number
27 # 2. Write out CSV file for only the header-names in 'sanitized_headers'
28
29 num_rows = worksheet.nrows
30
31 # 1. get header-map
32 header_names_mapping = xlsutil.getHeaderMappingToColNum(worksheet)
33
34 if header_names_mapping is None:
35 print "Failed to find \"Report Identifier\" header in spreadsheet (Row 0 or Sheet 0)"
36 exit(1)
37
38 # 2. Write out CSV file ...
39 # Work through all the row to the spreadsheet, including the header-names
40 row_i = xlsutil.header_row_pos
41
42 while row_i<num_rows:
43 # Work through sanitized header names building up row of utf8 values
44 sanitized_utf8_row = []
45
46 for header_name in xlsutil.sanitized_headers:
47
48 if header_name in header_names_mapping:
49 header_col_j = header_names_mapping[header_name]
50
51# cell_value = worksheet.cell_value(row_i,header_col_j)
52#
53# cell_value_utf8 = unicode(cell_value).encode("utf-8")
54
55 cell_value_unicode = xlsutil.cell_value_tidy_unicode(worksheet,row_i,header_col_j)
56
57 cell_value_utf8 = cell_value_unicode.encode("utf-8")
58
59## cell_value_utf8 = unicode(cell_value).encode("utf-8")
60
61 sanitized_utf8_row.append(cell_value_utf8)
62 else:
63 print("Warning: Failed to column mapping in spreadsheet for header name \""+header_name+"\" => skipping")
64
65 sanitized_csv_wr.writerow(sanitized_utf8_row)
66
67 row_i = row_i + 1
68
69 sanitized_csv_ofile.close()
70
71if __name__ == "__main__":
72 if len(os.sys.argv) != 3:
73 print >> sys.stderr, "Usage: " + os.sys.argv[0] + "input.xsl|input.xslx output.csv"
74 exit(1)
75
76 excel_to_sanitized_csv(os.sys.argv[1],os.sys.argv[2])
77
Note: See TracBrowser for help on using the repository browser.