Context Navigation

source: main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-sanitized-csv.py@ 33222

Last change on this file since 33222 was 33222, checked in by davidb, 5 years ago
More careful working with unicode/utf8
File size: 2.6 KB

Rev	Line
[33174]	1	import xlrd
	2	import csv
	3	import os
[33192]	4	import sys
[33174]	5
[33188]	6	import xlsutil
	7
[33192]	8	def excel_to_sanitized_csv(excel_filename,sanitized_csv_filename):
[33174]	9	workbook = xlrd.open_workbook(excel_filename)
	10
[33191]	11	worksheet = workbook.sheet_by_index(xlsutil.worksheet_index_pos)
	12	## worksheet = workbook.sheet_by_name(xlsutil.worksheet_name)
	13
[33192]	14	# excel_filename_split = os.path.split(excel_filename)
	15	# excel_dirname = excel_filename_split[0]
	16	# excel_file = excel_filename_split[1]
	17	# excel_file_splitext = os.path.splitext(excel_file)
	18	# excel_file_root = excel_file_splitext[0];
[33174]	19
[33192]	20	# sanitized_csv_filename = os.path.join("for-download",excel_file_root+".csv")
[33174]	21
	22	sanitized_csv_ofile = open(sanitized_csv_filename, 'wb')
	23
	24	sanitized_csv_wr = csv.writer(sanitized_csv_ofile, quoting=csv.QUOTE_ALL)
	25
	26	# 1. For header line, Build up hashmap of header-names to column number
[33188]	27	# 2. Write out CSV file for only the header-names in 'sanitized_headers'
[33174]	28
	29	num_rows = worksheet.nrows
	30
[33188]	31	# 1. get header-map
	32	header_names_mapping = xlsutil.getHeaderMappingToColNum(worksheet)
[33174]	33
[33188]	34	if header_names_mapping is None:
[33174]	35	print "Failed to find \"Report Identifier\" header in spreadsheet (Row 0 or Sheet 0)"
	36	exit(1)
	37
[33188]	38	# 2. Write out CSV file ...
[33202]	39	# Work through all the row to the spreadsheet, including the header-names
	40	row_i = xlsutil.header_row_pos
	41
[33174]	42	while row_i<num_rows:
	43	# Work through sanitized header names building up row of utf8 values
	44	sanitized_utf8_row = []
	45
[33188]	46	for header_name in xlsutil.sanitized_headers:
[33174]	47
	48	if header_name in header_names_mapping:
	49	header_col_j = header_names_mapping[header_name]
	50
[33222]	51	# cell_value = worksheet.cell_value(row_i,header_col_j)
	52	#
	53	# cell_value_utf8 = unicode(cell_value).encode("utf-8")
[33174]	54
[33222]	55	cell_value_unicode = xlsutil.cell_value_tidy_unicode(worksheet,row_i,header_col_j)
[33174]	56
[33222]	57	cell_value_utf8 = cell_value_unicode.encode("utf-8")
	58
	59	## cell_value_utf8 = unicode(cell_value).encode("utf-8")
	60
[33174]	61	sanitized_utf8_row.append(cell_value_utf8)
	62	else:
	63	print("Warning: Failed to column mapping in spreadsheet for header name \""+header_name+"\" => skipping")
	64
	65	sanitized_csv_wr.writerow(sanitized_utf8_row)
	66
	67	row_i = row_i + 1
	68
	69	sanitized_csv_ofile.close()
	70
	71	if __name__ == "__main__":
[33192]	72	if len(os.sys.argv) != 3:
	73	print >> sys.stderr, "Usage: " + os.sys.argv[0] + "input.xsl\|input.xslx output.csv"
	74	exit(1)
[33174]	75
[33192]	76	excel_to_sanitized_csv(os.sys.argv[1],os.sys.argv[2])
	77

Note: See TracBrowser for help on using the repository browser.

Download in other formats: