Context Navigation

xls-to-csv.py@ 33024

Last change on this file since 33024 was 33024, checked in by davidb, 5 years ago
Tweak so Title in spreadsheet maps to dc.Title, so as not to conflict with the automatically set Title from TextPlugin, when the SecondaryPlugin runs on the extracted text file
File size: 6.2 KB

Rev	Line
[32807]	1	import xlrd
	2	import csv
	3	import os
	4
[32813]	5	## worksheet_name = "Archaeological reports"
[32807]	6
[32813]	7
	8	# https://code.activestate.com/recipes/546518-simple-conversion-of-excel-files-into-csv-and-yaml/download/1/
	9	def format_excelval(book, type, value, wanttupledate):
	10	""" Clean up the incoming excel data """
	11	## Data Type Codes:
	12	## EMPTY 0
	13	## TEXT 1 a Unicode string
	14	## NUMBER 2 float
	15	## DATE 3 float
	16	## BOOLEAN 4 int; 1 means TRUE, 0 means FALSE
	17	## ERROR 5
	18	returnrow = []
	19	if type == 2: # TEXT
	20	if value == int(value): value = int(value)
	21	elif type == 3: # NUMBER
	22	datetuple = xlrd.xldate_as_tuple(value, book.datemode)
	23	value = datetuple if wanttupledate else tupledate_to_isodate(datetuple)
	24	elif type == 5: # ERROR
	25	value = xlrd.error_text_from_code[value]
	26	return value
	27
	28
[33018]	29	letter_to_folder = {
	30	'a': "A - B",
	31	'b': "A - B",
	32	'c': "C - D",
	33	'd': "C - D",
	34	'e': "E - F",
	35	'f': "E - F",
	36	'g': "G - H",
	37	'h': "G - H",
	38	'i': "I - J",
	39	'j': "I - J",
	40	'k': "K - L",
	41	'l': "K - L",
	42	'm': "M - N",
	43	'n': "M - N",
	44	'o': "O - P",
	45	'p': "O - P",
	46	'q': "Q - R",
	47	'r': "Q - R",
	48	's': "S - T",
	49	't': "S - T",
	50	'u': "U - V",
	51	'v': "U - V",
	52	'w': "W - Z",
	53	'x': "W - Z",
	54	'y': "W - Z",
	55	'z': "W - Z"
	56	}
	57
	58	def id_to_relative_pdf_file(id):
	59	first_char = id[0:1]
	60	first_char_lower = first_char.lower()
	61
	62	file_tail = letter_to_folder[first_char_lower]
	63	file_root = "Reports " + file_tail
	64
	65	pdf_file = os.path.join("Digital-Library-PDFs",file_root,id+".pdf")
	66
	67	return pdf_file
	68
	69
[32807]	70	def excel_to_bound_pdf_csv(excel_filename):
[32813]	71	workbook = xlrd.open_workbook(excel_filename, formatting_info=True)
	72	## worksheet = workbook.sheet_by_name(worksheet_name)
	73	worksheet = workbook.sheet_by_index(0)
	74
[32807]	75	excel_filename_split = os.path.split(excel_filename)
	76	excel_dirname = excel_filename_split[0]
	77	excel_file = excel_filename_split[1]
	78	excel_file_splitext = os.path.splitext(excel_file)
	79	excel_file_root = excel_file_splitext[0];
	80
	81	pdfbound_filename = os.path.join(excel_dirname,"PDF-BOUND "+excel_file_root+".csv")
	82	unbound_filename = os.path.join(excel_dirname,"UNBOUND "+excel_file_root+".csv")
	83
	84	# print "Worksheet: " + worksheet_name
	85	csv_pdfbound_ofile = open(pdfbound_filename, 'wb')
	86	#csv_unbound_ofile = open('{}.csv'.format(excel_file_root), 'wb')
	87	csv_unbound_ofile = open(unbound_filename, 'wb')
	88
	89	pdfbound_wr = csv.writer(csv_pdfbound_ofile, quoting=csv.QUOTE_ALL)
	90	unbound_wr = csv.writer(csv_unbound_ofile, quoting=csv.QUOTE_ALL)
	91
[32820]	92	# 1. Skip lines until Header encountered (as defined by encountering "Report Identifier")
	93	# 2. Write out Header line as Greenstone friendly Metadata terms
	94	# => remove sp spaces, change "Report Identifier" to hnz.Identifier
[32807]	95	# 3. Process the rest of the file, checking if accompanying
	96	# PDF file present or not
	97
	98	num_rows = worksheet.nrows
	99
[32820]	100	# 1, Skip lines until Header encountered
[32807]	101	row_i = 0
	102	found_header = False
	103	while row_i<num_rows:
[32808]	104	first_cell = worksheet.cell_value(row_i,0);
[32807]	105	if first_cell == "Report Identifier":
	106	found_header = True
	107	break
	108	print "Skipping row {} as not yet encountered 'Report Identifier' metadata label in column 0".format(row_i)
	109	row_i = row_i + 1
	110
[32820]	111	# 2. Process Header into Greenstone friendly metadata terms
[32807]	112	if found_header:
	113	entry_utf8_row = []
	114	for entry in worksheet.row_values(row_i):
[32813]	115	if entry == "Report Identifier":
	116	entry = "hnz.Identifier"
	117
[33024]	118	if entry == "Title":
	119	entry = "dc.Title"
	120
[32807]	121	entry_utf8 = unicode(entry).encode("utf-8")
	122	metadata_entry_utf8 = entry_utf8.replace(" ", "")
	123
	124	entry_utf8_row.append(metadata_entry_utf8)
	125	unbound_wr.writerow(entry_utf8_row)
	126
[32808]	127	# Add in 'Filename' as first column in pdfbound CSV file
	128	entry_utf8_row.insert(0, "Filename")
	129	pdfbound_wr.writerow(entry_utf8_row)
	130
	131
[32807]	132	row_i = row_i + 1
	133	while row_i<num_rows:
[32808]	134	num_cols = worksheet.row_len(row_i)
[32820]	135	formatted_utf8_row = []
[32808]	136
	137	found_pdf = False
	138
	139	for col_j in range(num_cols):
[32813]	140	cell = worksheet.cell(row_i,col_j)
[32808]	141
[32813]	142	cell_type = worksheet.cell_type(row_i,col_j)
	143	cell_value = worksheet.cell_value(row_i,col_j)
	144
[32820]	145	formatted_cell = format_excelval(workbook,cell_type,cell_value,False)
[32813]	146
[32808]	147	if col_j == 0:
	148	# Check to see if companion PDF file present
[33018]	149	# pdf_filename = os.path.join(excel_dirname,"pdfs",formatted_cell+".pdf")
	150	id = formatted_cell.replace(" ","")
	151
	152	pdf_file = id_to_relative_pdf_file(id)
	153	pdf_file_root, pdf_ext = os.path.splitext(pdf_file)
	154	PDF_file = pdf_file_root + ".PDF"
	155
	156	pdf_filename = os.path.join(excel_dirname,pdf_file)
	157	PDF_filename = os.path.join(excel_dirname,PDF_file)
	158
[32808]	159	if os.path.exists(pdf_filename):
	160	found_pdf = True
[32820]	161	formatted_utf8_row.insert(0, pdf_file)
[33018]	162	elif os.path.exists(PDF_filename):
	163	found_pdf = True
	164	formatted_utf8_row.insert(0, PDF_file)
	165	else:
	166	print "Unbound id: '" + id + "'"
	167
	168	if isinstance(formatted_cell, basestring):
	169	# Remove any trailing whitespace.
	170	# Newline at end particular harmful for a entry in the CSV file
	171	formatted_cell = formatted_cell.rstrip();
	172
[32820]	173	formatted_cell_utf8 = unicode(formatted_cell).encode("utf-8")
	174	formatted_utf8_row.append(formatted_cell_utf8)
[32808]	175	if found_pdf:
[32820]	176	pdfbound_wr.writerow(formatted_utf8_row)
[32808]	177	else:
[32820]	178	unbound_wr.writerow(formatted_utf8_row)
[32807]	179
	180	row_i = row_i + 1
	181	else:
	182	print "Failed to find metadata label 'Report Identifier' in column 0"
	183
	184
	185	csv_pdfbound_ofile.close()
	186	csv_unbound_ofile.close()
	187
	188	if __name__ == "__main__":
	189	excel_to_bound_pdf_csv(os.sys.argv[1])
	190

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/model-sites-dev/heritage-nz/collect/reports-2019/prepare/xls-to-csv.py@ 33024

Download in other formats: