Context Navigation

xls-to-csv.py@ 33024

Last change on this file since 33024 was 33024, checked in by davidb, 5 years ago
Tweak so Title in spreadsheet maps to dc.Title, so as not to conflict with the automatically set Title from TextPlugin, when the SecondaryPlugin runs on the extracted text file
File size: 6.2 KB

Line
1	import xlrd
2	import csv
3	import os
4
5	## worksheet_name = "Archaeological reports"
6
7
8	# https://code.activestate.com/recipes/546518-simple-conversion-of-excel-files-into-csv-and-yaml/download/1/
9	def format_excelval(book, type, value, wanttupledate):
10	""" Clean up the incoming excel data """
11	## Data Type Codes:
12	## EMPTY 0
13	## TEXT 1 a Unicode string
14	## NUMBER 2 float
15	## DATE 3 float
16	## BOOLEAN 4 int; 1 means TRUE, 0 means FALSE
17	## ERROR 5
18	returnrow = []
19	if type == 2: # TEXT
20	if value == int(value): value = int(value)
21	elif type == 3: # NUMBER
22	datetuple = xlrd.xldate_as_tuple(value, book.datemode)
23	value = datetuple if wanttupledate else tupledate_to_isodate(datetuple)
24	elif type == 5: # ERROR
25	value = xlrd.error_text_from_code[value]
26	return value
27
28
29	letter_to_folder = {
30	'a': "A - B",
31	'b': "A - B",
32	'c': "C - D",
33	'd': "C - D",
34	'e': "E - F",
35	'f': "E - F",
36	'g': "G - H",
37	'h': "G - H",
38	'i': "I - J",
39	'j': "I - J",
40	'k': "K - L",
41	'l': "K - L",
42	'm': "M - N",
43	'n': "M - N",
44	'o': "O - P",
45	'p': "O - P",
46	'q': "Q - R",
47	'r': "Q - R",
48	's': "S - T",
49	't': "S - T",
50	'u': "U - V",
51	'v': "U - V",
52	'w': "W - Z",
53	'x': "W - Z",
54	'y': "W - Z",
55	'z': "W - Z"
56	}
57
58	def id_to_relative_pdf_file(id):
59	first_char = id[0:1]
60	first_char_lower = first_char.lower()
61
62	file_tail = letter_to_folder[first_char_lower]
63	file_root = "Reports " + file_tail
64
65	pdf_file = os.path.join("Digital-Library-PDFs",file_root,id+".pdf")
66
67	return pdf_file
68
69
70	def excel_to_bound_pdf_csv(excel_filename):
71	workbook = xlrd.open_workbook(excel_filename, formatting_info=True)
72	## worksheet = workbook.sheet_by_name(worksheet_name)
73	worksheet = workbook.sheet_by_index(0)
74
75	excel_filename_split = os.path.split(excel_filename)
76	excel_dirname = excel_filename_split[0]
77	excel_file = excel_filename_split[1]
78	excel_file_splitext = os.path.splitext(excel_file)
79	excel_file_root = excel_file_splitext[0];
80
81	pdfbound_filename = os.path.join(excel_dirname,"PDF-BOUND "+excel_file_root+".csv")
82	unbound_filename = os.path.join(excel_dirname,"UNBOUND "+excel_file_root+".csv")
83
84	# print "Worksheet: " + worksheet_name
85	csv_pdfbound_ofile = open(pdfbound_filename, 'wb')
86	#csv_unbound_ofile = open('{}.csv'.format(excel_file_root), 'wb')
87	csv_unbound_ofile = open(unbound_filename, 'wb')
88
89	pdfbound_wr = csv.writer(csv_pdfbound_ofile, quoting=csv.QUOTE_ALL)
90	unbound_wr = csv.writer(csv_unbound_ofile, quoting=csv.QUOTE_ALL)
91
92	# 1. Skip lines until Header encountered (as defined by encountering "Report Identifier")
93	# 2. Write out Header line as Greenstone friendly Metadata terms
94	# => remove sp spaces, change "Report Identifier" to hnz.Identifier
95	# 3. Process the rest of the file, checking if accompanying
96	# PDF file present or not
97
98	num_rows = worksheet.nrows
99
100	# 1, Skip lines until Header encountered
101	row_i = 0
102	found_header = False
103	while row_i<num_rows:
104	first_cell = worksheet.cell_value(row_i,0);
105	if first_cell == "Report Identifier":
106	found_header = True
107	break
108	print "Skipping row {} as not yet encountered 'Report Identifier' metadata label in column 0".format(row_i)
109	row_i = row_i + 1
110
111	# 2. Process Header into Greenstone friendly metadata terms
112	if found_header:
113	entry_utf8_row = []
114	for entry in worksheet.row_values(row_i):
115	if entry == "Report Identifier":
116	entry = "hnz.Identifier"
117
118	if entry == "Title":
119	entry = "dc.Title"
120
121	entry_utf8 = unicode(entry).encode("utf-8")
122	metadata_entry_utf8 = entry_utf8.replace(" ", "")
123
124	entry_utf8_row.append(metadata_entry_utf8)
125	unbound_wr.writerow(entry_utf8_row)
126
127	# Add in 'Filename' as first column in pdfbound CSV file
128	entry_utf8_row.insert(0, "Filename")
129	pdfbound_wr.writerow(entry_utf8_row)
130
131
132	row_i = row_i + 1
133	while row_i<num_rows:
134	num_cols = worksheet.row_len(row_i)
135	formatted_utf8_row = []
136
137	found_pdf = False
138
139	for col_j in range(num_cols):
140	cell = worksheet.cell(row_i,col_j)
141
142	cell_type = worksheet.cell_type(row_i,col_j)
143	cell_value = worksheet.cell_value(row_i,col_j)
144
145	formatted_cell = format_excelval(workbook,cell_type,cell_value,False)
146
147	if col_j == 0:
148	# Check to see if companion PDF file present
149	# pdf_filename = os.path.join(excel_dirname,"pdfs",formatted_cell+".pdf")
150	id = formatted_cell.replace(" ","")
151
152	pdf_file = id_to_relative_pdf_file(id)
153	pdf_file_root, pdf_ext = os.path.splitext(pdf_file)
154	PDF_file = pdf_file_root + ".PDF"
155
156	pdf_filename = os.path.join(excel_dirname,pdf_file)
157	PDF_filename = os.path.join(excel_dirname,PDF_file)
158
159	if os.path.exists(pdf_filename):
160	found_pdf = True
161	formatted_utf8_row.insert(0, pdf_file)
162	elif os.path.exists(PDF_filename):
163	found_pdf = True
164	formatted_utf8_row.insert(0, PDF_file)
165	else:
166	print "Unbound id: '" + id + "'"
167
168	if isinstance(formatted_cell, basestring):
169	# Remove any trailing whitespace.
170	# Newline at end particular harmful for a entry in the CSV file
171	formatted_cell = formatted_cell.rstrip();
172
173	formatted_cell_utf8 = unicode(formatted_cell).encode("utf-8")
174	formatted_utf8_row.append(formatted_cell_utf8)
175	if found_pdf:
176	pdfbound_wr.writerow(formatted_utf8_row)
177	else:
178	unbound_wr.writerow(formatted_utf8_row)
179
180	row_i = row_i + 1
181	else:
182	print "Failed to find metadata label 'Report Identifier' in column 0"
183
184
185	csv_pdfbound_ofile.close()
186	csv_unbound_ofile.close()
187
188	if __name__ == "__main__":
189	excel_to_bound_pdf_csv(os.sys.argv[1])
190

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/model-sites-dev/heritage-nz/collect/reports-2019/prepare/xls-to-csv.py@ 33024

Download in other formats: