source: main/trunk/model-sites-dev/heritage-nz/collect/reports-2019/prepare/xls-to-csv.py@ 33018

Last change on this file since 33018 was 33018, checked in by davidb, 2 years ago

Script upgraded to work with full set of PDFs provided by Heritage NZ

File size: 6.1 KB
Line 
1import xlrd
2import csv
3import os
4
5## worksheet_name = "Archaeological reports"
6
7
8# https://code.activestate.com/recipes/546518-simple-conversion-of-excel-files-into-csv-and-yaml/download/1/
9def format_excelval(book, type, value, wanttupledate):
10 """ Clean up the incoming excel data """
11 ## Data Type Codes:
12 ## EMPTY 0
13 ## TEXT 1 a Unicode string
14 ## NUMBER 2 float
15 ## DATE 3 float
16 ## BOOLEAN 4 int; 1 means TRUE, 0 means FALSE
17 ## ERROR 5
18 returnrow = []
19 if type == 2: # TEXT
20 if value == int(value): value = int(value)
21 elif type == 3: # NUMBER
22 datetuple = xlrd.xldate_as_tuple(value, book.datemode)
23 value = datetuple if wanttupledate else tupledate_to_isodate(datetuple)
24 elif type == 5: # ERROR
25 value = xlrd.error_text_from_code[value]
26 return value
27
28
29letter_to_folder = {
30 'a': "A - B",
31 'b': "A - B",
32 'c': "C - D",
33 'd': "C - D",
34 'e': "E - F",
35 'f': "E - F",
36 'g': "G - H",
37 'h': "G - H",
38 'i': "I - J",
39 'j': "I - J",
40 'k': "K - L",
41 'l': "K - L",
42 'm': "M - N",
43 'n': "M - N",
44 'o': "O - P",
45 'p': "O - P",
46 'q': "Q - R",
47 'r': "Q - R",
48 's': "S - T",
49 't': "S - T",
50 'u': "U - V",
51 'v': "U - V",
52 'w': "W - Z",
53 'x': "W - Z",
54 'y': "W - Z",
55 'z': "W - Z"
56}
57
58def id_to_relative_pdf_file(id):
59 first_char = id[0:1]
60 first_char_lower = first_char.lower()
61
62 file_tail = letter_to_folder[first_char_lower]
63 file_root = "Reports " + file_tail
64
65 pdf_file = os.path.join("Digital-Library-PDFs",file_root,id+".pdf")
66
67 return pdf_file
68
69
70def excel_to_bound_pdf_csv(excel_filename):
71 workbook = xlrd.open_workbook(excel_filename, formatting_info=True)
72 ## worksheet = workbook.sheet_by_name(worksheet_name)
73 worksheet = workbook.sheet_by_index(0)
74
75 excel_filename_split = os.path.split(excel_filename)
76 excel_dirname = excel_filename_split[0]
77 excel_file = excel_filename_split[1]
78 excel_file_splitext = os.path.splitext(excel_file)
79 excel_file_root = excel_file_splitext[0];
80
81 pdfbound_filename = os.path.join(excel_dirname,"PDF-BOUND "+excel_file_root+".csv")
82 unbound_filename = os.path.join(excel_dirname,"UNBOUND "+excel_file_root+".csv")
83
84 # print "Worksheet: " + worksheet_name
85 csv_pdfbound_ofile = open(pdfbound_filename, 'wb')
86 #csv_unbound_ofile = open('{}.csv'.format(excel_file_root), 'wb')
87 csv_unbound_ofile = open(unbound_filename, 'wb')
88
89 pdfbound_wr = csv.writer(csv_pdfbound_ofile, quoting=csv.QUOTE_ALL)
90 unbound_wr = csv.writer(csv_unbound_ofile, quoting=csv.QUOTE_ALL)
91
92 # 1. Skip lines until Header encountered (as defined by encountering "Report Identifier")
93 # 2. Write out Header line as Greenstone friendly Metadata terms
94 # => remove sp spaces, change "Report Identifier" to hnz.Identifier
95 # 3. Process the rest of the file, checking if accompanying
96 # PDF file present or not
97
98 num_rows = worksheet.nrows
99
100 # 1, Skip lines until Header encountered
101 row_i = 0
102 found_header = False
103 while row_i<num_rows:
104 first_cell = worksheet.cell_value(row_i,0);
105 if first_cell == "Report Identifier":
106 found_header = True
107 break
108 print "Skipping row {} as not yet encountered 'Report Identifier' metadata label in column 0".format(row_i)
109 row_i = row_i + 1
110
111 # 2. Process Header into Greenstone friendly metadata terms
112 if found_header:
113 entry_utf8_row = []
114 for entry in worksheet.row_values(row_i):
115 if entry == "Report Identifier":
116 entry = "hnz.Identifier"
117
118 entry_utf8 = unicode(entry).encode("utf-8")
119 metadata_entry_utf8 = entry_utf8.replace(" ", "")
120
121 entry_utf8_row.append(metadata_entry_utf8)
122 unbound_wr.writerow(entry_utf8_row)
123
124 # Add in 'Filename' as first column in pdfbound CSV file
125 entry_utf8_row.insert(0, "Filename")
126 pdfbound_wr.writerow(entry_utf8_row)
127
128
129 row_i = row_i + 1
130 while row_i<num_rows:
131 num_cols = worksheet.row_len(row_i)
132 formatted_utf8_row = []
133
134 found_pdf = False
135
136 for col_j in range(num_cols):
137 cell = worksheet.cell(row_i,col_j)
138
139 cell_type = worksheet.cell_type(row_i,col_j)
140 cell_value = worksheet.cell_value(row_i,col_j)
141
142 formatted_cell = format_excelval(workbook,cell_type,cell_value,False)
143
144 if col_j == 0:
145 # Check to see if companion PDF file present
146 # pdf_filename = os.path.join(excel_dirname,"pdfs",formatted_cell+".pdf")
147 id = formatted_cell.replace(" ","")
148
149 pdf_file = id_to_relative_pdf_file(id)
150 pdf_file_root, pdf_ext = os.path.splitext(pdf_file)
151 PDF_file = pdf_file_root + ".PDF"
152
153 pdf_filename = os.path.join(excel_dirname,pdf_file)
154 PDF_filename = os.path.join(excel_dirname,PDF_file)
155
156 if os.path.exists(pdf_filename):
157 found_pdf = True
158 formatted_utf8_row.insert(0, pdf_file)
159 elif os.path.exists(PDF_filename):
160 found_pdf = True
161 formatted_utf8_row.insert(0, PDF_file)
162 else:
163 print "Unbound id: '" + id + "'"
164
165 if isinstance(formatted_cell, basestring):
166 # Remove any trailing whitespace.
167 # Newline at end particular harmful for a entry in the CSV file
168 formatted_cell = formatted_cell.rstrip();
169
170 formatted_cell_utf8 = unicode(formatted_cell).encode("utf-8")
171 formatted_utf8_row.append(formatted_cell_utf8)
172 if found_pdf:
173 pdfbound_wr.writerow(formatted_utf8_row)
174 else:
175 unbound_wr.writerow(formatted_utf8_row)
176
177 row_i = row_i + 1
178 else:
179 print "Failed to find metadata label 'Report Identifier' in column 0"
180
181
182 csv_pdfbound_ofile.close()
183 csv_unbound_ofile.close()
184
185if __name__ == "__main__":
186 excel_to_bound_pdf_csv(os.sys.argv[1])
187
Note: See TracBrowser for help on using the repository browser.