Changeset 33188 for main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-csv.py
- Timestamp:
- 2019-06-22T10:10:17+12:00 (5 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-csv.py
r33024 r33188 3 3 import os 4 4 5 import xlsutil 6 5 7 ## worksheet_name = "Archaeological reports" 6 7 8 # https://code.activestate.com/recipes/546518-simple-conversion-of-excel-files-into-csv-and-yaml/download/1/9 def format_excelval(book, type, value, wanttupledate):10 """ Clean up the incoming excel data """11 ## Data Type Codes:12 ## EMPTY 013 ## TEXT 1 a Unicode string14 ## NUMBER 2 float15 ## DATE 3 float16 ## BOOLEAN 4 int; 1 means TRUE, 0 means FALSE17 ## ERROR 518 returnrow = []19 if type == 2: # TEXT20 if value == int(value): value = int(value)21 elif type == 3: # NUMBER22 datetuple = xlrd.xldate_as_tuple(value, book.datemode)23 value = datetuple if wanttupledate else tupledate_to_isodate(datetuple)24 elif type == 5: # ERROR25 value = xlrd.error_text_from_code[value]26 return value27 28 8 29 9 letter_to_folder = { … … 68 48 69 49 50 def fixup_cell(col_num,cell_str): 51 # col == 1 => Author 52 # col == 2 => Title 53 # col == 4 => TLA 54 # col == 5 => AuthorityNo 55 # col == 6 => SiteNo 56 57 if col_num == 1: #Author 58 cell_str = re.sub('\|?(et al)|(Associates)', '', cell_str) 59 60 if col_num == 2: #Title 61 # Detect any ALL-CAPS titles, and change to title-case 62 onlyletters = re.sub('[^A-Za-z]+', '', cell_str) 63 if onlyletters.isupper: 64 cell_str = cell_str.title() 65 66 if col_num == 4: #TLA 67 if cell_str == "All": cell_str = "" 68 if cell_str == "n/a": cell_str = "" 69 if cell_str == "various": cell_str = "" # not a meaningful TLA 70 71 72 return cell_str 73 74 70 75 def excel_to_bound_pdf_csv(excel_filename): 71 workbook = xlrd.open_workbook(excel_filename, formatting_info=True) 76 #workbook = xlrd.open_workbook(excel_filename, formatting_info=True) 77 workbook = xlrd.open_workbook(excel_filename) 72 78 ## worksheet = workbook.sheet_by_name(worksheet_name) 73 79 worksheet = workbook.sheet_by_index(0) … … 90 96 unbound_wr = csv.writer(csv_unbound_ofile, quoting=csv.QUOTE_ALL) 91 97 92 # 1. Skip lines until Header encountered (as defined by encountering "Report Identifier") 93 # 2. Write out Header line as Greenstone friendly Metadata terms 98 # # 1. Skip lines until Header encountered (as defined by encountering "Report Identifier") 99 # # 2. Write out Header line as Greenstone friendly Metadata terms 100 # # => remove sp spaces, change "Report Identifier" to hnz.Identifier 101 # # 3. Process the rest of the file, checking if accompanying 102 # # PDF file present or not 103 104 105 # 1. For header-line, build up hashmap of header-names to column number 106 # 2. Write out header-line as Greenstone friendly Metadata terms 94 107 # => remove sp spaces, change "Report Identifier" to hnz.Identifier 95 108 # 3. Process the rest of the file, checking if accompanying 96 # PDF file present or not 109 # PDF file present or not and only printing out the header-names 110 # specified in 'sanitized_headers' 111 97 112 98 113 num_rows = worksheet.nrows 99 114 100 115 # 1, Skip lines until Header encountered 101 row_i = 0 102 found_header = False 103 while row_i<num_rows: 104 first_cell = worksheet.cell_value(row_i,0); 105 if first_cell == "Report Identifier": 106 found_header = True 107 break 108 print "Skipping row {} as not yet encountered 'Report Identifier' metadata label in column 0".format(row_i) 109 row_i = row_i + 1 116 # row_i = 0 117 # found_header = False 118 # while row_i<num_rows: 119 # first_cell = worksheet.cell_value(row_i,0); 120 # if first_cell == "Report Identifier": 121 # found_header = True 122 # break 123 # print "Skipping row {} as not yet encountered 'Report Identifier' metadata label in column 0".format(row_i) 124 # row_i = row_i + 1 125 126 header_names_mapping = xlsutil.getHeaderMappingToColNum(worksheet) 110 127 111 128 # 2. Process Header into Greenstone friendly metadata terms 112 if found_header: 129 if header_names_mapping is None: 130 print "Failed to find metadata label 'Report Identifier' in column 0" 131 exit() 132 else: 113 133 entry_utf8_row = [] 114 for entry in worksheet.row_values(row_i): 115 if entry == "Report Identifier": 116 entry = "hnz.Identifier" 117 118 if entry == "Title": 119 entry = "dc.Title" 120 121 entry_utf8 = unicode(entry).encode("utf-8") 122 metadata_entry_utf8 = entry_utf8.replace(" ", "") 123 124 entry_utf8_row.append(metadata_entry_utf8) 134 row_i = 0; 135 136 for header_name in xlsutil.sanitized_headers: 137 138 if header_name in header_names_mapping: 139 header_col_j = header_names_mapping[header_name] 140 141 header_cell_value = worksheet.cell_value(row_i,header_col_j) 142 143 if header_cell_value == "Report Identifier": 144 header_cell_value = "hnz.Identifier" 145 146 if header_cell_value == "Title": 147 header_cell_value = "dc.Title" 148 149 if header_cell_value == "RelevantTLA's": 150 header_cell_value = "TLA" 151 152 header_cell_value_utf8 = unicode(header_cell_value).encode("utf-8") 153 metadata_name_utf8 = header_cell_value_utf8.replace(" ", "") 154 155 entry_utf8_row.append(metadata_name_utf8) 156 157 else: 158 print("Warning: Failed to column mapping in spreadsheet for header name \""+header_name+"\" => skipping") 159 125 160 unbound_wr.writerow(entry_utf8_row) 126 161 … … 128 163 entry_utf8_row.insert(0, "Filename") 129 164 pdfbound_wr.writerow(entry_utf8_row) 130 165 166 # 3. Process the rest of the file ... 131 167 132 168 row_i = row_i + 1 … … 137 173 found_pdf = False 138 174 139 for col_j in range(num_cols): 140 cell = worksheet.cell(row_i,col_j) 141 142 cell_type = worksheet.cell_type(row_i,col_j) 143 cell_value = worksheet.cell_value(row_i,col_j) 144 145 formatted_cell = format_excelval(workbook,cell_type,cell_value,False) 146 147 if col_j == 0: 148 # Check to see if companion PDF file present 149 # pdf_filename = os.path.join(excel_dirname,"pdfs",formatted_cell+".pdf") 150 id = formatted_cell.replace(" ","") 151 152 pdf_file = id_to_relative_pdf_file(id) 153 pdf_file_root, pdf_ext = os.path.splitext(pdf_file) 154 PDF_file = pdf_file_root + ".PDF" 155 156 pdf_filename = os.path.join(excel_dirname,pdf_file) 157 PDF_filename = os.path.join(excel_dirname,PDF_file) 158 159 if os.path.exists(pdf_filename): 160 found_pdf = True 161 formatted_utf8_row.insert(0, pdf_file) 162 elif os.path.exists(PDF_filename): 163 found_pdf = True 164 formatted_utf8_row.insert(0, PDF_file) 165 else: 166 print "Unbound id: '" + id + "'" 167 168 if isinstance(formatted_cell, basestring): 169 # Remove any trailing whitespace. 170 # Newline at end particular harmful for a entry in the CSV file 171 formatted_cell = formatted_cell.rstrip(); 172 173 formatted_cell_utf8 = unicode(formatted_cell).encode("utf-8") 174 formatted_utf8_row.append(formatted_cell_utf8) 175 # for col_j in range(num_cols): 176 for header_name in xlsutil.sanitized_headers: 177 178 if header_name in header_names_mapping: 179 col_j = header_names_mapping[header_name] 180 181 # cell = worksheet.cell(row_i,col_j) 182 183 # cell_type = worksheet.cell_type(row_i,col_j) 184 cell_value = worksheet.cell_value(row_i,col_j) 185 186 # formatted_cell_value = format_excelval(workbook,cell_type,cell_value,False) 187 formatted_cell_value = xlsutil.format_if_int(cell_value) 188 189 # if col_j == 0: 190 if header_name == "Report Identifier": 191 # Check to see if companion PDF file present 192 # pdf_filename = os.path.join(excel_dirname,"pdfs",formatted_cell_value+".pdf") 193 id = formatted_cell_value.replace(" ","") 194 195 pdf_file = id_to_relative_pdf_file(id) 196 pdf_file_root, pdf_ext = os.path.splitext(pdf_file) 197 PDF_file = pdf_file_root + ".PDF" 198 199 pdf_filename = os.path.join(excel_dirname,pdf_file) 200 PDF_filename = os.path.join(excel_dirname,PDF_file) 201 202 if os.path.exists(pdf_filename): 203 found_pdf = True 204 formatted_utf8_row.insert(0, pdf_file) 205 elif os.path.exists(PDF_filename): 206 found_pdf = True 207 formatted_utf8_row.insert(0, PDF_file) 208 else: 209 print "Unbound id: '" + id + "'" 210 211 if isinstance(formatted_cell_value, basestring): 212 # Remove any trailing whitespace. 213 # Newline at end particular harmful for a entry in the CSV file 214 formatted_cell_value = formatted_cell_value.rstrip(); 215 216 formatted_cell_value_utf8 = unicode(formatted_cell_value).encode("utf-8") 217 218 ## Perform any cell transformations to make DL used spreadsheet 219 ## cleaner to build 220 # formatted_cell_value_utf8 = fixup_cell(col_j,formatted_cell_value_utf8) 221 222 formatted_utf8_row.append(formatted_cell_value_utf8) 223 else: 224 print "Warning: No column number mapping for header name \""+header_name+"\" => skipping" 225 175 226 if found_pdf: 176 227 pdfbound_wr.writerow(formatted_utf8_row) … … 179 230 180 231 row_i = row_i + 1 181 else:182 print "Failed to find metadata label 'Report Identifier' in column 0"183 232 184 233
Note:
See TracChangeset
for help on using the changeset viewer.