Changeset 33191 for main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-csv.py
- Timestamp: 2019-06-22T10:33:10+12:00 (5 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-csv.py
r33188 r33191 4 4 5 5 import xlsutil 6 7 ## worksheet_name = "Archaeological reports"8 6 9 7 letter_to_folder = { … … 48 46 49 47 48 # **** Written, but never used 49 # **** Col numbers refer to older version of spreadsheet 50 # 51 # There is now a mapping of header-name to index position which would be 52 # a better way to do thing 53 50 54 def fixup_cell(col_num,cell_str): 51 55 # col == 1 => Author … … 74 78 75 79 def excel_to_bound_pdf_csv(excel_filename): 76 #workbook = xlrd.open_workbook(excel_filename, formatting_info=True)77 80 workbook = xlrd.open_workbook(excel_filename) 78 ## worksheet = workbook.sheet_by_name(worksheet_name) 79 worksheet = workbook.sheet_by_index(0) 81 82 worksheet = workbook.sheet_by_index(xlsutil.worksheet_index_pos) 83 ## worksheet = workbook.sheet_by_name(xlsutil.worksheet_name) 80 84 81 85 excel_filename_split = os.path.split(excel_filename) … … 88 92 unbound_filename = os.path.join(excel_dirname,"UNBOUND "+excel_file_root+".csv") 89 93 90 # print "Worksheet: " + worksheet_name91 94 csv_pdfbound_ofile = open(pdfbound_filename, 'wb') 92 #csv_unbound_ofile = open('{}.csv'.format(excel_file_root), 'wb')93 95 csv_unbound_ofile = open(unbound_filename, 'wb') 94 96 95 97 pdfbound_wr = csv.writer(csv_pdfbound_ofile, quoting=csv.QUOTE_ALL) 96 98 unbound_wr = csv.writer(csv_unbound_ofile, quoting=csv.QUOTE_ALL) 97 98 # # 1. Skip lines until Header encountered (as defined by encountering "Report Identifier")99 # # 2. Write out Header line as Greenstone friendly Metadata terms100 # # => remove sp spaces, change "Report Identifier" to hnz.Identifier101 # # 3. Process the rest of the file, checking if accompanying102 # # PDF file present or not103 104 99 105 100 # 1. 
For header-line, build up hashmap of header-names to column number … … 110 105 # specified in 'sanitized_headers' 111 106 112 113 107 num_rows = worksheet.nrows 114 108 115 # 1, Skip lines until Header encountered 116 # row_i = 0 117 # found_header = False 118 # while row_i<num_rows: 119 # first_cell = worksheet.cell_value(row_i,0); 120 # if first_cell == "Report Identifier": 121 # found_header = True 122 # break 123 # print "Skipping row {} as not yet encountered 'Report Identifier' metadata label in column 0".format(row_i) 124 # row_i = row_i + 1 125 109 # 1. Get header-line hashmap of header-names to column numbers 126 110 header_names_mapping = xlsutil.getHeaderMappingToColNum(worksheet) 127 111 128 # 2. Process Header into Greenstone friendly metadata terms129 112 if header_names_mapping is None: 130 113 print "Failed to find metadata label 'Report Identifier' in column 0" 131 114 exit() 132 115 else: 116 # 2. Process Header into Greenstone friendly metadata terms 117 133 118 entry_utf8_row = [] 134 119 row_i = 0; … … 164 149 pdfbound_wr.writerow(entry_utf8_row) 165 150 166 # 3. Process the rest of the file ...151 # 3. Process the rest of the file (metadata values) ... 167 152 168 153 row_i = row_i + 1 … … 179 164 col_j = header_names_mapping[header_name] 180 165 181 # cell = worksheet.cell(row_i,col_j)182 183 # cell_type = worksheet.cell_type(row_i,col_j)184 166 cell_value = worksheet.cell_value(row_i,col_j) 185 167 186 # formatted_cell_value = format_excelval(workbook,cell_type,cell_value,False)187 168 formatted_cell_value = xlsutil.format_if_int(cell_value) 188 169 189 # if col_j == 0:190 170 if header_name == "Report Identifier": 191 171 # Check to see if companion PDF file present
Note: See TracChangeset for help on using the changeset viewer.