Changeset 33202 for main/trunk/model-sites-dev
- Timestamp:
- 2019-06-22T17:13:38+12:00 (5 years ago)
- Location:
- main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports
- Files:
-
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/PREPARE-IMPORT.sh
r33192 r33202 7 7 python prepare/xls-to-sanitized-csv.py \ 8 8 "import/Archaeological report PDFs 20190620.xlsx" \ 9 "for-download/Archaeological-Reports-20190620. xlsx"9 "for-download/Archaeological-Reports-20190620.csv" 10 10 11 11 echo "Generating PDF-Bound and Unbound CSV files for use in import.pl" -
main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-csv.py
r33191 r33202 103 103 # 3. Process the rest of the file, checking if accompanying 104 104 # PDF file present or not and only printing out the header-names 105 # specified in 'sanitized_headers' 105 # as long as it has a non-empty header-name 106 ## # specified in 'sanitized_headers' 106 107 107 108 num_rows = worksheet.nrows … … 117 118 118 119 entry_utf8_row = [] 119 row_i = 0; 120 121 for header_name in xlsutil.sanitized_headers: 122 123 if header_name in header_names_mapping: 120 num_header_cols = worksheet.row_len(xlsutil.header_row_pos) 121 122 for col_j in range(num_header_cols): 123 ## for header_name in xlsutil.sanitized_headers: 124 125 header_name = worksheet.cell_value(xlsutil.header_row_pos,col_j) 126 127 if header_name and header_name.strip(): 128 #### print "*** 2. header_name = " + header_name 129 124 130 header_col_j = header_names_mapping[header_name] 125 126 header_cell_value = worksheet.cell_value(row_i,header_col_j) 131 ## if header_name in header_names_mapping: 132 ## header_col_j = header_names_mapping[header_name] 133 134 ## header_cell_value = worksheet.cell_value(row_i,header_col_j) 127 135 128 if header_ cell_value == "Report Identifier":129 header_ cell_value = "hnz.Identifier"130 131 if header_ cell_value == "Title":132 header_ cell_value = "dc.Title"133 134 if header_ cell_value == "RelevantTLA's":135 header_ cell_value = "TLA"136 137 header_ cell_value_utf8 = unicode(header_cell_value).encode("utf-8")138 metadata_name_utf8 = header_ cell_value_utf8.replace(" ", "")136 if header_name == "Report Identifier": 137 header_name = "hnz.Identifier" 138 139 if header_name == "Title": 140 header_name = "dc.Title" 141 142 if header_name == "RelevantTLA's": 143 header_name = "TLA" 144 145 header_name_utf8 = unicode(header_name).encode("utf-8") 146 metadata_name_utf8 = header_name_utf8.replace(" ", "") 139 147 140 148 entry_utf8_row.append(metadata_name_utf8) 141 149 142 else:143 print("Warning: Failed to column mapping in spreadsheet for header name \""+header_name+"\" => skipping")150 ## else: 151 ## print("Warning: Failed to column mapping in spreadsheet for header name \""+header_name+"\" => skipping") 144 152 145 153 unbound_wr.writerow(entry_utf8_row) … … 151 159 # 3. Process the rest of the file (metadata values) ... 152 160 153 row_i = row_i+ 1161 row_i = xlsutil.header_row_pos + 1 154 162 while row_i<num_rows: 155 163 num_cols = worksheet.row_len(row_i) … … 158 166 found_pdf = False 159 167 160 # for col_j in range(num_cols): 161 for header_name in xlsutil.sanitized_headers: 162 163 if header_name in header_names_mapping: 168 for col_j in range(num_cols): 169 ## for header_name in xlsutil.sanitized_headers: 170 header_name = worksheet.cell_value(xlsutil.header_row_pos,col_j) 171 172 if header_name and header_name.strip(): 164 173 col_j = header_names_mapping[header_name] 174 ## if header_name in header_names_mapping: 175 ## col_j = header_names_mapping[header_name] 165 176 166 177 cell_value = worksheet.cell_value(row_i,col_j) … … 201 212 202 213 formatted_utf8_row.append(formatted_cell_value_utf8) 203 else:204 print "Warning: No column number mapping for header name \""+header_name+"\" => skipping"214 ## else: 215 ## print "Warning: No column number mapping for header name \""+header_name+"\" => skipping" 205 216 206 217 if found_pdf: -
main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-sanitized-csv.py
r33192 r33202 36 36 exit(1) 37 37 38 # Move on to the start of the data values39 row_i = 140 41 38 # 2. Write out CSV file ... 39 # Work through all the row to the spreadsheet, including the header-names 40 row_i = xlsutil.header_row_pos 41 42 42 while row_i<num_rows: 43 43 # Work through sanitized header names building up row of utf8 values -
main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xlsutil.py
r33191 r33202 23 23 24 24 25 header_row_pos = 0 26 25 27 def getHeaderMappingToColNum(worksheet): 26 28 … … 30 32 header_names_mapping = {} 31 33 32 row_i = 0 33 num_header_cols = worksheet.row_len(row_i) 34 num_header_cols = worksheet.row_len(header_row_pos) 34 35 35 36 found_header = False 36 37 for col_j in range(num_header_cols): 37 header_cell_value = worksheet.cell_value( row_i,col_j)38 header_cell_value = worksheet.cell_value(header_row_pos,col_j) 38 39 if header_cell_value == "Report Identifier": 39 40 found_header = True 40 41 41 header_names_mapping[header_cell_value] = col_j 42 if header_cell_value and header_cell_value.strip(): 43 header_names_mapping[header_cell_value] = col_j 42 44 43 45 if not found_header:
Note:
See TracChangeset
for help on using the changeset viewer.