import xlrd import csv import os import xlsutil letter_to_folder = { 'a': "A - B", 'b': "A - B", 'c': "C - D", 'd': "C - D", 'e': "E - F", 'f': "E - F", 'g': "G - H", 'h': "G - H", 'i': "I - J", 'j': "I - J", 'k': "K - L", 'l': "K - L", 'm': "M - N", 'n': "M - N", 'o': "O - P", 'p': "O - P", 'q': "Q - R", 'r': "Q - R", 's': "S - T", 't': "S - T", 'u': "U - V", 'v': "U - V", 'w': "W - Z", 'x': "W - Z", 'y': "W - Z", 'z': "W - Z" } def id_to_relative_pdf_file(id): first_char = id[0:1] first_char_lower = first_char.lower() file_tail = letter_to_folder[first_char_lower] file_root = "Reports " + file_tail pdf_file = os.path.join("Digital-Library-PDFs",file_root,id+".pdf") return pdf_file # **** Written, but never used # **** Col numbers refer to older version of spreadsheet # # There is now a mapping of header-name to index position which would be # a better way to do thing def fixup_cell(col_num,cell_str): # col == 1 => Author # col == 2 => Title # col == 4 => TLA # col == 5 => AuthorityNo # col == 6 => SiteNo if col_num == 1: #Author cell_str = re.sub('\|?(et al)|(Associates)', '', cell_str) if col_num == 2: #Title # Detect any ALL-CAPS titles, and change to title-case onlyletters = re.sub('[^A-Za-z]+', '', cell_str) if onlyletters.isupper: cell_str = cell_str.title() if col_num == 4: #TLA if cell_str == "All": cell_str = "" if cell_str == "n/a": cell_str = "" if cell_str == "various": cell_str = "" # not a meaningful TLA return cell_str def excel_to_bound_pdf_csv(excel_filename): workbook = xlrd.open_workbook(excel_filename) worksheet = workbook.sheet_by_index(xlsutil.worksheet_index_pos) ## worksheet = workbook.sheet_by_name(xlsutil.worksheet_name) excel_filename_split = os.path.split(excel_filename) excel_dirname = excel_filename_split[0] excel_file = excel_filename_split[1] excel_file_splitext = os.path.splitext(excel_file) excel_file_root = excel_file_splitext[0]; pdfbound_filename = os.path.join(excel_dirname,"PDF-BOUND "+excel_file_root+".csv") unbound_filename = os.path.join(excel_dirname,"UNBOUND "+excel_file_root+".csv") csv_pdfbound_ofile = open(pdfbound_filename, 'wb') csv_unbound_ofile = open(unbound_filename, 'wb') pdfbound_wr = csv.writer(csv_pdfbound_ofile, quoting=csv.QUOTE_ALL) unbound_wr = csv.writer(csv_unbound_ofile, quoting=csv.QUOTE_ALL) # 1. For header-line, build up hashmap of header-names to column number # 2. Write out header-line as Greenstone friendly Metadata terms # => remove sp spaces, change "Report Identifier" to hnz.Identifier # 3. Process the rest of the file, checking if accompanying # PDF file present or not and only printing out the header-names # specified in 'sanitized_headers' num_rows = worksheet.nrows # 1. Get header-line hashmap of header-names to column numbers header_names_mapping = xlsutil.getHeaderMappingToColNum(worksheet) if header_names_mapping is None: print "Failed to find metadata label 'Report Identifier' in column 0" exit() else: # 2. Process Header into Greenstone friendly metadata terms entry_utf8_row = [] row_i = 0; for header_name in xlsutil.sanitized_headers: if header_name in header_names_mapping: header_col_j = header_names_mapping[header_name] header_cell_value = worksheet.cell_value(row_i,header_col_j) if header_cell_value == "Report Identifier": header_cell_value = "hnz.Identifier" if header_cell_value == "Title": header_cell_value = "dc.Title" if header_cell_value == "RelevantTLA's": header_cell_value = "TLA" header_cell_value_utf8 = unicode(header_cell_value).encode("utf-8") metadata_name_utf8 = header_cell_value_utf8.replace(" ", "") entry_utf8_row.append(metadata_name_utf8) else: print("Warning: Failed to column mapping in spreadsheet for header name \""+header_name+"\" => skipping") unbound_wr.writerow(entry_utf8_row) # Add in 'Filename' as first column in pdfbound CSV file entry_utf8_row.insert(0, "Filename") pdfbound_wr.writerow(entry_utf8_row) # 3. Process the rest of the file (metadata values) ... row_i = row_i + 1 while row_i skipping" if found_pdf: pdfbound_wr.writerow(formatted_utf8_row) else: unbound_wr.writerow(formatted_utf8_row) row_i = row_i + 1 csv_pdfbound_ofile.close() csv_unbound_ofile.close() if __name__ == "__main__": excel_to_bound_pdf_csv(os.sys.argv[1])