/** *############################################################################ * A component of the Greenstone Librarian Interface, part of the Greenstone * digital library suite from the New Zealand Digital Library Project at the * University of Waikato, New Zealand. * * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ * * Copyright (C) 2010 Greenstone Digital Library Project * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *############################################################################ */ package org.greenstone.gatherer.metadata; import java.io.File; import java.net.*; import java.nio.charset.*; import java.util.*; import org.greenstone.gatherer.collection.CollectionManager; import org.greenstone.gatherer.DebugStream; import java.util.regex.Matcher; import java.util.regex.Pattern; /** Static access class that contains many of the methods used to work with filename encodings. * Works closely with classes FileNode, CollectionTreeNode, MetadataXMLFile, MetadataXMLFileManager * to maintain a map of URLEncodedFilenames to their filename encodings. * The process of filename encoding further affects the CollectionManager which refreshes its CollectionTree, * FileManager (move, delete, rename actions), MetadataValueTableModel, EnrichPane. */ public class FilenameEncoding { /** Display of filenames in the trees are in URL encoding, if debugging */ public static boolean DEBUGGING = false; /** Set to false by Gatherer if the locale is UTF-8, as Java's handling is * such that non-UTF8 filename encodings on a UTF-8 locale are destructively * converted so that the bytecodes in the filename are not preserved. */ public static boolean MULTIPLE_FILENAME_ENCODINGS_SUPPORTED = false; /** Also set by Gatherer. * If the OS supports multiple filename encodings, we will be working with URL strings * and the applicable separators are always the forward slash ("/") not File.separator. * If multiple filename encodings are not supported, we're dealing with File.separator. */ public static String URL_FILE_SEPARATOR = File.separator; /** gs.filenameEncoding is a special sort of metadata that is not merely to be stored along * with a file, but is to be applied in real-time on the file's name in the CollectionTree * display. Since FileNodes are constantly destroyed and reconstructed by that Tree when * its nodes are expanded and contracted, storing the filename encodings of each file along * with the file in a FileNode doesn't help because it doesn't last. Instead of rediscovering * the encoding at every stage by querying the metadataXML file, we store the encodings for * fast access: in a map of (URLEncodedFilePath, filename-encoding) pairs. * The current design of the map is to only store any active filename metadata assigned * directly at that file/folder's level, and if there is none discovered at that level, then * storing the empty string for it. Therefore, if the hashmap contains no entry for * a file, it means this still needs to be retrieved. */ public static Map map = new HashMap(); /** Compiled pattern for hex entities of characters. These are of the forn "&#x....;" with 1 to 4 digits */ public static final Pattern HEX_PATTERN = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)"); /** The hex entity version of the ampersand character. * We use this in place of the ampersand character in filenames in metadata.xml files to * preserve the reference to the literal ampersand in the real file name on the file system. */ public static final String HEX_ENTITY_AMPERSAND = FilenameEncoding.hexEntityForChar("&"); //"&"; //*********************** BUSY REFRESHING / REQUIRING REFRESH ********************* /** Set to true if filename encoding metadata was changed. Called by the enter keyPress * event in gui.EnrichPane and when the gs.FilenameEncoding field loses focus. */ private static boolean refreshRequired = false; synchronized public static boolean isRefreshRequired() { return refreshRequired; } synchronized public static void setRefreshRequired(boolean state) { if(MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { refreshRequired = state; } else { refreshRequired = false; } } //************************** MAP RETRIEVAL METHODS ****************************** /** Returns the cumulative gs.filenameEncoding metadata * assigned to a file inside the collection. */ public static String findFilenameEncoding( File file, String urlEncodedFilePath, boolean bruteForceLookup) { //if(bruteForceLookup) { // return findFilenameEncodingBruteForce(file, urlEncodedFilePath, bruteForceLookup); //} String encoding = ""; // Check any assigned encoding at this level, starting with the map first // and else retrieving the filename encoding from the metadata file if(!map.containsKey(urlEncodedFilePath)) { // Check for filename encoding metadata *directly* associated with the file // Now don't need to get any inherited encoding metadata here, because of // the way we're storing and retrieving encoding information from the map. ArrayList list = MetadataXMLFileManager.getMetadataAssignedDirectlyToFile(file, true); // true: gets gs.filenameEncoding only if(!list.isEmpty()) { MetadataValue metavalue = (MetadataValue)list.get(0); // get(list.size()-1); encoding = metavalue.getValue(); } // else no filename encoding set yet at this level // Now we've done a lookup at this level cache the result in the map, // including empty strings, to indicate that we've done a full lookup map.put(urlEncodedFilePath, encoding); } else { // an entry exists in the map, get it from there encoding = (String)map.get(urlEncodedFilePath); } // if no meta was specified at at the file level, look for any inherited metadata if(encoding.equals("")) { encoding = getInheritedFilenameEncoding(urlEncodedFilePath, file); } //System.err.println("\n@@@@Looked for: " + urlEncodedFilePath + " | found: " + encoding); return encoding; // found something in map, may still be "", but it's what was stored } /** Checks the file-to-encoding map for all the superfolders of the given * filename in sequence for an applicable encoding. Note that the file/folder * at the level of urlFoldername (and dir) has already been inspected. */ static public String getInheritedFilenameEncoding(String urlFoldername, File dir) { String encoding = ""; boolean done = false; // don't want to search past import folder which is as // far as we need to go to determine inherited encodings File importDir = new File(CollectionManager.getLoadedCollectionImportDirectoryPath()); if(dir.equals(importDir)) { // if the top-level dir was already checked, we're done done = true; } // For directories, first remove trailing file separator in order to start checking from higher level folders int lastIndex = urlFoldername.length()-1; char urlFileSeparatorChar = URL_FILE_SEPARATOR.charAt(0); if(urlFoldername.charAt(lastIndex) == urlFileSeparatorChar) { urlFoldername = urlFoldername.substring(0, lastIndex); } while(!done) { // get the folder that's one level up dir = dir.getParentFile(); int index = urlFoldername.lastIndexOf(URL_FILE_SEPARATOR); if(index == -1) { // no more slashes done = true; } else { urlFoldername = urlFoldername.substring(0, index); } // now look in the map to see whether there's an encoding for this folder String folder = urlFoldername + URL_FILE_SEPARATOR; if(map.containsKey(folder)) { encoding = (String)map.get(folder); // may be "" } else { // no entry in map, so look in the metadata.xml at this folder level ArrayList list = MetadataXMLFileManager.getMetadataAssignedDirectlyToFile( dir, true); // true: gets gs.filenameEncoding only if(!list.isEmpty()) { MetadataValue metavalue = (MetadataValue)list.get(0); // get(list.size()-1); encoding = metavalue.getValue(); } map.put(folder, encoding); // may be "" } if(!encoding.equals("")){ done = true; } // else if "", loop to check next folder up else if(dir.equals(importDir)) { // don't iterate past the import folder, which we've now checked done = true; } } return encoding; } /** Called by GUIManager when a collection is closed. This then empties the * file-to-encoding map which is applicable only on a per-collection basis */ static public void closeCollection() { //printFilenameMap("Closing collection. Clearing file-to-encoding map of entries:"); map.clear(); } // Useful for debugging: prints contents of file-to-encoding map static public void printFilenameMap(String heading) { System.err.println("\n********************************************"); System.err.println(heading.toUpperCase()); Iterator entries = map.entrySet().iterator(); while(entries.hasNext()) { Map.Entry entry = (Map.Entry)entries.next(); System.err.println("+ " + (String)entry.getKey() + ": " + (String)entry.getValue()); } System.err.println("********************************************\n"); } // UNUSED at present. Brute force version of the findFilenameEncoding() method // Doesn't use the map, but gets *all* the metadata assigned to a file/folder to // work out the encoding applicable to a file/folder. public static String findFilenameEncodingBruteForce(File file, String urlEncodedFilename, boolean bruteForceLookup) { System.err.println("\n***** BRUTE FORCE getFilenameEncoding() called\n"); String encoding = ""; // Check for filename encoding metadata *directly* associated with the file // Now don't need to get any inherited encoding metadata here, because of // the way we're storing and retrieving encoding information from the map. ArrayList list = MetadataXMLFileManager.getMetadataAssignedToFile(file, true); // true: gets gs.filenameEncoding only if(!list.isEmpty()) { // try to get the filename encoding meta that was assigned last to this // file, even though it makes no sense to have multiple values for it MetadataValue metavalue = (MetadataValue)list.get(list.size()-1); encoding = metavalue.getValue(); if(encoding == null) { // unlikely ??? System.err.println("**** ERROR: encoding for " + urlEncodedFilename + " is NULL!"); encoding = ""; } } // else no filename encoding set yet, perhaps //System.err.println("**** Found encoding for " + urlEncodedFilename + " " + encoding); return encoding; } //****************************** APPLYING ENCODINGS TO FILENAMES ***************************** /** URL encoded version of the byte codes of the given file's name */ public static String calcURLEncodedFilePath(File file) { if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { return file.getAbsolutePath(); } else { String filename = fileToURLEncoding(file); return filename; } } /** URL encoded version of the byte codes of this file's name */ public static String calcURLEncodedFileName(String urlfilepath) { String filename = urlfilepath; if(filename.endsWith(URL_FILE_SEPARATOR)) { // directory, remove trailing slash filename = filename.substring(0, filename.length() - 1); } // remove the directory prefix (if any) to get the filename int index = filename.lastIndexOf(URL_FILE_SEPARATOR); if(index != -1) { filename = filename.substring(index+1); // skip separator } return filename; } /** Given a string representing an alias to an official encoding (and unofficial ones * starting with "Latin-"), attempts to work out what the canonical encoding for that is. * If the given encoding is unrecognised, it is returned as is. */ public static String canonicalEncodingName(String encoding) { String canonicalEncoding = encoding; try { // Latin-1 -> ISO-8859-1 String alias = canonicalEncoding.toLowerCase(); if(alias.startsWith("latin")){ canonicalEncoding = "ISO-8859" + alias.substring("latin".length()); } // canonical encoding for official aliases canonicalEncoding = Charset.forName(canonicalEncoding).name(); return canonicalEncoding; } catch (Exception e) { System.err.println("(Could not recognise encoding (alias): " + encoding + ".)"); return encoding; // no alias could be found, return the original parameter } } //************************* GETTING THE URL ENCODING OF FILENAMES ********************************* /** * Given a String containing hexentities, will convert back into the unicode version of the String. * e.g. A string like "02 Tēnā Koutou\.mp3" will be returned as "02 Tena Koutou\.mp3" with macrons on e and a * I've tested this in a separate file that imports java.util.regex.Matcher and java.util.regex.Pattern * and contains a copy of Utility.debugUnicodeString(String) with the following main function: public static void main(String args[]) { String str = "02 Tēnā Koutou\\.mp3"; // or more basic case: String str = "mmmmānnnnēpppp\\.txt"; System.err.println("About to decode hex string: " + str); String result = decodeStringContainingHexEntities(str); System.err.println("Decoded hex string: " + result + " - debug unicode form: " + debugUnicodeString(result)); } */ public static String decodeStringContainingHexEntities(String str) { String result = ""; Matcher matcher = HEX_PATTERN.matcher(str); int searchFromIndex = 0; int endMatchIndex = -1; while(matcher.find(searchFromIndex)) { String hexPart = matcher.group(); //System.err.println("Found hexpart match: " + hexPart); int startMatchIndex = matcher.start(); endMatchIndex = matcher.end(); result += str.substring(searchFromIndex, startMatchIndex); String hexNumberStr = hexPart.substring(3, hexPart.length()-1); // lose the "&#x" prefix and the ";" suffix to get just the hex number portion of the match // https://stackoverflow.com/questions/16625865/java-unicode-to-hex-string // https://stackoverflow.com/questions/11194513/convert-hex-string-to-int //System.err.println("hexNumberStr so far: " + hexNumberStr); hexNumberStr = "0x" + hexNumberStr; // e.g "0xDDDD" //int hexNumber = Integer.parseInt(hexNumberStr); int hexNumber = Integer.decode(hexNumberStr); String hexNumberAsChar = Character.toString((char) hexNumber); result += hexNumberAsChar; searchFromIndex = endMatchIndex; } if(endMatchIndex != -1) { // attach any suffix once we finished processing all the hex codes result += str.substring(endMatchIndex); //System.err.println("suffix: " + str.substring(endMatchIndex)); } else { // there were no hex codes to decode, return string as is result = str; } return result; } // Dr Bainbridge's methods /* On Linux machines that are set to using an ISO-8859 (Latin) type encoding, * we can work with URL-encoded filenames in Java. Java works with whatever * encoding the filesystem uses. Unlike systems working with UTF-8, where Java * interprets filenames as UTF-8 (a destructive process since characters invalid * for UTF-8 are replaced with the invalid character, which means the original * character's byte codes can not be regained), working with an ISO-8859-1 * system means the original byte codes of the characters are preserved, * regardless of whether the characters represent ISO-8859-1 or not. Such byte * codes are converted by the following method to the correct URL versions of * the strings that the filenames represent (that is, the correct URL representations * of the filenames in their original encodings). This is useful for interactions with * Perl as Java and Perl can use URL-encoded filenames to talk about the same files * on the file system, instead of having to work out what encoding they are in. */ public static String fileToURLEncoding(File file) { // on a UTF-8 file system, DO NOT do the stuff further below, // just return input filename param, but with any & in the filename replaced with its hex entity if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // protect ampersands in filenames by converting it to its hex entity String filepath = file.getAbsolutePath(); filepath = filepath.replace("&", HEX_ENTITY_AMPERSAND); return filepath; } String filename_url_encoded = ""; // The following test for whether the file exists or not is a problem // when a File object--whose actual file is in the process of being moved // and therefore temporarily does not 'exist' on the actual system--can't // be URL encoded: the following would return "" when a file doesn't exist. // So commenting out the test. /* if(!file.getName().equals("recycle")) { if(!file.isFile() && !file.isDirectory()) { System.err.println("*** ERROR. Java can't see file: " + file.getAbsolutePath()); return ""; } if(!file.exists()) { System.err.println("*** NOTE: File doesn't exist: " + file.getAbsolutePath()); return ""; //file.getName(); } } */ URI filename_uri = file.toURI(); try { // The trick: // 1. toASCIIString() will %xx encode values > 127 // 2. Decode the result to "ISO-8859-1" // 3. URL encode the bytes to string // Step 2 forces the string to be 8-bit values. It // doesn't matter if the starting raw filename was *not* // in the ISO-8859-1 encoding, the effect is to ensure // we have an 8-bit byte string that (numerically) // captures the right value. These numerical values are // then used to determine how to URL encode it String filename_ascii = filename_uri.toASCIIString(); // The URI.toASCIIString() call above only encodes values > 127. // But we also need to protect + and & signs in filenames. Do this by URL encoding. // But need to double URL encode, else it will get decoded too early, in methods called shortly hereafter. filename_ascii = filename_ascii.replace("+", "%252B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased filename_ascii = filename_ascii.replace("&", "%2526"); // &'s ASCII code is 36 in decimal, and 26 in hex // Before proceeding, protect & in the filename too. // &'s ASCII code is 36 in decimal, and 26 in hex, so replace with & (HEX_ENTITY_AMPERSAND) // But dangerous to do simple replace if there are &#x...; entities in the filename already! // That is, we'll want to protect & by replacing with &'s hex value, but we don't want to replace the & in "&#x....;" with the same! //filename_url_encoded = filename_url_encoded.replace("&", "&x26;");// SO THIS IS BAD //filename_url_encoded = filename_url_encoded.replace("&", hexEntityForChar("&"));// SAME, STILL BAD ///filename_ascii = escapeAllCharWithHexEntity(filename_ascii, '&'); // Good: CAREFULLY replaces & that are not part of hex entities String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1"); filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes); // For chars that were protected by being URL encoded, now convert them to the correct version we want them in. // For +: this char is special in regex, so it needs to be converted from URL encoding back to + so it will get properly escaped for regex // For &: this char is special in XML, so since the call to iso_8859_1_filename_to_url_encoded() is over, we can finally convert & to hex entity now. //filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // Don't do this, won't get regex escaped when converted back to a + by caller filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // + signs are special, as they will need to be escaped since the caller wants the filename representing a regex filename_url_encoded = filename_url_encoded.replace("%26", HEX_ENTITY_AMPERSAND); // convert URL encoding for ampersand into hex entity for ampersand } catch (Exception e) { e.printStackTrace(); // Give up trying to convert filename_url_encoded = file.getAbsolutePath(); } return filename_url_encoded; } // For unicode codepoints see: // http://unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT for ISO8859-1 (Latin-1) // where 0xE2 maps to codepoint 0x00E2 and is defined as "Latin small letter a with circumflex" // http://unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT for ISO8859-7 (Greek) // where 0xE2 maps to codepoint 0x03B2 and is defined as "Greek small letter beta" public static String iso_8859_1_filename_to_url_encoded(String raw_bytes_filename) throws Exception { String urlEncoded = ""; try { // By this point we have a UTF-8 encoded string that captures // what the ISO-8859-1 (Latin-1) character is that corresponded to the // 8-bit numeric value for that character in the filename // on the file system // For example: // File system char: = %E2 // Equivalent Latin 1 char: = %E2 // Mapped to UTF-8: = // Our task is to take the string the contains and ensure that // we "see" it as byte [] raw_bytes = raw_bytes_filename.getBytes("ISO-8859-1"); String unicode_filename = new String(raw_bytes,"UTF-8"); for(int i = 0; i < unicode_filename.length(); i++) { char charVal = unicode_filename.charAt(i); if ((int)charVal > 255) { urlEncoded += String.format("&#x%02X;", (int)charVal); } else if((int)charVal > 127) { urlEncoded += String.format("%%%02X", (int)charVal); } else { urlEncoded += String.format("%c", (char)charVal); } } } catch (Exception e) { //e.printStackTrace(); throw(e); } return urlEncoded; } // unused for now public static String raw_filename_to_url_encoded(String fileName) throws Exception { String urlEncoded = ""; try { byte[] bytes = fileName.getBytes(); for(int i = 0; i < bytes.length; i++) { // mask each byte (by applying & 0xFF) to make the signed // byte (in the range -128 to 127) unsigned (in the range // 0 to 255). int byteVal = (int)(bytes[i] & 0xFF); if(byteVal > 127) { urlEncoded += String.format("%%%02X", (int)byteVal); } else { urlEncoded += String.format("%c",(char)byteVal); } } } catch (Exception e) { //e.printStackTrace(); throw(e); } return urlEncoded; } // FURTHER HELPER METHODS /** * Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter. * If filename is relative, then the current directory (gli?) will be prefixed to what is returned * and should be removed manually by the caller. Alternatively, for relative paths, call the variant * relativeFilenameToURLEncoding(String), which will remove any added filepath prefix. */ public static String fullFilepathToURLEncoding(String filename) { // on a UTF-8 file system, DO NOT do the stuff further below, // just return input filename param, but with any & in the filename replaced with its hex entity if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { return filename.replace("&", HEX_ENTITY_AMPERSAND); } File file = new File (filename); String filename_url_encoded = fileToURLEncoding(file); // if the current directory (".") was passed in as filename, // then the filename_url_encoded looks like /full/path/./ // In that case, remove the ./ at the end if (filename_url_encoded.endsWith(FilenameEncoding.URL_FILE_SEPARATOR+"."+FilenameEncoding.URL_FILE_SEPARATOR)) { filename_url_encoded = filename_url_encoded.substring(0, filename_url_encoded.length()-2); // cut off /. at end } return filename_url_encoded; } /** * Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter * If filename is a relative path, call this method to get it specially URL encoded. * This method will remove the current directory that is prefixed as an intermediary step. */ public static String relativeFilenameToURLEncoding(String filename) { if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param return filename.replace("&", HEX_ENTITY_AMPERSAND); } String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding("."); return filenameToURLEncodingWithPrefixRemoved(filename, curr_directory_path); } /** * Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter * Convenience method that will return the specially URL encoded version of filename * with the provided removeFilePathPrefix removed */ public static String filenameToURLEncodingWithPrefixRemoved(String filename, String removeFilePathPrefix) { if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param return filename.replace("&", HEX_ENTITY_AMPERSAND); } File file = new File (filename); String filename_url_encoded = fileToURLEncoding(file); // returns a full filepath // now lop off the given removeFilePathPrefix that FilenameEncoding.filenameToURLEncoding(STRING) variant would have added filename_url_encoded = filename_url_encoded.substring(removeFilePathPrefix.length()); // remove any remaining slash prefix if (filename_url_encoded.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) { filename_url_encoded = filename_url_encoded.substring(FilenameEncoding.URL_FILE_SEPARATOR.length()); } return filename_url_encoded; } // UNUSED now, but useful functions and escapeAllCharWithHexEntity() took effort to write. /** * Attempting to produce the equivalent method fileToURLEncoding(), but taking a String as input parameter * UNUSED - REPLACED by filenameToURLEncoding(String str) which reuses existing fileToURLEncoding(File) method. */ public static String stringToHex(String str) { String hex_str = ""; for(int i = 0; i < str.length(); i++) { int charCode = str.codePointAt(i); // unicode codepoint / ASCII code // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png // If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format) if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13 /*|| charCode == 36 || charCode == 43*/) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing. And spaces and plus signs (ASCII codes 36 and 43) need to be converted to hex too hex_str += str.charAt(i); } else { hex_str += "&#x" + String.format("%x", charCode).toUpperCase() + ";"; // looks like: "&#x[up-to-4-hexdigits-in-UPPERCASE];" } } return hex_str; } /** Takes a String containing a single char and returns the hex entity for it */ public static String hexEntityForChar(String char_as_string) { int charCode = char_as_string.codePointAt(0); // unicode codepoint / ASCII code String hexCodeStr = "&#x" + String.format("%x", charCode).toUpperCase() + ";"; return hexCodeStr; } /** * Given a String containing 0 or more occurrences of CHARACTER, * this method will replace all occurrences of that CHARACTER with its hex entity variant, "&x....;" * Special care is taken where the CHARACTER to be replaced is &, * as in that case, we don't want to replace any existing hex entities already present in the String. */ public static String escapeAllCharWithHexEntity(String str, char CHARACTER) { if(str.indexOf(CHARACTER) == -1) { // nothing to replace, we're done return str; } String char_as_string = Character.toString(CHARACTER); String hexCodeString = hexEntityForChar(char_as_string); Matcher hexPatternMatch = HEX_PATTERN.matcher(str); // looks for a hex entity, which has the pattern "&#x....;" // want to replace all & with &x26; (the hex for ampsersand) IFF the & is not already a hexcode/doesn't already match HEX_PATTERN int searchIndex = 0; boolean finished = false; while(!finished) { searchIndex = str.indexOf(CHARACTER, searchIndex); if(searchIndex == -1) { finished = true; } else { // replacing ampersands, &, is a special case: don't want to replace the & of (hex) entities in the string: if(hexPatternMatch.find(searchIndex) && searchIndex == hexPatternMatch.start()) { searchIndex = hexPatternMatch.end(); } else { String tmp = str.substring(0, searchIndex) + hexCodeString; searchIndex++; if(str.length() > searchIndex) { tmp += str.substring(searchIndex); } str = tmp; searchIndex = searchIndex+ hexCodeString.length() - 1; // String has been modified, so have to update Matcher hexPatternMatch = HEX_PATTERN.matcher(str); if(searchIndex >= str.length()) { finished = true; } } } } return str; } }