Skip to content

[Java] Charset encoding handling improvements. #887

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jan 8, 2022
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import uk.co.real_logic.sbe.generation.CodeGenerator;
import org.agrona.generation.OutputManager;
import uk.co.real_logic.sbe.generation.Generators;
import uk.co.real_logic.sbe.generation.java.JavaUtil;
import uk.co.real_logic.sbe.ir.*;
import org.agrona.Verify;

Expand Down Expand Up @@ -244,29 +245,31 @@ private void generateCharacterEncodingRangeCheck(

if (null != characterEncoding)
{
switch (token.encoding().characterEncoding())
if (JavaUtil.isAsciiEncoding(characterEncoding))
{
case "ASCII":
imports.peek().add("fmt");
sb.append(String.format(
"\tfor idx, ch := range %1$s {\n" +
"\t\tif ch > 127 {\n" +
"\t\t\treturn fmt.Errorf(\"%1$s[%%d]=%%d" +
" failed ASCII validation\", idx, ch)\n" +
"\t\t}\n" +
"\t}\n",
varName));
break;

case "UTF-8":
imports.peek().add("errors");
imports.peek().add("unicode/utf8");
sb.append(String.format(
"\tif !utf8.Valid(%1$s[:]) {\n" +
"\t\treturn errors.New(\"%1$s failed UTF-8 validation\")\n" +
"\t}\n",
varName));
break;
imports.peek().add("fmt");
sb.append(String.format(
"\tfor idx, ch := range %1$s {\n" +
"\t\tif ch > 127 {\n" +
"\t\t\treturn fmt.Errorf(\"%1$s[%%d]=%%d" +
" failed ASCII validation\", idx, ch)\n" +
"\t\t}\n" +
"\t}\n",
varName));
}
else if (JavaUtil.isUtf8Encoding(characterEncoding))
{
imports.peek().add("errors");
imports.peek().add("unicode/utf8");
sb.append(String.format(
"\tif !utf8.Valid(%1$s[:]) {\n" +
"\t\treturn errors.New(\"%1$s failed UTF-8 validation\")\n" +
"\t}\n",
varName));
}
else
{
throw new IllegalArgumentException("Unsupported encoding: " + characterEncoding);
}
}
}
Expand Down Expand Up @@ -1836,7 +1839,7 @@ private void generateCompositePropertyElements(
final String containingTypeName,
final List<Token> tokens)
{
for (int i = 0; i < tokens.size();)
for (int i = 0; i < tokens.size(); )
{
final Token token = tokens.get(i);
final String propertyName = formatPropertyName(token.name());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -921,25 +921,16 @@ private void generateDataDecodeMethods(
indent + " }\n\n" +
indent + " final byte[] tmp = new byte[dataLength];\n" +
indent + " buffer.getBytes(limit + headerLength, tmp, 0, dataLength);\n\n" +
indent + " final String value;\n" +
indent + " try\n" +
indent + " {\n" +
indent + " value = new String(tmp, \"%6$s\");\n" +
indent + " }\n" +
indent + " catch (final java.io.UnsupportedEncodingException ex)\n" +
indent + " {\n" +
indent + " throw new RuntimeException(ex);\n" +
indent + " }\n\n" +
indent + " return value;\n" +
indent + " return new String(tmp, %6$s);\n" +
indent + " }\n",
formatPropertyName(propertyName),
generateStringNotPresentCondition(token.version(), indent),
sizeOfLengthField,
PrimitiveType.UINT32 == lengthType ? "(int)" : "",
generateGet(lengthType, "limit", byteOrderStr),
characterEncoding);
charset(characterEncoding));

if (characterEncoding.contains("ASCII"))
if (isAsciiEncoding(characterEncoding))
{
new Formatter(sb).format("\n" +
indent + " public int get%1$s(final Appendable appendable)\n" +
Expand Down Expand Up @@ -1050,7 +1041,7 @@ private void generateCharArrayEncodeMethods(
{
final PrimitiveType lengthPutType = PrimitiveType.UINT32 == lengthType ? PrimitiveType.INT32 : lengthType;

if (characterEncoding.contains("ASCII"))
if (isAsciiEncoding(characterEncoding))
{
new Formatter(sb).format("\n" +
indent + " public %1$s %2$s(final String value)\n" +
Expand Down Expand Up @@ -1099,16 +1090,8 @@ private void generateCharArrayEncodeMethods(
new Formatter(sb).format("\n" +
indent + " public %1$s %2$s(final String value)\n" +
indent + " {\n" +
indent + " final byte[] bytes;\n" +
indent + " try\n" +
indent + " {\n" +
indent + " bytes = null == value || value.isEmpty() ?" +
" org.agrona.collections.ArrayUtil.EMPTY_BYTE_ARRAY : value.getBytes(\"%3$s\");\n" +
indent + " }\n" +
indent + " catch (final java.io.UnsupportedEncodingException ex)\n" +
indent + " {\n" +
indent + " throw new RuntimeException(ex);\n" +
indent + " }\n\n" +
indent + " final byte[] bytes = (null == value || value.isEmpty()) ?" +
" org.agrona.collections.ArrayUtil.EMPTY_BYTE_ARRAY : value.getBytes(%3$s);\n\n" +
indent + " final int length = bytes.length;\n" +
indent + " if (length > %4$d)\n" +
indent + " {\n" +
Expand All @@ -1123,7 +1106,7 @@ private void generateCharArrayEncodeMethods(
indent + " }\n",
className,
formatPropertyName(propertyName),
characterEncoding,
charset(characterEncoding),
maxLengthValue,
sizeOfLengthField,
generatePut(lengthPutType, "limit", "length", byteOrderStr));
Expand Down Expand Up @@ -2042,7 +2025,7 @@ private CharSequence generatePrimitiveArrayPropertyDecode(
fieldLength,
charset(encoding.characterEncoding()));

if (encoding.characterEncoding().contains("ASCII"))
if (isAsciiEncoding(encoding.characterEncoding()))
{
new Formatter(sb).format("\n" +
indent + " public int get%1$s(final Appendable value)\n" +
Expand Down Expand Up @@ -2240,7 +2223,7 @@ private void generateCharArrayEncodeMethods(
fieldLength,
offset);

if (encoding.characterEncoding().contains("ASCII"))
if (isAsciiEncoding(encoding.characterEncoding()))
{
new Formatter(sb).format("\n" +
indent + " public %1$s %2$s(final String src)\n" +
Expand Down Expand Up @@ -2274,15 +2257,10 @@ private void generateCharArrayEncodeMethods(
indent + " throw new IndexOutOfBoundsException(" +
"\"CharSequence too large for copy: byte length=\" + srcLength);\n" +
indent + " }\n\n" +
indent + " for (int i = 0; i < srcLength; ++i)\n" +
indent + " {\n" +
indent + " final char charValue = src.charAt(i);\n" +
indent + " final byte byteValue = charValue > 127 ? (byte)'?' : (byte)charValue;\n" +
indent + " buffer.putByte(offset + %4$d + i, byteValue);\n" +
indent + " }\n\n" +
indent + " for (int i = srcLength; i < length; ++i)\n" +
indent + " buffer.putStringWithoutLengthAscii(offset + %4$d, src);\n\n" +
indent + " for (int start = srcLength; start < length; ++start)\n" +
indent + " {\n" +
indent + " buffer.putByte(offset + %4$d + i, (byte)0);\n" +
indent + " buffer.putByte(offset + %4$d + start, (byte)0);\n" +
indent + " }\n\n" +
indent + " return this;\n" +
indent + " }\n",
Expand All @@ -2297,7 +2275,8 @@ private void generateCharArrayEncodeMethods(
indent + " public %s %s(final String src)\n" +
indent + " {\n" +
indent + " final int length = %d;\n" +
indent + " final byte[] bytes = null == src ? new byte[0] : src.getBytes(%s);\n" +
indent + " final byte[] bytes = (null == src || src.isEmpty()) ?" +
" org.agrona.collections.ArrayUtil.EMPTY_BYTE_ARRAY : src.getBytes(%s);\n" +
indent + " if (bytes.length > length)\n" +
indent + " {\n" +
indent + " throw new IndexOutOfBoundsException(" +
Expand Down Expand Up @@ -2387,7 +2366,7 @@ private static void generateCharacterEncodingMethod(
sb.append("\n")
.append(indent).append(" public static String ").append(propName).append("CharacterEncoding()\n")
.append(indent).append(" {\n")
.append(indent).append(" return \"").append(characterEncoding).append("\";\n")
.append(indent).append(" return ").append(charsetName(characterEncoding)).append(";\n")
.append(indent).append(" }\n");
}
}
Expand Down Expand Up @@ -3537,7 +3516,7 @@ private void appendDecoderDisplay(
}
else
{
if (characterEncoding.contains("ASCII") || characterEncoding.contains("ascii"))
if (isAsciiEncoding(characterEncoding))
{
append(sb, indent, "builder.append('\\'');");
append(sb, indent, formatGetterName(varDataToken.name()) + "(builder);");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,19 @@
import org.agrona.Strings;
import uk.co.real_logic.sbe.PrimitiveType;
import uk.co.real_logic.sbe.SbeTool;
import uk.co.real_logic.sbe.ValidationUtil;
import uk.co.real_logic.sbe.generation.Generators;
import uk.co.real_logic.sbe.ir.Token;
import uk.co.real_logic.sbe.ValidationUtil;

import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.Map;

import static java.lang.reflect.Modifier.STATIC;

/**
* Utilities for mapping between {@link uk.co.real_logic.sbe.ir.Ir} and the Java language.
*/
Expand Down Expand Up @@ -96,19 +95,33 @@ public String toString()
/**
* Indexes known charset aliases to the name of the instance in {@link StandardCharsets}.
*/
private static final Map<String, String> STD_CHARSETS = new HashMap<>();
static final HashMap<String, String> STD_CHARSETS = new HashMap<>();

static
{
try
{
for (final Field field : StandardCharsets.class.getDeclaredFields())
{
if (Charset.class.isAssignableFrom(field.getType()) && ((field.getModifiers() & STATIC) == STATIC))
if (Charset.class.isAssignableFrom(field.getType()) && Modifier.isStatic(field.getModifiers()) &&
Modifier.isPublic(field.getModifiers()))
{
final Charset charset = (Charset)field.get(null);
STD_CHARSETS.put(charset.name(), field.getName());
charset.aliases().forEach((alias) -> STD_CHARSETS.put(alias, field.getName()));
final String name = field.getName();
String oldName = STD_CHARSETS.put(charset.name(), name);
if (null != oldName)
{
throw new IllegalStateException("Duplicate charset alias: old=" + oldName + ", new=" + name);
}
for (final String alias : charset.aliases())
{
oldName = STD_CHARSETS.put(alias, name);
if (null != oldName)
{
throw new IllegalStateException("Duplicate charset alias: old=" + oldName + ", new=" +
alias);
}
}
}
}
}
Expand Down Expand Up @@ -207,10 +220,52 @@ public static String charset(final String encoding)
}
else
{
return "java.nio.charset.Charset.forName(\"" + encoding + "\")";
final String canonicalName = Charset.isSupported(encoding) ? Charset.forName(encoding).name() : encoding;
return "java.nio.charset.Charset.forName(\"" + canonicalName + "\")";
}
}

/**
 * Produce the Java source expression that evaluates to the name of the {@link Charset} for an encoding.
 * <p>
 * When the encoding is indexed in {@code STD_CHARSETS} the expression is a {@code name()} call on the matching
 * {@link java.nio.charset.StandardCharsets} constant. Otherwise it is a quoted string literal holding the
 * canonical charset name if the running JVM supports the encoding, or the encoding text as given if it does not.
 *
 * @param encoding as a string name (eg. UTF-8).
 * @return the code to fetch the associated Charset name.
 */
public static String charsetName(final String encoding)
{
    final String standardField = STD_CHARSETS.get(encoding);
    if (null == standardField)
    {
        // Not one of the standard charsets: emit a plain string literal instead of a constant reference.
        final String literal;
        if (Charset.isSupported(encoding))
        {
            literal = Charset.forName(encoding).name();
        }
        else
        {
            literal = encoding;
        }

        return "\"" + literal + "\"";
    }

    return "java.nio.charset.StandardCharsets." + standardField + ".name()";
}

/**
 * Checks if the given encoding represents an ASCII charset.
 * <p>
 * The encoding is first looked up in {@code STD_CHARSETS}, which is keyed by exact charset names and aliases.
 * If it is not indexed there, the name is resolved through {@link Charset#forName(String)} so that spellings
 * differing only in case (e.g. {@code ascii}, {@code us-ascii}) are recognised as well.
 *
 * @param encoding as a string name (e.g. ASCII).
 * @return {@code true} if the encoding denotes an ASCII charset, {@code false} otherwise (including for
 * {@code null}, illegal or unsupported charset names).
 */
public static boolean isAsciiEncoding(final String encoding)
{
    final String standardField = STD_CHARSETS.get(encoding);
    if (null != standardField)
    {
        return "US_ASCII".equals(standardField);
    }

    if (null == encoding)
    {
        return false;
    }

    try
    {
        // Charset lookup is case-insensitive whereas the map lookup above is an exact key match.
        return StandardCharsets.US_ASCII.equals(Charset.forName(encoding));
    }
    catch (final IllegalArgumentException ignore)
    {
        // IllegalCharsetNameException and UnsupportedCharsetException both extend IllegalArgumentException;
        // a name that cannot be resolved is simply not ASCII.
        return false;
    }
}

/**
 * Checks if the given encoding represents a UTF-8 charset.
 * <p>
 * The encoding is first looked up in {@code STD_CHARSETS}, which is keyed by exact charset names and aliases.
 * If it is not indexed there, the name is resolved through {@link Charset#forName(String)} so that spellings
 * differing only in case (e.g. {@code utf-8}, {@code utf8}) are recognised as well.
 *
 * @param encoding as a string name (e.g. unicode-1-1-utf-8).
 * @return {@code true} if the encoding denotes a UTF-8 charset, {@code false} otherwise (including for
 * {@code null}, illegal or unsupported charset names).
 */
public static boolean isUtf8Encoding(final String encoding)
{
    final String standardField = STD_CHARSETS.get(encoding);
    if (null != standardField)
    {
        return "UTF_8".equals(standardField);
    }

    if (null == encoding)
    {
        return false;
    }

    try
    {
        // Charset lookup is case-insensitive whereas the map lookup above is an exact key match.
        return StandardCharsets.UTF_8.equals(Charset.forName(encoding));
    }
    catch (final IllegalArgumentException ignore)
    {
        // IllegalCharsetNameException and UnsupportedCharsetException both extend IllegalArgumentException;
        // a name that cannot be resolved is simply not UTF-8.
        return false;
    }
}

/**
* Generate a literal value to be used in code generation.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import uk.co.real_logic.sbe.PrimitiveType;
import uk.co.real_logic.sbe.generation.CodeGenerator;
import uk.co.real_logic.sbe.generation.Generators;
import uk.co.real_logic.sbe.generation.java.JavaUtil;
import uk.co.real_logic.sbe.ir.Encoding;
import uk.co.real_logic.sbe.ir.Ir;
import uk.co.real_logic.sbe.ir.Signal;
Expand Down Expand Up @@ -304,20 +305,15 @@ static void generateEncoderVarData(

final String varDataType;
final String toBytesFn;
switch (characterEncoding)
if (JavaUtil.isUtf8Encoding(characterEncoding))
{
case "UTF-8":
{
varDataType = "&str";
toBytesFn = ".as_bytes()";
break;
}
default:
{
varDataType = "&[u8]";
toBytesFn = "";
break;
}
varDataType = "&str";
toBytesFn = ".as_bytes()";
}
else
{
varDataType = "&[u8]";
toBytesFn = "";
}

// function to write slice ... todo - handle character encoding ?
Expand Down Expand Up @@ -681,23 +677,20 @@ private static void generatePrimitiveConstantDecoder(
indent(sb, level, "/// characterEncoding: '%s'\n", characterEncoding);
indent(sb, level, "#[inline]\n");

switch (characterEncoding)
if (JavaUtil.isAsciiEncoding(characterEncoding))
{
case "US-ASCII":
{
indent(sb, level, "pub fn %s(&self) -> &'static [u8] {\n",
formatFunctionName(name));
indent(sb, level + 1, "b\"%s\"\n", rawConstValue);
break;
}
case "UTF-8":
{
indent(sb, level, "pub fn %s(&self) -> &'static str {\n", formatFunctionName(name));
indent(sb, level + 1, "\"%s\"\n", rawConstValue);
break;
}
default:
throw new RuntimeException("Unable to handle " + characterEncoding);
indent(sb, level, "pub fn %s(&self) -> &'static [u8] {\n",
formatFunctionName(name));
indent(sb, level + 1, "b\"%s\"\n", rawConstValue);
}
else if (JavaUtil.isUtf8Encoding(characterEncoding))
{
indent(sb, level, "pub fn %s(&self) -> &'static str {\n", formatFunctionName(name));
indent(sb, level + 1, "\"%s\"\n", rawConstValue);
}
else
{
throw new IllegalArgumentException("Unsupported encoding: " + characterEncoding);
}

indent(sb, level, "}\n\n");
Expand Down
Loading